mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2024-12-25 00:15:18 +07:00
c2e415fe75
Several source files have been taken from OpenSSL. In some of them a comment that "permission to use under GPL terms is granted" was included below a contradictory license statement. In several cases, there was no indication that the license of the code was compatible with the GPLv2. This change clarifies the licensing for all of these files. I've confirmed with the author (Andy Polyakov) that a) he has licensed the files with the GPLv2 comment under that license and b) that he's also happy to license the other files under GPLv2 too. In one case, the file is already contained in his CRYPTOGAMS bundle, which has a GPLv2 option, and so no special measures are needed. In all cases, the license status of code has been clarified by making the GPLv2 license prominent. The .S files have been regenerated from the updated .pl files. This is a comment-only change. No code is changed. Signed-off-by: Adam Langley <agl@chromium.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
508 lines
13 KiB
ArmAsm
508 lines
13 KiB
ArmAsm
#define __ARM_ARCH__ __LINUX_ARM_ARCH__
|
|
@ SPDX-License-Identifier: GPL-2.0
|
|
|
|
@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
|
|
@ has relicensed it under the GPLv2. Therefore this program is free software;
|
|
@ you can redistribute it and/or modify it under the terms of the GNU General
|
|
@ Public License version 2 as published by the Free Software Foundation.
|
|
@
|
|
@ The original headers, including the original license headers, are
|
|
@ included below for completeness.
|
|
|
|
@ ====================================================================
|
|
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
|
@ project. The module is, however, dual licensed under OpenSSL and
|
|
@ CRYPTOGAMS licenses depending on where you obtain it. For further
|
|
@ details see http://www.openssl.org/~appro/cryptogams/.
|
|
@ ====================================================================
|
|
|
|
@ sha1_block procedure for ARMv4.
|
|
@
|
|
@ January 2007.
|
|
|
|
@ Size/performance trade-off
|
|
@ ====================================================================
|
|
@ impl size in bytes comp cycles[*] measured performance
|
|
@ ====================================================================
|
|
@ thumb 304 3212 4420
|
|
@ armv4-small 392/+29% 1958/+64% 2250/+96%
|
|
@ armv4-compact 740/+89% 1552/+26% 1840/+22%
|
|
@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
|
|
@ full unroll ~5100/+260% ~1260/+4% ~1300/+5%
|
|
@ ====================================================================
|
|
@ thumb = same as 'small' but in Thumb instructions[**] and
|
|
@ with recurring code in two private functions;
|
|
@ small = detached Xload/update, loops are folded;
|
|
@ compact = detached Xload/update, 5x unroll;
|
|
@ large = interleaved Xload/update, 5x unroll;
|
|
@ full unroll = interleaved Xload/update, full unroll, estimated[!];
|
|
@
|
|
@ [*] Manually counted instructions in "grand" loop body. Measured
|
|
@ performance is affected by prologue and epilogue overhead,
|
|
@ i-cache availability, branch penalties, etc.
|
|
@ [**] While each Thumb instruction is twice smaller, they are not as
|
|
@ diverse as ARM ones: e.g., there are only two arithmetic
|
|
@ instructions with 3 arguments, no [fixed] rotate, addressing
|
|
@ modes are limited. As result it takes more instructions to do
|
|
@ the same job in Thumb, therefore the code is never twice as
|
|
@ small and always slower.
|
|
@ [***] which is also ~35% better than compiler generated code. Dual-
|
|
@ issue Cortex A8 core was measured to process input block in
|
|
@ ~990 cycles.
|
|
|
|
@ August 2010.
|
|
@
|
|
@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
|
|
@ Cortex A8 core and in absolute terms ~870 cycles per input block
|
|
@ [or 13.6 cycles per byte].
|
|
|
|
@ February 2011.
|
|
@
|
|
@ Profiler-assisted and platform-specific optimization resulted in 10%
|
|
@ improvement on Cortex A8 core and 12.2 cycles per byte.
|
|
|
|
#include <linux/linkage.h>
|
|
|
|
.text
|
|
|
|
.align 2
|
|
ENTRY(sha1_block_data_order)
|
|
stmdb sp!,{r4-r12,lr}
|
|
add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
|
|
ldmia r0,{r3,r4,r5,r6,r7}
|
|
.Lloop:
|
|
ldr r8,.LK_00_19
|
|
mov r14,sp
|
|
sub sp,sp,#15*4
|
|
mov r5,r5,ror#30
|
|
mov r6,r6,ror#30
|
|
mov r7,r7,ror#30 @ [6]
|
|
.L_00_15:
|
|
#if __ARM_ARCH__<7
|
|
ldrb r10,[r1,#2]
|
|
ldrb r9,[r1,#3]
|
|
ldrb r11,[r1,#1]
|
|
add r7,r8,r7,ror#2 @ E+=K_00_19
|
|
ldrb r12,[r1],#4
|
|
orr r9,r9,r10,lsl#8
|
|
eor r10,r5,r6 @ F_xx_xx
|
|
orr r9,r9,r11,lsl#16
|
|
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
|
|
orr r9,r9,r12,lsl#24
|
|
#else
|
|
ldr r9,[r1],#4 @ handles unaligned
|
|
add r7,r8,r7,ror#2 @ E+=K_00_19
|
|
eor r10,r5,r6 @ F_xx_xx
|
|
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
|
|
#ifdef __ARMEL__
|
|
rev r9,r9 @ byte swap
|
|
#endif
|
|
#endif
|
|
and r10,r4,r10,ror#2
|
|
add r7,r7,r9 @ E+=X[i]
|
|
eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
|
|
str r9,[r14,#-4]!
|
|
add r7,r7,r10 @ E+=F_00_19(B,C,D)
|
|
#if __ARM_ARCH__<7
|
|
ldrb r10,[r1,#2]
|
|
ldrb r9,[r1,#3]
|
|
ldrb r11,[r1,#1]
|
|
add r6,r8,r6,ror#2 @ E+=K_00_19
|
|
ldrb r12,[r1],#4
|
|
orr r9,r9,r10,lsl#8
|
|
eor r10,r4,r5 @ F_xx_xx
|
|
orr r9,r9,r11,lsl#16
|
|
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
|
|
orr r9,r9,r12,lsl#24
|
|
#else
|
|
ldr r9,[r1],#4 @ handles unaligned
|
|
add r6,r8,r6,ror#2 @ E+=K_00_19
|
|
eor r10,r4,r5 @ F_xx_xx
|
|
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
|
|
#ifdef __ARMEL__
|
|
rev r9,r9 @ byte swap
|
|
#endif
|
|
#endif
|
|
and r10,r3,r10,ror#2
|
|
add r6,r6,r9 @ E+=X[i]
|
|
eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
|
|
str r9,[r14,#-4]!
|
|
add r6,r6,r10 @ E+=F_00_19(B,C,D)
|
|
#if __ARM_ARCH__<7
|
|
ldrb r10,[r1,#2]
|
|
ldrb r9,[r1,#3]
|
|
ldrb r11,[r1,#1]
|
|
add r5,r8,r5,ror#2 @ E+=K_00_19
|
|
ldrb r12,[r1],#4
|
|
orr r9,r9,r10,lsl#8
|
|
eor r10,r3,r4 @ F_xx_xx
|
|
orr r9,r9,r11,lsl#16
|
|
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
|
|
orr r9,r9,r12,lsl#24
|
|
#else
|
|
ldr r9,[r1],#4 @ handles unaligned
|
|
add r5,r8,r5,ror#2 @ E+=K_00_19
|
|
eor r10,r3,r4 @ F_xx_xx
|
|
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
|
|
#ifdef __ARMEL__
|
|
rev r9,r9 @ byte swap
|
|
#endif
|
|
#endif
|
|
and r10,r7,r10,ror#2
|
|
add r5,r5,r9 @ E+=X[i]
|
|
eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
|
|
str r9,[r14,#-4]!
|
|
add r5,r5,r10 @ E+=F_00_19(B,C,D)
|
|
#if __ARM_ARCH__<7
|
|
ldrb r10,[r1,#2]
|
|
ldrb r9,[r1,#3]
|
|
ldrb r11,[r1,#1]
|
|
add r4,r8,r4,ror#2 @ E+=K_00_19
|
|
ldrb r12,[r1],#4
|
|
orr r9,r9,r10,lsl#8
|
|
eor r10,r7,r3 @ F_xx_xx
|
|
orr r9,r9,r11,lsl#16
|
|
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
|
|
orr r9,r9,r12,lsl#24
|
|
#else
|
|
ldr r9,[r1],#4 @ handles unaligned
|
|
add r4,r8,r4,ror#2 @ E+=K_00_19
|
|
eor r10,r7,r3 @ F_xx_xx
|
|
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
|
|
#ifdef __ARMEL__
|
|
rev r9,r9 @ byte swap
|
|
#endif
|
|
#endif
|
|
and r10,r6,r10,ror#2
|
|
add r4,r4,r9 @ E+=X[i]
|
|
eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
|
|
str r9,[r14,#-4]!
|
|
add r4,r4,r10 @ E+=F_00_19(B,C,D)
|
|
#if __ARM_ARCH__<7
|
|
ldrb r10,[r1,#2]
|
|
ldrb r9,[r1,#3]
|
|
ldrb r11,[r1,#1]
|
|
add r3,r8,r3,ror#2 @ E+=K_00_19
|
|
ldrb r12,[r1],#4
|
|
orr r9,r9,r10,lsl#8
|
|
eor r10,r6,r7 @ F_xx_xx
|
|
orr r9,r9,r11,lsl#16
|
|
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
|
|
orr r9,r9,r12,lsl#24
|
|
#else
|
|
ldr r9,[r1],#4 @ handles unaligned
|
|
add r3,r8,r3,ror#2 @ E+=K_00_19
|
|
eor r10,r6,r7 @ F_xx_xx
|
|
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
|
|
#ifdef __ARMEL__
|
|
rev r9,r9 @ byte swap
|
|
#endif
|
|
#endif
|
|
and r10,r5,r10,ror#2
|
|
add r3,r3,r9 @ E+=X[i]
|
|
eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
|
|
str r9,[r14,#-4]!
|
|
add r3,r3,r10 @ E+=F_00_19(B,C,D)
|
|
cmp r14,sp
|
|
bne .L_00_15 @ [((11+4)*5+2)*3]
|
|
sub sp,sp,#25*4
|
|
#if __ARM_ARCH__<7
|
|
ldrb r10,[r1,#2]
|
|
ldrb r9,[r1,#3]
|
|
ldrb r11,[r1,#1]
|
|
add r7,r8,r7,ror#2 @ E+=K_00_19
|
|
ldrb r12,[r1],#4
|
|
orr r9,r9,r10,lsl#8
|
|
eor r10,r5,r6 @ F_xx_xx
|
|
orr r9,r9,r11,lsl#16
|
|
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
|
|
orr r9,r9,r12,lsl#24
|
|
#else
|
|
ldr r9,[r1],#4 @ handles unaligned
|
|
add r7,r8,r7,ror#2 @ E+=K_00_19
|
|
eor r10,r5,r6 @ F_xx_xx
|
|
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
|
|
#ifdef __ARMEL__
|
|
rev r9,r9 @ byte swap
|
|
#endif
|
|
#endif
|
|
and r10,r4,r10,ror#2
|
|
add r7,r7,r9 @ E+=X[i]
|
|
eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
|
|
str r9,[r14,#-4]!
|
|
add r7,r7,r10 @ E+=F_00_19(B,C,D)
|
|
ldr r9,[r14,#15*4]
|
|
ldr r10,[r14,#13*4]
|
|
ldr r11,[r14,#7*4]
|
|
add r6,r8,r6,ror#2 @ E+=K_xx_xx
|
|
ldr r12,[r14,#2*4]
|
|
eor r9,r9,r10
|
|
eor r11,r11,r12 @ 1 cycle stall
|
|
eor r10,r4,r5 @ F_xx_xx
|
|
mov r9,r9,ror#31
|
|
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
|
|
eor r9,r9,r11,ror#31
|
|
str r9,[r14,#-4]!
|
|
and r10,r3,r10,ror#2 @ F_xx_xx
|
|
@ F_xx_xx
|
|
add r6,r6,r9 @ E+=X[i]
|
|
eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
|
|
add r6,r6,r10 @ E+=F_00_19(B,C,D)
|
|
ldr r9,[r14,#15*4]
|
|
ldr r10,[r14,#13*4]
|
|
ldr r11,[r14,#7*4]
|
|
add r5,r8,r5,ror#2 @ E+=K_xx_xx
|
|
ldr r12,[r14,#2*4]
|
|
eor r9,r9,r10
|
|
eor r11,r11,r12 @ 1 cycle stall
|
|
eor r10,r3,r4 @ F_xx_xx
|
|
mov r9,r9,ror#31
|
|
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
|
|
eor r9,r9,r11,ror#31
|
|
str r9,[r14,#-4]!
|
|
and r10,r7,r10,ror#2 @ F_xx_xx
|
|
@ F_xx_xx
|
|
add r5,r5,r9 @ E+=X[i]
|
|
eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
|
|
add r5,r5,r10 @ E+=F_00_19(B,C,D)
|
|
ldr r9,[r14,#15*4]
|
|
ldr r10,[r14,#13*4]
|
|
ldr r11,[r14,#7*4]
|
|
add r4,r8,r4,ror#2 @ E+=K_xx_xx
|
|
ldr r12,[r14,#2*4]
|
|
eor r9,r9,r10
|
|
eor r11,r11,r12 @ 1 cycle stall
|
|
eor r10,r7,r3 @ F_xx_xx
|
|
mov r9,r9,ror#31
|
|
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
|
|
eor r9,r9,r11,ror#31
|
|
str r9,[r14,#-4]!
|
|
and r10,r6,r10,ror#2 @ F_xx_xx
|
|
@ F_xx_xx
|
|
add r4,r4,r9 @ E+=X[i]
|
|
eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
|
|
add r4,r4,r10 @ E+=F_00_19(B,C,D)
|
|
ldr r9,[r14,#15*4]
|
|
ldr r10,[r14,#13*4]
|
|
ldr r11,[r14,#7*4]
|
|
add r3,r8,r3,ror#2 @ E+=K_xx_xx
|
|
ldr r12,[r14,#2*4]
|
|
eor r9,r9,r10
|
|
eor r11,r11,r12 @ 1 cycle stall
|
|
eor r10,r6,r7 @ F_xx_xx
|
|
mov r9,r9,ror#31
|
|
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
|
|
eor r9,r9,r11,ror#31
|
|
str r9,[r14,#-4]!
|
|
and r10,r5,r10,ror#2 @ F_xx_xx
|
|
@ F_xx_xx
|
|
add r3,r3,r9 @ E+=X[i]
|
|
eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
|
|
add r3,r3,r10 @ E+=F_00_19(B,C,D)
|
|
|
|
ldr r8,.LK_20_39 @ [+15+16*4]
|
|
cmn sp,#0 @ [+3], clear carry to denote 20_39
|
|
.L_20_39_or_60_79:
|
|
ldr r9,[r14,#15*4]
|
|
ldr r10,[r14,#13*4]
|
|
ldr r11,[r14,#7*4]
|
|
add r7,r8,r7,ror#2 @ E+=K_xx_xx
|
|
ldr r12,[r14,#2*4]
|
|
eor r9,r9,r10
|
|
eor r11,r11,r12 @ 1 cycle stall
|
|
eor r10,r5,r6 @ F_xx_xx
|
|
mov r9,r9,ror#31
|
|
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
|
|
eor r9,r9,r11,ror#31
|
|
str r9,[r14,#-4]!
|
|
eor r10,r4,r10,ror#2 @ F_xx_xx
|
|
@ F_xx_xx
|
|
add r7,r7,r9 @ E+=X[i]
|
|
add r7,r7,r10 @ E+=F_20_39(B,C,D)
|
|
ldr r9,[r14,#15*4]
|
|
ldr r10,[r14,#13*4]
|
|
ldr r11,[r14,#7*4]
|
|
add r6,r8,r6,ror#2 @ E+=K_xx_xx
|
|
ldr r12,[r14,#2*4]
|
|
eor r9,r9,r10
|
|
eor r11,r11,r12 @ 1 cycle stall
|
|
eor r10,r4,r5 @ F_xx_xx
|
|
mov r9,r9,ror#31
|
|
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
|
|
eor r9,r9,r11,ror#31
|
|
str r9,[r14,#-4]!
|
|
eor r10,r3,r10,ror#2 @ F_xx_xx
|
|
@ F_xx_xx
|
|
add r6,r6,r9 @ E+=X[i]
|
|
add r6,r6,r10 @ E+=F_20_39(B,C,D)
|
|
ldr r9,[r14,#15*4]
|
|
ldr r10,[r14,#13*4]
|
|
ldr r11,[r14,#7*4]
|
|
add r5,r8,r5,ror#2 @ E+=K_xx_xx
|
|
ldr r12,[r14,#2*4]
|
|
eor r9,r9,r10
|
|
eor r11,r11,r12 @ 1 cycle stall
|
|
eor r10,r3,r4 @ F_xx_xx
|
|
mov r9,r9,ror#31
|
|
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
|
|
eor r9,r9,r11,ror#31
|
|
str r9,[r14,#-4]!
|
|
eor r10,r7,r10,ror#2 @ F_xx_xx
|
|
@ F_xx_xx
|
|
add r5,r5,r9 @ E+=X[i]
|
|
add r5,r5,r10 @ E+=F_20_39(B,C,D)
|
|
ldr r9,[r14,#15*4]
|
|
ldr r10,[r14,#13*4]
|
|
ldr r11,[r14,#7*4]
|
|
add r4,r8,r4,ror#2 @ E+=K_xx_xx
|
|
ldr r12,[r14,#2*4]
|
|
eor r9,r9,r10
|
|
eor r11,r11,r12 @ 1 cycle stall
|
|
eor r10,r7,r3 @ F_xx_xx
|
|
mov r9,r9,ror#31
|
|
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
|
|
eor r9,r9,r11,ror#31
|
|
str r9,[r14,#-4]!
|
|
eor r10,r6,r10,ror#2 @ F_xx_xx
|
|
@ F_xx_xx
|
|
add r4,r4,r9 @ E+=X[i]
|
|
add r4,r4,r10 @ E+=F_20_39(B,C,D)
|
|
ldr r9,[r14,#15*4]
|
|
ldr r10,[r14,#13*4]
|
|
ldr r11,[r14,#7*4]
|
|
add r3,r8,r3,ror#2 @ E+=K_xx_xx
|
|
ldr r12,[r14,#2*4]
|
|
eor r9,r9,r10
|
|
eor r11,r11,r12 @ 1 cycle stall
|
|
eor r10,r6,r7 @ F_xx_xx
|
|
mov r9,r9,ror#31
|
|
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
|
|
eor r9,r9,r11,ror#31
|
|
str r9,[r14,#-4]!
|
|
eor r10,r5,r10,ror#2 @ F_xx_xx
|
|
@ F_xx_xx
|
|
add r3,r3,r9 @ E+=X[i]
|
|
add r3,r3,r10 @ E+=F_20_39(B,C,D)
|
|
ARM( teq r14,sp ) @ preserve carry
|
|
THUMB( mov r11,sp )
|
|
THUMB( teq r14,r11 ) @ preserve carry
|
|
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
|
|
bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
|
|
|
|
ldr r8,.LK_40_59
|
|
sub sp,sp,#20*4 @ [+2]
|
|
.L_40_59:
|
|
ldr r9,[r14,#15*4]
|
|
ldr r10,[r14,#13*4]
|
|
ldr r11,[r14,#7*4]
|
|
add r7,r8,r7,ror#2 @ E+=K_xx_xx
|
|
ldr r12,[r14,#2*4]
|
|
eor r9,r9,r10
|
|
eor r11,r11,r12 @ 1 cycle stall
|
|
eor r10,r5,r6 @ F_xx_xx
|
|
mov r9,r9,ror#31
|
|
add r7,r7,r3,ror#27 @ E+=ROR(A,27)
|
|
eor r9,r9,r11,ror#31
|
|
str r9,[r14,#-4]!
|
|
and r10,r4,r10,ror#2 @ F_xx_xx
|
|
and r11,r5,r6 @ F_xx_xx
|
|
add r7,r7,r9 @ E+=X[i]
|
|
add r7,r7,r10 @ E+=F_40_59(B,C,D)
|
|
add r7,r7,r11,ror#2
|
|
ldr r9,[r14,#15*4]
|
|
ldr r10,[r14,#13*4]
|
|
ldr r11,[r14,#7*4]
|
|
add r6,r8,r6,ror#2 @ E+=K_xx_xx
|
|
ldr r12,[r14,#2*4]
|
|
eor r9,r9,r10
|
|
eor r11,r11,r12 @ 1 cycle stall
|
|
eor r10,r4,r5 @ F_xx_xx
|
|
mov r9,r9,ror#31
|
|
add r6,r6,r7,ror#27 @ E+=ROR(A,27)
|
|
eor r9,r9,r11,ror#31
|
|
str r9,[r14,#-4]!
|
|
and r10,r3,r10,ror#2 @ F_xx_xx
|
|
and r11,r4,r5 @ F_xx_xx
|
|
add r6,r6,r9 @ E+=X[i]
|
|
add r6,r6,r10 @ E+=F_40_59(B,C,D)
|
|
add r6,r6,r11,ror#2
|
|
ldr r9,[r14,#15*4]
|
|
ldr r10,[r14,#13*4]
|
|
ldr r11,[r14,#7*4]
|
|
add r5,r8,r5,ror#2 @ E+=K_xx_xx
|
|
ldr r12,[r14,#2*4]
|
|
eor r9,r9,r10
|
|
eor r11,r11,r12 @ 1 cycle stall
|
|
eor r10,r3,r4 @ F_xx_xx
|
|
mov r9,r9,ror#31
|
|
add r5,r5,r6,ror#27 @ E+=ROR(A,27)
|
|
eor r9,r9,r11,ror#31
|
|
str r9,[r14,#-4]!
|
|
and r10,r7,r10,ror#2 @ F_xx_xx
|
|
and r11,r3,r4 @ F_xx_xx
|
|
add r5,r5,r9 @ E+=X[i]
|
|
add r5,r5,r10 @ E+=F_40_59(B,C,D)
|
|
add r5,r5,r11,ror#2
|
|
ldr r9,[r14,#15*4]
|
|
ldr r10,[r14,#13*4]
|
|
ldr r11,[r14,#7*4]
|
|
add r4,r8,r4,ror#2 @ E+=K_xx_xx
|
|
ldr r12,[r14,#2*4]
|
|
eor r9,r9,r10
|
|
eor r11,r11,r12 @ 1 cycle stall
|
|
eor r10,r7,r3 @ F_xx_xx
|
|
mov r9,r9,ror#31
|
|
add r4,r4,r5,ror#27 @ E+=ROR(A,27)
|
|
eor r9,r9,r11,ror#31
|
|
str r9,[r14,#-4]!
|
|
and r10,r6,r10,ror#2 @ F_xx_xx
|
|
and r11,r7,r3 @ F_xx_xx
|
|
add r4,r4,r9 @ E+=X[i]
|
|
add r4,r4,r10 @ E+=F_40_59(B,C,D)
|
|
add r4,r4,r11,ror#2
|
|
ldr r9,[r14,#15*4]
|
|
ldr r10,[r14,#13*4]
|
|
ldr r11,[r14,#7*4]
|
|
add r3,r8,r3,ror#2 @ E+=K_xx_xx
|
|
ldr r12,[r14,#2*4]
|
|
eor r9,r9,r10
|
|
eor r11,r11,r12 @ 1 cycle stall
|
|
eor r10,r6,r7 @ F_xx_xx
|
|
mov r9,r9,ror#31
|
|
add r3,r3,r4,ror#27 @ E+=ROR(A,27)
|
|
eor r9,r9,r11,ror#31
|
|
str r9,[r14,#-4]!
|
|
and r10,r5,r10,ror#2 @ F_xx_xx
|
|
and r11,r6,r7 @ F_xx_xx
|
|
add r3,r3,r9 @ E+=X[i]
|
|
add r3,r3,r10 @ E+=F_40_59(B,C,D)
|
|
add r3,r3,r11,ror#2
|
|
cmp r14,sp
|
|
bne .L_40_59 @ [+((12+5)*5+2)*4]
|
|
|
|
ldr r8,.LK_60_79
|
|
sub sp,sp,#20*4
|
|
cmp sp,#0 @ set carry to denote 60_79
|
|
b .L_20_39_or_60_79 @ [+4], spare 300 bytes
|
|
.L_done:
|
|
add sp,sp,#80*4 @ "deallocate" stack frame
|
|
ldmia r0,{r8,r9,r10,r11,r12}
|
|
add r3,r8,r3
|
|
add r4,r9,r4
|
|
add r5,r10,r5,ror#2
|
|
add r6,r11,r6,ror#2
|
|
add r7,r12,r7,ror#2
|
|
stmia r0,{r3,r4,r5,r6,r7}
|
|
teq r1,r2
|
|
bne .Lloop @ [+18], total 1307
|
|
|
|
ldmia sp!,{r4-r12,pc}
|
|
.align 2
|
|
.LK_00_19: .word 0x5a827999
|
|
.LK_20_39: .word 0x6ed9eba1
|
|
.LK_40_59: .word 0x8f1bbcdc
|
|
.LK_60_79: .word 0xca62c1d6
|
|
ENDPROC(sha1_block_data_order)
|
|
.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
|
|
.align 2
|