linux_dsm_epyc7002/arch/x86/crypto/salsa20-i586-asm_32.S
Eric Biggers c9a3ff8f22 crypto: x86/salsa20 - cleanup and convert to skcipher API
Convert salsa20-asm from the deprecated "blkcipher" API to the
"skcipher" API, in the process fixing it up to use the generic helpers.
This allows removing the salsa20_keysetup() and salsa20_ivsetup()
assembly functions, which aren't performance critical; the C versions do
just fine.

This also fixes the same bug that salsa20-generic had, where the state
array was being maintained directly in the transform context rather than
on the stack or in the request context.  Thus, if multiple threads used
the same Salsa20 transform concurrently they produced the wrong results.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2018-01-12 23:03:43 +11:00

939 lines
16 KiB
x86 Assembly (GNU as, AT&T syntax)

# Derived from:
# salsa20_pm.s version 20051229
# D. J. Bernstein
# Public domain.
#include <linux/linkage.h>
.text
# enter salsa20_encrypt_bytes
#
# salsa20_encrypt_bytes(x, m, out, bytes)
#
# XORs the input at m with keystream derived from the 16-word state at x,
# writes the result to out, and advances the 64-bit block counter held in
# state words 8 and 9 by one per 64-byte block (a partial block counts too).
#
# Arguments (32-bit each, read from the caller stack just above the return
# address):
#   arg1  x      pointer to 16 uint32 state words; all 16 are read, words 8
#                and 9 are written back before returning
#   arg2  m      input pointer
#   arg3  out    output pointer
#   arg4  bytes  byte count; when zero the routine returns immediately and
#                reads or writes nothing through x, m or out
#
# Register effects: ebx, esi, edi and ebp are saved and restored. ecx and edx
# are overwritten. eax is left holding the frame size computed at entry, so
# it carries no meaningful return value. Flags are overwritten.
#
# Stack frame offsets from the adjusted esp:
#   tmp           0..63    bounce buffer used for a final partial block
#   x_backup      64
#   m_backup      68
#   out_backup    72
#   bytes_backup  76
#   eax_stack     80       frame size, added back to esp on exit
#   saved regs    84..96   ebx, esi, edi, ebp
#   x0..x15       100..160 working state for the rounds
#   j0..j15       164..224 copy of the input state; j8 and j9 are advanced
#                          after every block
#   ctarget       228      real destination while out points at tmp
ENTRY(salsa20_encrypt_bytes)
# Allocate the frame. eax = (esp & 31) + 256, so after the subtraction esp is
# 32-byte aligned, at least 256 bytes are reserved, and eax is exactly the
# amount that must be added back to undo the allocation.
mov %esp,%eax
and $31,%eax
add $256,%eax
sub %eax,%esp
# eax_stack = eax
movl %eax,80(%esp)
# ebx_stack = ebx
movl %ebx,84(%esp)
# esi_stack = esi
movl %esi,88(%esp)
# edi_stack = edi
movl %edi,92(%esp)
# ebp_stack = ebp
movl %ebp,96(%esp)
# esp + eax is the stack pointer value at entry (where the return address
# sits), so the stack arguments start 4 bytes above it.
# x = arg1
movl 4(%esp,%eax),%edx
# m = arg2
movl 8(%esp,%eax),%esi
# out = arg3
movl 12(%esp,%eax),%edi
# bytes = arg4
movl 16(%esp,%eax),%ebx
# bytes -= 0
# (value unchanged; this only sets flags: ZF when bytes == 0, CF clear)
sub $0,%ebx
# goto done if unsigned<=
# (with CF clear this branch is taken only when bytes == 0)
jbe ._done
._start:
# Copy the 16 input state words from x into j0..j15 in the frame. Loads and
# stores are interleaved across eax, ecx and ebp.
# in0 = *(uint32 *) (x + 0)
movl 0(%edx),%eax
# in1 = *(uint32 *) (x + 4)
movl 4(%edx),%ecx
# in2 = *(uint32 *) (x + 8)
movl 8(%edx),%ebp
# j0 = in0
movl %eax,164(%esp)
# in3 = *(uint32 *) (x + 12)
movl 12(%edx),%eax
# j1 = in1
movl %ecx,168(%esp)
# in4 = *(uint32 *) (x + 16)
movl 16(%edx),%ecx
# j2 = in2
movl %ebp,172(%esp)
# in5 = *(uint32 *) (x + 20)
movl 20(%edx),%ebp
# j3 = in3
movl %eax,176(%esp)
# in6 = *(uint32 *) (x + 24)
movl 24(%edx),%eax
# j4 = in4
movl %ecx,180(%esp)
# in7 = *(uint32 *) (x + 28)
movl 28(%edx),%ecx
# j5 = in5
movl %ebp,184(%esp)
# in8 = *(uint32 *) (x + 32)
movl 32(%edx),%ebp
# j6 = in6
movl %eax,188(%esp)
# in9 = *(uint32 *) (x + 36)
movl 36(%edx),%eax
# j7 = in7
movl %ecx,192(%esp)
# in10 = *(uint32 *) (x + 40)
movl 40(%edx),%ecx
# j8 = in8
movl %ebp,196(%esp)
# in11 = *(uint32 *) (x + 44)
movl 44(%edx),%ebp
# j9 = in9
movl %eax,200(%esp)
# in12 = *(uint32 *) (x + 48)
movl 48(%edx),%eax
# j10 = in10
movl %ecx,204(%esp)
# in13 = *(uint32 *) (x + 52)
movl 52(%edx),%ecx
# j11 = in11
movl %ebp,208(%esp)
# in14 = *(uint32 *) (x + 56)
movl 56(%edx),%ebp
# j12 = in12
movl %eax,212(%esp)
# in15 = *(uint32 *) (x + 60)
movl 60(%edx),%eax
# j13 = in13
movl %ecx,216(%esp)
# j14 = in14
movl %ebp,220(%esp)
# j15 = in15
movl %eax,224(%esp)
# x_backup = x
# (edx is reused as scratch below; x is needed again to store the counter)
movl %edx,64(%esp)
# Top of the per-block loop. On every arrival here ebx = bytes remaining
# (nonzero), esi = m and edi = out.
._bytesatleast1:
# bytes - 64
cmp $64,%ebx
# goto nocopy if unsigned>=
jae ._nocopy
# Fewer than 64 bytes remain. Remember the real destination, copy the input
# into tmp and point both m and out at tmp, so that the full-block code below
# reads and writes 64 bytes inside the frame instead of the caller buffers.
# ctarget = out
movl %edi,228(%esp)
# out = &tmp
leal 0(%esp),%edi
# i = bytes
mov %ebx,%ecx
# while (i) { *out++ = *m++; --i }
rep movsb
# out = &tmp
leal 0(%esp),%edi
# m = &tmp
leal 0(%esp),%esi
._nocopy:
# esi, edi and ebx are all reused as scratch by the rounds, so park the
# pointers and the byte count in the frame first.
# out_backup = out
movl %edi,72(%esp)
# m_backup = m
movl %esi,68(%esp)
# bytes_backup = bytes
movl %ebx,76(%esp)
# Initialise the working state x0..x15 from j0..j15, four words at a time.
# in0 = j0
movl 164(%esp),%eax
# in1 = j1
movl 168(%esp),%ecx
# in2 = j2
movl 172(%esp),%edx
# in3 = j3
movl 176(%esp),%ebx
# x0 = in0
movl %eax,100(%esp)
# x1 = in1
movl %ecx,104(%esp)
# x2 = in2
movl %edx,108(%esp)
# x3 = in3
movl %ebx,112(%esp)
# in4 = j4
movl 180(%esp),%eax
# in5 = j5
movl 184(%esp),%ecx
# in6 = j6
movl 188(%esp),%edx
# in7 = j7
movl 192(%esp),%ebx
# x4 = in4
movl %eax,116(%esp)
# x5 = in5
movl %ecx,120(%esp)
# x6 = in6
movl %edx,124(%esp)
# x7 = in7
movl %ebx,128(%esp)
# in8 = j8
movl 196(%esp),%eax
# in9 = j9
movl 200(%esp),%ecx
# in10 = j10
movl 204(%esp),%edx
# in11 = j11
movl 208(%esp),%ebx
# x8 = in8
movl %eax,132(%esp)
# x9 = in9
movl %ecx,136(%esp)
# x10 = in10
movl %edx,140(%esp)
# x11 = in11
movl %ebx,144(%esp)
# in12 = j12
movl 212(%esp),%eax
# in13 = j13
movl 216(%esp),%ecx
# in14 = j14
movl 220(%esp),%edx
# in15 = j15
movl 224(%esp),%ebx
# x12 = in12
movl %eax,148(%esp)
# x13 = in13
movl %ecx,152(%esp)
# x14 = in14
movl %edx,156(%esp)
# x15 = in15
movl %ebx,160(%esp)
# Round loop. Register roles inside the loop:
#   ebp = i (round counter, starts at 20 and drops by 4 per pass)
#   eax = p, ecx = s, edx = t, ebx = w, esi = r, edi = v
# Each pass contains four rounds, written out as two identical copies of a
# two-round sequence, so five passes give 20 rounds. Four dependency chains
# (p), (t), (r,s) and (v,w) are interleaved instruction by instruction; each
# step has the shape  dst ^= rotl(a + b, k)  with k = 7, 9, 13, 18.
# i = 20
mov $20,%ebp
# p = x0
movl 100(%esp),%eax
# s = x5
movl 120(%esp),%ecx
# t = x10
movl 140(%esp),%edx
# w = x15
movl 160(%esp),%ebx
._mainloop:
# Rounds 1 and 2 of this pass.
# First round: viewing x0..x15 as a row-major 4x4 matrix, the chains work on
# the columns (x0,x4,x8,x12), (x5,x9,x13,x1), (x10,x14,x2,x6), (x15,x3,x7,x11).
# x0 = p
movl %eax,100(%esp)
# x10 = t
movl %edx,140(%esp)
# p += x12
addl 148(%esp),%eax
# x5 = s
movl %ecx,120(%esp)
# t += x6
addl 124(%esp),%edx
# x15 = w
movl %ebx,160(%esp)
# r = x1
movl 104(%esp),%esi
# r += s
add %ecx,%esi
# v = x11
movl 144(%esp),%edi
# v += w
add %ebx,%edi
# p <<<= 7
rol $7,%eax
# p ^= x4
xorl 116(%esp),%eax
# t <<<= 7
rol $7,%edx
# t ^= x14
xorl 156(%esp),%edx
# r <<<= 7
rol $7,%esi
# r ^= x9
xorl 136(%esp),%esi
# v <<<= 7
rol $7,%edi
# v ^= x3
xorl 112(%esp),%edi
# x4 = p
movl %eax,116(%esp)
# x14 = t
movl %edx,156(%esp)
# p += x0
addl 100(%esp),%eax
# x9 = r
movl %esi,136(%esp)
# t += x10
addl 140(%esp),%edx
# x3 = v
movl %edi,112(%esp)
# p <<<= 9
rol $9,%eax
# p ^= x8
xorl 132(%esp),%eax
# t <<<= 9
rol $9,%edx
# t ^= x2
xorl 108(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 9
rol $9,%ecx
# s ^= x13
xorl 152(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 9
rol $9,%ebx
# w ^= x7
xorl 128(%esp),%ebx
# x8 = p
movl %eax,132(%esp)
# x2 = t
movl %edx,108(%esp)
# p += x4
addl 116(%esp),%eax
# x13 = s
movl %ecx,152(%esp)
# t += x14
addl 156(%esp),%edx
# x7 = w
movl %ebx,128(%esp)
# p <<<= 13
rol $13,%eax
# p ^= x12
xorl 148(%esp),%eax
# t <<<= 13
rol $13,%edx
# t ^= x6
xorl 124(%esp),%edx
# r += s
add %ecx,%esi
# r <<<= 13
rol $13,%esi
# r ^= x1
xorl 104(%esp),%esi
# v += w
add %ebx,%edi
# v <<<= 13
rol $13,%edi
# v ^= x11
xorl 144(%esp),%edi
# x12 = p
movl %eax,148(%esp)
# x6 = t
movl %edx,124(%esp)
# p += x8
addl 132(%esp),%eax
# x1 = r
movl %esi,104(%esp)
# t += x2
addl 108(%esp),%edx
# x11 = v
movl %edi,144(%esp)
# p <<<= 18
rol $18,%eax
# p ^= x0
xorl 100(%esp),%eax
# t <<<= 18
rol $18,%edx
# t ^= x10
xorl 140(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 18
rol $18,%ecx
# s ^= x5
xorl 120(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 18
rol $18,%ebx
# w ^= x15
xorl 160(%esp),%ebx
# Second round: the chains now work on the rows (x0,x1,x2,x3),
# (x5,x6,x7,x4), (x10,x11,x8,x9), (x15,x12,x13,x14).
# x0 = p
movl %eax,100(%esp)
# x10 = t
movl %edx,140(%esp)
# p += x3
addl 112(%esp),%eax
# p <<<= 7
rol $7,%eax
# x5 = s
movl %ecx,120(%esp)
# t += x9
addl 136(%esp),%edx
# x15 = w
movl %ebx,160(%esp)
# r = x4
movl 116(%esp),%esi
# r += s
add %ecx,%esi
# v = x14
movl 156(%esp),%edi
# v += w
add %ebx,%edi
# p ^= x1
xorl 104(%esp),%eax
# t <<<= 7
rol $7,%edx
# t ^= x11
xorl 144(%esp),%edx
# r <<<= 7
rol $7,%esi
# r ^= x6
xorl 124(%esp),%esi
# v <<<= 7
rol $7,%edi
# v ^= x12
xorl 148(%esp),%edi
# x1 = p
movl %eax,104(%esp)
# x11 = t
movl %edx,144(%esp)
# p += x0
addl 100(%esp),%eax
# x6 = r
movl %esi,124(%esp)
# t += x10
addl 140(%esp),%edx
# x12 = v
movl %edi,148(%esp)
# p <<<= 9
rol $9,%eax
# p ^= x2
xorl 108(%esp),%eax
# t <<<= 9
rol $9,%edx
# t ^= x8
xorl 132(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 9
rol $9,%ecx
# s ^= x7
xorl 128(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 9
rol $9,%ebx
# w ^= x13
xorl 152(%esp),%ebx
# x2 = p
movl %eax,108(%esp)
# x8 = t
movl %edx,132(%esp)
# p += x1
addl 104(%esp),%eax
# x7 = s
movl %ecx,128(%esp)
# t += x11
addl 144(%esp),%edx
# x13 = w
movl %ebx,152(%esp)
# p <<<= 13
rol $13,%eax
# p ^= x3
xorl 112(%esp),%eax
# t <<<= 13
rol $13,%edx
# t ^= x9
xorl 136(%esp),%edx
# r += s
add %ecx,%esi
# r <<<= 13
rol $13,%esi
# r ^= x4
xorl 116(%esp),%esi
# v += w
add %ebx,%edi
# v <<<= 13
rol $13,%edi
# v ^= x14
xorl 156(%esp),%edi
# x3 = p
movl %eax,112(%esp)
# x9 = t
movl %edx,136(%esp)
# p += x2
addl 108(%esp),%eax
# x4 = r
movl %esi,116(%esp)
# t += x8
addl 132(%esp),%edx
# x14 = v
movl %edi,156(%esp)
# p <<<= 18
rol $18,%eax
# p ^= x0
xorl 100(%esp),%eax
# t <<<= 18
rol $18,%edx
# t ^= x10
xorl 140(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 18
rol $18,%ecx
# s ^= x5
xorl 120(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 18
rol $18,%ebx
# w ^= x15
xorl 160(%esp),%ebx
# Rounds 3 and 4 of this pass: an exact repeat of the two rounds above.
# Third round (columns again).
# x0 = p
movl %eax,100(%esp)
# x10 = t
movl %edx,140(%esp)
# p += x12
addl 148(%esp),%eax
# x5 = s
movl %ecx,120(%esp)
# t += x6
addl 124(%esp),%edx
# x15 = w
movl %ebx,160(%esp)
# r = x1
movl 104(%esp),%esi
# r += s
add %ecx,%esi
# v = x11
movl 144(%esp),%edi
# v += w
add %ebx,%edi
# p <<<= 7
rol $7,%eax
# p ^= x4
xorl 116(%esp),%eax
# t <<<= 7
rol $7,%edx
# t ^= x14
xorl 156(%esp),%edx
# r <<<= 7
rol $7,%esi
# r ^= x9
xorl 136(%esp),%esi
# v <<<= 7
rol $7,%edi
# v ^= x3
xorl 112(%esp),%edi
# x4 = p
movl %eax,116(%esp)
# x14 = t
movl %edx,156(%esp)
# p += x0
addl 100(%esp),%eax
# x9 = r
movl %esi,136(%esp)
# t += x10
addl 140(%esp),%edx
# x3 = v
movl %edi,112(%esp)
# p <<<= 9
rol $9,%eax
# p ^= x8
xorl 132(%esp),%eax
# t <<<= 9
rol $9,%edx
# t ^= x2
xorl 108(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 9
rol $9,%ecx
# s ^= x13
xorl 152(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 9
rol $9,%ebx
# w ^= x7
xorl 128(%esp),%ebx
# x8 = p
movl %eax,132(%esp)
# x2 = t
movl %edx,108(%esp)
# p += x4
addl 116(%esp),%eax
# x13 = s
movl %ecx,152(%esp)
# t += x14
addl 156(%esp),%edx
# x7 = w
movl %ebx,128(%esp)
# p <<<= 13
rol $13,%eax
# p ^= x12
xorl 148(%esp),%eax
# t <<<= 13
rol $13,%edx
# t ^= x6
xorl 124(%esp),%edx
# r += s
add %ecx,%esi
# r <<<= 13
rol $13,%esi
# r ^= x1
xorl 104(%esp),%esi
# v += w
add %ebx,%edi
# v <<<= 13
rol $13,%edi
# v ^= x11
xorl 144(%esp),%edi
# x12 = p
movl %eax,148(%esp)
# x6 = t
movl %edx,124(%esp)
# p += x8
addl 132(%esp),%eax
# x1 = r
movl %esi,104(%esp)
# t += x2
addl 108(%esp),%edx
# x11 = v
movl %edi,144(%esp)
# p <<<= 18
rol $18,%eax
# p ^= x0
xorl 100(%esp),%eax
# t <<<= 18
rol $18,%edx
# t ^= x10
xorl 140(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 18
rol $18,%ecx
# s ^= x5
xorl 120(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 18
rol $18,%ebx
# w ^= x15
xorl 160(%esp),%ebx
# Fourth round (rows again).
# x0 = p
movl %eax,100(%esp)
# x10 = t
movl %edx,140(%esp)
# p += x3
addl 112(%esp),%eax
# p <<<= 7
rol $7,%eax
# x5 = s
movl %ecx,120(%esp)
# t += x9
addl 136(%esp),%edx
# x15 = w
movl %ebx,160(%esp)
# r = x4
movl 116(%esp),%esi
# r += s
add %ecx,%esi
# v = x14
movl 156(%esp),%edi
# v += w
add %ebx,%edi
# p ^= x1
xorl 104(%esp),%eax
# t <<<= 7
rol $7,%edx
# t ^= x11
xorl 144(%esp),%edx
# r <<<= 7
rol $7,%esi
# r ^= x6
xorl 124(%esp),%esi
# v <<<= 7
rol $7,%edi
# v ^= x12
xorl 148(%esp),%edi
# x1 = p
movl %eax,104(%esp)
# x11 = t
movl %edx,144(%esp)
# p += x0
addl 100(%esp),%eax
# x6 = r
movl %esi,124(%esp)
# t += x10
addl 140(%esp),%edx
# x12 = v
movl %edi,148(%esp)
# p <<<= 9
rol $9,%eax
# p ^= x2
xorl 108(%esp),%eax
# t <<<= 9
rol $9,%edx
# t ^= x8
xorl 132(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 9
rol $9,%ecx
# s ^= x7
xorl 128(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 9
rol $9,%ebx
# w ^= x13
xorl 152(%esp),%ebx
# x2 = p
movl %eax,108(%esp)
# x8 = t
movl %edx,132(%esp)
# p += x1
addl 104(%esp),%eax
# x7 = s
movl %ecx,128(%esp)
# t += x11
addl 144(%esp),%edx
# x13 = w
movl %ebx,152(%esp)
# p <<<= 13
rol $13,%eax
# p ^= x3
xorl 112(%esp),%eax
# t <<<= 13
rol $13,%edx
# t ^= x9
xorl 136(%esp),%edx
# r += s
add %ecx,%esi
# r <<<= 13
rol $13,%esi
# r ^= x4
xorl 116(%esp),%esi
# v += w
add %ebx,%edi
# v <<<= 13
rol $13,%edi
# v ^= x14
xorl 156(%esp),%edi
# x3 = p
movl %eax,112(%esp)
# x9 = t
movl %edx,136(%esp)
# p += x2
addl 108(%esp),%eax
# x4 = r
movl %esi,116(%esp)
# t += x8
addl 132(%esp),%edx
# x14 = v
movl %edi,156(%esp)
# p <<<= 18
rol $18,%eax
# p ^= x0
xorl 100(%esp),%eax
# t <<<= 18
rol $18,%edx
# t ^= x10
xorl 140(%esp),%edx
# s += r
add %esi,%ecx
# s <<<= 18
rol $18,%ecx
# s ^= x5
xorl 120(%esp),%ecx
# w += v
add %edi,%ebx
# w <<<= 18
rol $18,%ebx
# w ^= x15
xorl 160(%esp),%ebx
# i -= 4
sub $4,%ebp
# goto mainloop if unsigned >
# (i runs 20, 16, 12, 8, 4; the branch falls through once i reaches 0)
ja ._mainloop
# The rounds are finished. p, s, t and w still live only in registers, so
# flush them to their x slots before the registers are reused.
# x0 = p
movl %eax,100(%esp)
# x5 = s
movl %ecx,120(%esp)
# x10 = t
movl %edx,140(%esp)
# x15 = w
movl %ebx,160(%esp)
# esi and edi held r and v during the rounds; reload the data pointers.
# out = out_backup
movl 72(%esp),%edi
# m = m_backup
movl 68(%esp),%esi
# Produce the output block two words at a time:
#   out[k] = (x[k] + j[k]) ^ m[k]   for k = 0..15
# in0 = x0
movl 100(%esp),%eax
# in1 = x1
movl 104(%esp),%ecx
# in0 += j0
addl 164(%esp),%eax
# in1 += j1
addl 168(%esp),%ecx
# in0 ^= *(uint32 *) (m + 0)
xorl 0(%esi),%eax
# in1 ^= *(uint32 *) (m + 4)
xorl 4(%esi),%ecx
# *(uint32 *) (out + 0) = in0
movl %eax,0(%edi)
# *(uint32 *) (out + 4) = in1
movl %ecx,4(%edi)
# in2 = x2
movl 108(%esp),%eax
# in3 = x3
movl 112(%esp),%ecx
# in2 += j2
addl 172(%esp),%eax
# in3 += j3
addl 176(%esp),%ecx
# in2 ^= *(uint32 *) (m + 8)
xorl 8(%esi),%eax
# in3 ^= *(uint32 *) (m + 12)
xorl 12(%esi),%ecx
# *(uint32 *) (out + 8) = in2
movl %eax,8(%edi)
# *(uint32 *) (out + 12) = in3
movl %ecx,12(%edi)
# in4 = x4
movl 116(%esp),%eax
# in5 = x5
movl 120(%esp),%ecx
# in4 += j4
addl 180(%esp),%eax
# in5 += j5
addl 184(%esp),%ecx
# in4 ^= *(uint32 *) (m + 16)
xorl 16(%esi),%eax
# in5 ^= *(uint32 *) (m + 20)
xorl 20(%esi),%ecx
# *(uint32 *) (out + 16) = in4
movl %eax,16(%edi)
# *(uint32 *) (out + 20) = in5
movl %ecx,20(%edi)
# in6 = x6
movl 124(%esp),%eax
# in7 = x7
movl 128(%esp),%ecx
# in6 += j6
addl 188(%esp),%eax
# in7 += j7
addl 192(%esp),%ecx
# in6 ^= *(uint32 *) (m + 24)
xorl 24(%esi),%eax
# in7 ^= *(uint32 *) (m + 28)
xorl 28(%esi),%ecx
# *(uint32 *) (out + 24) = in6
movl %eax,24(%edi)
# *(uint32 *) (out + 28) = in7
movl %ecx,28(%edi)
# in8 = x8
movl 132(%esp),%eax
# in9 = x9
movl 136(%esp),%ecx
# in8 += j8
addl 196(%esp),%eax
# in9 += j9
addl 200(%esp),%ecx
# in8 ^= *(uint32 *) (m + 32)
xorl 32(%esi),%eax
# in9 ^= *(uint32 *) (m + 36)
xorl 36(%esi),%ecx
# *(uint32 *) (out + 32) = in8
movl %eax,32(%edi)
# *(uint32 *) (out + 36) = in9
movl %ecx,36(%edi)
# in10 = x10
movl 140(%esp),%eax
# in11 = x11
movl 144(%esp),%ecx
# in10 += j10
addl 204(%esp),%eax
# in11 += j11
addl 208(%esp),%ecx
# in10 ^= *(uint32 *) (m + 40)
xorl 40(%esi),%eax
# in11 ^= *(uint32 *) (m + 44)
xorl 44(%esi),%ecx
# *(uint32 *) (out + 40) = in10
movl %eax,40(%edi)
# *(uint32 *) (out + 44) = in11
movl %ecx,44(%edi)
# in12 = x12
movl 148(%esp),%eax
# in13 = x13
movl 152(%esp),%ecx
# in12 += j12
addl 212(%esp),%eax
# in13 += j13
addl 216(%esp),%ecx
# in12 ^= *(uint32 *) (m + 48)
xorl 48(%esi),%eax
# in13 ^= *(uint32 *) (m + 52)
xorl 52(%esi),%ecx
# *(uint32 *) (out + 48) = in12
movl %eax,48(%edi)
# *(uint32 *) (out + 52) = in13
movl %ecx,52(%edi)
# in14 = x14
movl 156(%esp),%eax
# in15 = x15
movl 160(%esp),%ecx
# in14 += j14
addl 220(%esp),%eax
# in15 += j15
addl 224(%esp),%ecx
# in14 ^= *(uint32 *) (m + 56)
xorl 56(%esi),%eax
# in15 ^= *(uint32 *) (m + 60)
xorl 60(%esi),%ecx
# *(uint32 *) (out + 56) = in14
movl %eax,56(%edi)
# *(uint32 *) (out + 60) = in15
movl %ecx,60(%edi)
# bytes = bytes_backup
movl 76(%esp),%ebx
# Advance the block counter for the next block. j8 is the low word and j9
# the high word: the carry out of j8 is added into j9.
# in8 = j8
movl 196(%esp),%eax
# in9 = j9
movl 200(%esp),%ecx
# in8 += 1
add $1,%eax
# in9 += 0 + carry
adc $0,%ecx
# j8 = in8
movl %eax,196(%esp)
# j9 = in9
movl %ecx,200(%esp)
# Three-way dispatch on the byte count this block started with:
#   more than 64   advance the pointers and process another block
#   exactly 64     finished, nothing to copy
#   less than 64   the result is in tmp; copy bytes of it to ctarget
# bytes - 64
cmp $64,%ebx
# goto bytesatleast65 if unsigned>
ja ._bytesatleast65
# goto bytesatleast64 if unsigned>=
# (greater-than was handled above, so this is taken only when bytes == 64)
jae ._bytesatleast64
# m = out
# (out still points at tmp here, which now holds the processed block)
mov %edi,%esi
# out = ctarget
movl 228(%esp),%edi
# i = bytes
mov %ebx,%ecx
# while (i) { *out++ = *m++; --i }
rep movsb
._bytesatleast64:
# Store the advanced block counter back into the caller state. Only words
# 8 and 9 of x are written; the other 14 words are left as they were.
# x = x_backup
movl 64(%esp),%eax
# in8 = j8
movl 196(%esp),%ecx
# in9 = j9
movl 200(%esp),%edx
# *(uint32 *) (x + 32) = in8
movl %ecx,32(%eax)
# *(uint32 *) (x + 36) = in9
movl %edx,36(%eax)
._done:
# Epilogue, also reached directly from entry when bytes == 0.
# eax = eax_stack
movl 80(%esp),%eax
# ebx = ebx_stack
movl 84(%esp),%ebx
# esi = esi_stack
movl 88(%esp),%esi
# edi = edi_stack
movl 92(%esp),%edi
# ebp = ebp_stack
movl 96(%esp),%ebp
# leave
# (eax holds the frame size saved at entry; adding it restores the entry esp)
add %eax,%esp
ret
# Placed after the ret and reached only through the ja above: more than one
# block was left, so consume 64 bytes and go round again. esi and edi still
# point at the block just processed in the caller buffers, because the tmp
# redirection is only used when fewer than 64 bytes remain.
._bytesatleast65:
# bytes -= 64
sub $64,%ebx
# out += 64
add $64,%edi
# m += 64
add $64,%esi
# goto bytesatleast1
jmp ._bytesatleast1
ENDPROC(salsa20_encrypt_bytes)