linux_dsm_epyc7002/arch/x86/crypto/aesni-intel_asm.S
Huang Ying 12387a46bb crypto: aesni-intel - Add AES-NI accelerated CTR mode
To take advantage of the hardware pipeline implementation of AES-NI
instructions, CTR mode encryption is implemented in ASM so that
multiple AES-NI instructions are scheduled one after another. This way,
some of the latency of the AES-NI instructions can be eliminated.

Performance testing based on dm-crypt shows a 50% reduction in
encryption/decryption time.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2010-03-10 18:28:55 +08:00

842 lines
17 KiB
ArmAsm

/*
* Implement AES algorithm in Intel AES-NI instructions.
*
* The white paper of AES-NI instructions can be downloaded from:
* http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
*
* Copyright (C) 2008, Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Kahraman Akdemir
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/linkage.h>
#include <asm/inst.h>
.text
/* Cipher state: STATE1..STATE4 hold the block(s) currently being processed. */
#define STATE1 %xmm0
#define STATE2 %xmm4
#define STATE3 %xmm5
#define STATE4 %xmm6
#define STATE STATE1
/* Copies of the input blocks, kept while the state registers are transformed. */
#define IN1 %xmm1
#define IN2 %xmm7
#define IN3 %xmm8
#define IN4 %xmm9
#define IN IN1
/* Round key most recently loaded from the key schedule. */
#define KEY %xmm2
/* Chaining value (CBC) or counter block (CTR). */
#define IV %xmm3
/* Registers set up by _aesni_inc_init and used by _aesni_inc (CTR mode). */
#define BSWAP_MASK %xmm10
#define CTR %xmm11
#define INC %xmm12
/*
 * Function arguments in order: ctx, dst, src, len, iv
 * (len and iv are only passed to the block-mode routines).
 */
#define KEYP %rdi
#define OUTP %rsi
#define INP %rdx
#define LEN %rcx
#define IVP %r8
/* Key length, loaded from offset 480 of the context by each entry point. */
#define KLEN %r9d
/* Scratch registers: round-key cursor and low qword of the counter. */
#define T1 %r10
#define TKEYP T1
#define T2 %r11
#define TCTR_LOW T2
/*
 * _key_expansion_128 / _key_expansion_256a: internal ABI
 * input:
 *	%xmm0:	round key that is updated in place
 *	%xmm1:	AESKEYGENASSIST result for this round
 *	%xmm4:	scratch, zeroed by aesni_set_key before the first call
 *	%rcx:	where to store the new round key
 * output:
 *	%xmm0:	new round key, also stored at (%rcx)
 *	%rcx:	advanced by 16
 * changed:
 *	%xmm1, %xmm4
 */
_key_expansion_128:
_key_expansion_256a:
	shufps	$0b00010000, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0b10001100, %xmm0, %xmm4
	pxor	%xmm4, %xmm0

	pshufd	$0b11111111, %xmm1, %xmm1	# broadcast dword 3 of the assist value
	pxor	%xmm1, %xmm0

	movaps	%xmm0, (%rcx)
	add	$0x10, %rcx
	ret
/*
 * _key_expansion_192a: internal ABI
 * input:
 *	%xmm0, %xmm2:	key material that is updated in place
 *	%xmm1:		AESKEYGENASSIST result for this round
 *	%xmm4:		scratch, zeroed by aesni_set_key before the first call
 *	%rcx:		where to store the new round keys
 * output:
 *	32 bytes stored at (%rcx); %rcx advanced by 32
 * changed:
 *	%xmm0 - %xmm6
 */
_key_expansion_192a:
	shufps	$0b00010000, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0b10001100, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	pshufd	$0b01010101, %xmm1, %xmm1	# broadcast dword 1 of the assist value
	pxor	%xmm1, %xmm0			# %xmm0 is final from here on

	movaps	%xmm2, %xmm6			# old %xmm2, needed for the first store
	movaps	%xmm2, %xmm5
	pslldq	$4, %xmm5
	pshufd	$0b11111111, %xmm0, %xmm3
	pxor	%xmm3, %xmm2
	pxor	%xmm5, %xmm2			# %xmm2 is final from here on

	shufps	$0b01000100, %xmm0, %xmm6	# old %xmm2 low half : new %xmm0 low half
	movaps	%xmm6, (%rcx)
	movaps	%xmm0, %xmm1
	shufps	$0b01001110, %xmm2, %xmm1	# new %xmm0 high half : new %xmm2 low half
	movaps	%xmm1, 16(%rcx)

	add	$0x20, %rcx
	ret
/*
 * _key_expansion_192b: internal ABI
 * input:
 *	%xmm0, %xmm2:	key material that is updated in place
 *	%xmm1:		AESKEYGENASSIST result for this round
 *	%xmm4:		scratch, zeroed by aesni_set_key before the first call
 *	%rcx:		where to store the new round key
 * output:
 *	new %xmm0 stored at (%rcx); %rcx advanced by 16
 * changed:
 *	%xmm0 - %xmm5
 */
_key_expansion_192b:
	shufps	$0b00010000, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	shufps	$0b10001100, %xmm0, %xmm4
	pxor	%xmm4, %xmm0
	pshufd	$0b01010101, %xmm1, %xmm1	# broadcast dword 1 of the assist value
	pxor	%xmm1, %xmm0			# %xmm0 is final from here on
	movaps	%xmm0, (%rcx)

	movaps	%xmm2, %xmm5
	pslldq	$4, %xmm5
	pshufd	$0b11111111, %xmm0, %xmm3
	pxor	%xmm3, %xmm2
	pxor	%xmm5, %xmm2

	add	$0x10, %rcx
	ret
/*
 * _key_expansion_256b: internal ABI
 * input:
 *	%xmm2:	round key that is updated in place
 *	%xmm1:	AESKEYGENASSIST result for this round
 *	%xmm4:	scratch, zeroed by aesni_set_key before the first call
 *	%rcx:	where to store the new round key
 * output:
 *	%xmm2:	new round key, also stored at (%rcx)
 *	%rcx:	advanced by 16
 * changed:
 *	%xmm1, %xmm4
 */
_key_expansion_256b:
	shufps	$0b00010000, %xmm2, %xmm4
	pxor	%xmm4, %xmm2
	shufps	$0b10001100, %xmm2, %xmm4
	pxor	%xmm4, %xmm2

	pshufd	$0b10101010, %xmm1, %xmm1	# broadcast dword 2 of the assist value
	pxor	%xmm1, %xmm2

	movaps	%xmm2, (%rcx)
	add	$0x10, %rcx
	ret
/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 *
 * Expands the user key into the encryption key schedule at the start of
 * ctx, builds the decryption key schedule at ctx + 240 and records
 * key_len at ctx + 480.  Always returns 0.
 *
 * input:
 *	%rdi:	ctx
 *	%rsi:	in_key
 *	%edx:	key_len; its low byte is compared with 24 to choose the
 *		128-bit (below), 192-bit (equal) or 256-bit (above) path
 * changed:
 *	%rax, %rcx, %rsi, %rdi, %xmm0 - %xmm6
 */
ENTRY(aesni_set_key)
	movups	(%rsi), %xmm0		# user key (first 16 bytes)
	movaps	%xmm0, (%rdi)		# ... is round key 0
	lea	0x10(%rdi), %rcx	# %rcx = next round key slot
	movl	%edx, 480(%rdi)		# remember the key length
	pxor	%xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp	$24, %dl
	jb	.Lenc_key128
	je	.Lenc_key192

	/* 256-bit key: second half of the user key is round key 1 */
	movups	0x10(%rsi), %xmm2	# other user key
	movaps	%xmm2, (%rcx)
	add	$0x10, %rcx
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call	_key_expansion_256a
	AESKEYGENASSIST 0x1 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call	_key_expansion_256a
	AESKEYGENASSIST 0x2 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call	_key_expansion_256a
	AESKEYGENASSIST 0x4 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call	_key_expansion_256a
	AESKEYGENASSIST 0x8 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call	_key_expansion_256a
	AESKEYGENASSIST 0x10 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call	_key_expansion_256a
	AESKEYGENASSIST 0x20 %xmm0 %xmm1
	call	_key_expansion_256b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call	_key_expansion_256a
	jmp	.Ldec_key

.Lenc_key192:
	/* 192-bit key: only 8 more bytes of user key remain */
	movq	0x10(%rsi), %xmm2	# other user key
	AESKEYGENASSIST 0x1 %xmm2 %xmm1		# round 1
	call	_key_expansion_192a
	AESKEYGENASSIST 0x2 %xmm2 %xmm1		# round 2
	call	_key_expansion_192b
	AESKEYGENASSIST 0x4 %xmm2 %xmm1		# round 3
	call	_key_expansion_192a
	AESKEYGENASSIST 0x8 %xmm2 %xmm1		# round 4
	call	_key_expansion_192b
	AESKEYGENASSIST 0x10 %xmm2 %xmm1	# round 5
	call	_key_expansion_192a
	AESKEYGENASSIST 0x20 %xmm2 %xmm1	# round 6
	call	_key_expansion_192b
	AESKEYGENASSIST 0x40 %xmm2 %xmm1	# round 7
	call	_key_expansion_192a
	AESKEYGENASSIST 0x80 %xmm2 %xmm1	# round 8
	call	_key_expansion_192b
	jmp	.Ldec_key

.Lenc_key128:
	AESKEYGENASSIST 0x1 %xmm0 %xmm1		# round 1
	call	_key_expansion_128
	AESKEYGENASSIST 0x2 %xmm0 %xmm1		# round 2
	call	_key_expansion_128
	AESKEYGENASSIST 0x4 %xmm0 %xmm1		# round 3
	call	_key_expansion_128
	AESKEYGENASSIST 0x8 %xmm0 %xmm1		# round 4
	call	_key_expansion_128
	AESKEYGENASSIST 0x10 %xmm0 %xmm1	# round 5
	call	_key_expansion_128
	AESKEYGENASSIST 0x20 %xmm0 %xmm1	# round 6
	call	_key_expansion_128
	AESKEYGENASSIST 0x40 %xmm0 %xmm1	# round 7
	call	_key_expansion_128
	AESKEYGENASSIST 0x80 %xmm0 %xmm1	# round 8
	call	_key_expansion_128
	AESKEYGENASSIST 0x1b %xmm0 %xmm1	# round 9
	call	_key_expansion_128
	AESKEYGENASSIST 0x36 %xmm0 %xmm1	# round 10
	call	_key_expansion_128

.Ldec_key:
	/*
	 * Build the decryption schedule 240 bytes above the encryption
	 * schedule: the first and last round keys are swapped as they
	 * are, the ones in between go through AESIMC and are stored in
	 * reverse order.
	 */
	sub	$0x10, %rcx		# %rcx = last encryption round key
	movaps	(%rdi), %xmm0
	movaps	(%rcx), %xmm1
	movaps	%xmm0, 240(%rcx)	# first enc key -> last dec key
	movaps	%xmm1, 240(%rdi)	# last enc key -> first dec key
	add	$0x10, %rdi		# %rdi walks the enc keys upwards
	lea	240-16(%rcx), %rsi	# %rsi walks the dec keys downwards
	.align 4
.Ldec_key_loop:
	movaps	(%rdi), %xmm0
	AESIMC %xmm0 %xmm1
	movaps	%xmm1, (%rsi)
	sub	$0x10, %rsi
	add	$0x10, %rdi
	cmp	%rcx, %rdi
	jb	.Ldec_key_loop

	xor	%eax, %eax		# return 0
	ret
/*
 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Encrypts the single 16-byte block at src and writes the result to dst.
 */
ENTRY(aesni_enc)
	movups	(INP), STATE		# input block
	movl	480(KEYP), KLEN		# key length
	call	_aesni_enc1
	movups	STATE, (OUTP)		# output block
	ret
/*
 * _aesni_enc1:		internal ABI
 *
 * Encrypts one block.  TKEYP is positioned so that the rounds common
 * to all key sizes use the same displacements; the 192- and 256-bit
 * paths run their extra rounds first and then fall through.
 *
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length (compared with 24 to select the path)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_enc1:
	movaps	(KEYP), KEY		# round key 0
	pxor	KEY, STATE		# round 0
	lea	0x30(KEYP), TKEYP
	cmp	$24, KLEN
	jb	.Lenc128
	lea	0x20(TKEYP), TKEYP	# lea keeps the flags for the je below
	je	.Lenc192

	/* 256-bit key: two more rounds than the 192-bit case */
	add	$0x20, TKEYP
	movaps	-0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps	-0x50(TKEYP), KEY
	AESENC KEY STATE
	.align 4
.Lenc192:
	/* 192-bit key: two more rounds than the 128-bit case */
	movaps	-0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps	-0x30(TKEYP), KEY
	AESENC KEY STATE
	.align 4
.Lenc128:
	/* rounds shared by every key size */
	movaps	-0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps	-0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps	(TKEYP), KEY
	AESENC KEY STATE
	movaps	0x10(TKEYP), KEY
	AESENC KEY STATE
	movaps	0x20(TKEYP), KEY
	AESENC KEY STATE
	movaps	0x30(TKEYP), KEY
	AESENC KEY STATE
	movaps	0x40(TKEYP), KEY
	AESENC KEY STATE
	movaps	0x50(TKEYP), KEY
	AESENC KEY STATE
	movaps	0x60(TKEYP), KEY
	AESENC KEY STATE
	movaps	0x70(TKEYP), KEY
	AESENCLAST KEY STATE		# last round
	ret
/*
 * _aesni_enc4:		internal ABI
 *
 * Encrypts four blocks at once.  Every round key is loaded once and
 * applied to all four states back to back.
 *
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		key length (compared with 24 to select the path)
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_enc4:
	movaps	(KEYP), KEY		# round key 0
	pxor	KEY, STATE1		# round 0
	pxor	KEY, STATE2
	pxor	KEY, STATE3
	pxor	KEY, STATE4
	lea	0x30(KEYP), TKEYP
	cmp	$24, KLEN
	jb	.L4enc128
	lea	0x20(TKEYP), TKEYP	# lea keeps the flags for the je below
	je	.L4enc192

	/* 256-bit key: two more rounds than the 192-bit case */
	add	$0x20, TKEYP
	movaps	-0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4

	movaps	-0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc192:
	/* 192-bit key: two more rounds than the 128-bit case */
	movaps	-0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4

	movaps	-0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4
#.align 4
.L4enc128:
	/* rounds shared by every key size */
	movaps	-0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4

	movaps	-0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4

	movaps	(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4

	movaps	0x10(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4

	movaps	0x20(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4

	movaps	0x30(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4

	movaps	0x40(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4

	movaps	0x50(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4

	movaps	0x60(TKEYP), KEY
	AESENC KEY STATE1
	AESENC KEY STATE2
	AESENC KEY STATE3
	AESENC KEY STATE4

	movaps	0x70(TKEYP), KEY
	AESENCLAST KEY STATE1		# last round
	AESENCLAST KEY STATE2
	AESENCLAST KEY STATE3
	AESENCLAST KEY STATE4
	ret
/*
 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
 *
 * Decrypts the single 16-byte block at src and writes the result to dst.
 */
ENTRY(aesni_dec)
	movups	(INP), STATE		# input block
	movl	480(KEYP), KLEN		# key length, read before KEYP moves
	add	$240, KEYP		# switch to the decryption key schedule
	call	_aesni_dec1
	movups	STATE, (OUTP)		# output block
	ret
/*
 * _aesni_dec1:		internal ABI
 *
 * Decrypts one block.  Same layout as _aesni_enc1: the 192- and
 * 256-bit paths run their extra rounds first and then fall through
 * into the rounds shared by every key size.
 *
 * input:
 *	KEYP:		key struct pointer (already at the decryption keys)
 *	KLEN:		key length
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_dec1:
	movaps	(KEYP), KEY		# round key 0
	pxor	KEY, STATE		# round 0
	lea	0x30(KEYP), TKEYP
	cmp	$24, KLEN
	jb	.Ldec128
	lea	0x20(TKEYP), TKEYP	# lea keeps the flags for the je below
	je	.Ldec192

	/* 256-bit key: two more rounds than the 192-bit case */
	add	$0x20, TKEYP
	movaps	-0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps	-0x50(TKEYP), KEY
	AESDEC KEY STATE
	.align 4
.Ldec192:
	/* 192-bit key: two more rounds than the 128-bit case */
	movaps	-0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps	-0x30(TKEYP), KEY
	AESDEC KEY STATE
	.align 4
.Ldec128:
	/* rounds shared by every key size */
	movaps	-0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps	-0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps	(TKEYP), KEY
	AESDEC KEY STATE
	movaps	0x10(TKEYP), KEY
	AESDEC KEY STATE
	movaps	0x20(TKEYP), KEY
	AESDEC KEY STATE
	movaps	0x30(TKEYP), KEY
	AESDEC KEY STATE
	movaps	0x40(TKEYP), KEY
	AESDEC KEY STATE
	movaps	0x50(TKEYP), KEY
	AESDEC KEY STATE
	movaps	0x60(TKEYP), KEY
	AESDEC KEY STATE
	movaps	0x70(TKEYP), KEY
	AESDECLAST KEY STATE		# last round
	ret
/*
 * _aesni_dec4:		internal ABI
 *
 * Decrypts four blocks at once.  Every round key is loaded once and
 * applied to all four states back to back.
 *
 * input:
 *	KEYP:		key struct pointer (already at the decryption keys)
 *	KLEN:		key length
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
_aesni_dec4:
	movaps	(KEYP), KEY		# round key 0
	pxor	KEY, STATE1		# round 0
	pxor	KEY, STATE2
	pxor	KEY, STATE3
	pxor	KEY, STATE4
	lea	0x30(KEYP), TKEYP
	cmp	$24, KLEN
	jb	.L4dec128
	lea	0x20(TKEYP), TKEYP	# lea keeps the flags for the je below
	je	.L4dec192

	/* 256-bit key: two more rounds than the 192-bit case */
	add	$0x20, TKEYP
	movaps	-0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4

	movaps	-0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	.align 4
.L4dec192:
	/* 192-bit key: two more rounds than the 128-bit case */
	movaps	-0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4

	movaps	-0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4
	.align 4
.L4dec128:
	/* rounds shared by every key size */
	movaps	-0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4

	movaps	-0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4

	movaps	(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4

	movaps	0x10(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4

	movaps	0x20(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4

	movaps	0x30(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4

	movaps	0x40(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4

	movaps	0x50(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4

	movaps	0x60(TKEYP), KEY
	AESDEC KEY STATE1
	AESDEC KEY STATE2
	AESDEC KEY STATE3
	AESDEC KEY STATE4

	movaps	0x70(TKEYP), KEY
	AESDECLAST KEY STATE1		# last round
	AESDECLAST KEY STATE2
	AESDECLAST KEY STATE3
	AESDECLAST KEY STATE4
	ret
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypts every complete 16-byte block of src into dst, four
 * blocks at a time while at least 64 bytes remain and one block at a
 * time afterwards.  A trailing partial block (len % 16) is left
 * untouched.
 *
 * LEN is unsigned, so all length checks use unsigned condition codes.
 */
ENTRY(aesni_ecb_enc)
	test	LEN, LEN		# check length
	jz	.Lecb_enc_ret
	mov	480(KEYP), KLEN
	cmp	$16, LEN
	jb	.Lecb_enc_ret
	cmp	$64, LEN
	jb	.Lecb_enc_loop1
	.align 4
.Lecb_enc_loop4:
	movups	(INP), STATE1
	movups	0x10(INP), STATE2
	movups	0x20(INP), STATE3
	movups	0x30(INP), STATE4
	call	_aesni_enc4
	movups	STATE1, (OUTP)
	movups	STATE2, 0x10(OUTP)
	movups	STATE3, 0x20(OUTP)
	movups	STATE4, 0x30(OUTP)
	sub	$64, LEN
	add	$64, INP
	add	$64, OUTP
	cmp	$64, LEN
	jae	.Lecb_enc_loop4		# unsigned: at least 4 blocks left
	cmp	$16, LEN
	jb	.Lecb_enc_ret
	.align 4
.Lecb_enc_loop1:
	movups	(INP), STATE1
	call	_aesni_enc1
	movups	STATE1, (OUTP)
	sub	$16, LEN
	add	$16, INP
	add	$16, OUTP
	cmp	$16, LEN
	jae	.Lecb_enc_loop1		# unsigned: at least 1 block left
.Lecb_enc_ret:
	ret
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypts every complete 16-byte block of src into dst, four
 * blocks at a time while at least 64 bytes remain and one block at a
 * time afterwards.  A trailing partial block (len % 16) is left
 * untouched.
 *
 * LEN is unsigned, so all length checks use unsigned condition codes.
 */
ENTRY(aesni_ecb_dec)
	test	LEN, LEN
	jz	.Lecb_dec_ret
	mov	480(KEYP), KLEN		# key length, read before KEYP moves
	add	$240, KEYP		# switch to the decryption key schedule
	cmp	$16, LEN
	jb	.Lecb_dec_ret
	cmp	$64, LEN
	jb	.Lecb_dec_loop1
	.align 4
.Lecb_dec_loop4:
	movups	(INP), STATE1
	movups	0x10(INP), STATE2
	movups	0x20(INP), STATE3
	movups	0x30(INP), STATE4
	call	_aesni_dec4
	movups	STATE1, (OUTP)
	movups	STATE2, 0x10(OUTP)
	movups	STATE3, 0x20(OUTP)
	movups	STATE4, 0x30(OUTP)
	sub	$64, LEN
	add	$64, INP
	add	$64, OUTP
	cmp	$64, LEN
	jae	.Lecb_dec_loop4		# unsigned: at least 4 blocks left
	cmp	$16, LEN
	jb	.Lecb_dec_ret
	.align 4
.Lecb_dec_loop1:
	movups	(INP), STATE1
	call	_aesni_dec1
	movups	STATE1, (OUTP)
	sub	$16, LEN
	add	$16, INP
	add	$16, OUTP
	cmp	$16, LEN
	jae	.Lecb_dec_loop1		# unsigned: at least 1 block left
.Lecb_dec_ret:
	ret
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypts every complete 16-byte block of src into dst.  STATE
 * carries the chaining value: it starts as *iv, and each ciphertext
 * block stays in STATE to be XORed with the next input block.  The
 * last ciphertext block is written back to *iv.  Nothing is read or
 * written when len < 16.
 *
 * LEN is unsigned, so all length checks use unsigned condition codes.
 */
ENTRY(aesni_cbc_enc)
	cmp	$16, LEN
	jb	.Lcbc_enc_ret
	mov	480(KEYP), KLEN
	movups	(IVP), STATE		# load iv as initial state
	.align 4
.Lcbc_enc_loop:
	movups	(INP), IN		# load input
	pxor	IN, STATE		# chain with the previous ciphertext / iv
	call	_aesni_enc1
	movups	STATE, (OUTP)		# store output
	sub	$16, LEN
	add	$16, INP
	add	$16, OUTP
	cmp	$16, LEN
	jae	.Lcbc_enc_loop		# unsigned: at least 1 block left
	movups	STATE, (IVP)		# hand the chaining value back
.Lcbc_enc_ret:
	ret
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypts every complete 16-byte block of src into dst, four
 * blocks at a time while at least 64 bytes remain and one block at a
 * time afterwards.  The ciphertext blocks are kept in IN1..IN4 so that
 * each decrypted block can be XORed with the ciphertext block before
 * it; IV holds the block preceding the current batch.  The last
 * ciphertext block processed is written back to *iv.  Nothing is read
 * or written when len < 16.
 *
 * LEN is unsigned, so all length checks use unsigned condition codes.
 */
ENTRY(aesni_cbc_dec)
	cmp	$16, LEN
	jb	.Lcbc_dec_just_ret
	mov	480(KEYP), KLEN		# key length, read before KEYP moves
	add	$240, KEYP		# switch to the decryption key schedule
	movups	(IVP), IV
	cmp	$64, LEN
	jb	.Lcbc_dec_loop1
	.align 4
.Lcbc_dec_loop4:
	movups	(INP), IN1
	movaps	IN1, STATE1
	movups	0x10(INP), IN2
	movaps	IN2, STATE2
	movups	0x20(INP), IN3
	movaps	IN3, STATE3
	movups	0x30(INP), IN4
	movaps	IN4, STATE4
	call	_aesni_dec4
	pxor	IV, STATE1		# block n   ^= previous ciphertext / iv
	pxor	IN1, STATE2		# block n+1 ^= ciphertext n
	pxor	IN2, STATE3
	pxor	IN3, STATE4
	movaps	IN4, IV			# last ciphertext chains into the next batch
	movups	STATE1, (OUTP)
	movups	STATE2, 0x10(OUTP)
	movups	STATE3, 0x20(OUTP)
	movups	STATE4, 0x30(OUTP)
	sub	$64, LEN
	add	$64, INP
	add	$64, OUTP
	cmp	$64, LEN
	jae	.Lcbc_dec_loop4		# unsigned: at least 4 blocks left
	cmp	$16, LEN
	jb	.Lcbc_dec_ret
	.align 4
.Lcbc_dec_loop1:
	movups	(INP), IN
	movaps	IN, STATE
	call	_aesni_dec1
	pxor	IV, STATE
	movups	STATE, (OUTP)
	movaps	IN, IV
	sub	$16, LEN
	add	$16, INP
	add	$16, OUTP
	cmp	$16, LEN
	jae	.Lcbc_dec_loop1		# unsigned: at least 1 block left
.Lcbc_dec_ret:
	movups	IV, (IVP)		# hand the chaining value back
.Lcbc_dec_just_ret:
	ret
/*
 * PSHUFB control mask that reverses the order of all 16 bytes of an
 * XMM register (byte 15 goes to position 0, ..., byte 0 to position 15).
 * It is 16-byte aligned because _aesni_inc_init loads it with movaps.
 */
.align 16
.Lbswap_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
_aesni_inc_init:
	/* RIP-relative load: no absolute address is encoded in the text */
	movaps	.Lbswap_mask(%rip), BSWAP_MASK
	movaps	IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	# big endian IV -> little endian CTR
	mov	$1, TCTR_LOW
	movq	TCTR_LOW, INC		# INC = 1 in the low qword, 0 above
	movq	CTR, TCTR_LOW		# integer copy used for carry detection
	ret
/*
 * _aesni_inc:		internal ABI
 *	Add 1 to the big endian counter held in IV.
 *
 *	The addition is done on the little endian copy in CTR.  paddq
 *	adds each qword separately, so the carry out of the low qword
 *	is detected on the integer copy TCTR_LOW and applied to the
 *	high qword by hand.
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	increased by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
_aesni_inc:
	add	$1, TCTR_LOW		# sets CF when the low qword wraps
	paddq	INC, CTR		# SSE add, leaves the flags alone
	jnc	.Linc_low
	pslldq	$8, INC			# move the 1 into the high qword
	paddq	INC, CTR		# propagate the carry
	psrldq	$8, INC			# restore INC == 1
.Linc_low:
	movaps	CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	# back to big endian
	ret
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR mode over every complete 16-byte block of src: the current
 * counter block is encrypted and XORed with the input, and the counter
 * is increased by one per block.  Four blocks are handled at a time
 * while at least 64 bytes remain, then one block at a time.  The
 * counter following the last processed block is written back to *iv.
 * Nothing is read or written when len < 16, and a trailing partial
 * block is left untouched.
 *
 * LEN is unsigned, so all length checks use unsigned condition codes.
 */
ENTRY(aesni_ctr_enc)
	cmp	$16, LEN
	jb	.Lctr_enc_just_ret
	mov	480(KEYP), KLEN
	movups	(IVP), IV
	call	_aesni_inc_init
	cmp	$64, LEN
	jb	.Lctr_enc_loop1
	.align 4
.Lctr_enc_loop4:
	/* counter blocks n .. n+3 go into STATE1..4, inputs into IN1..4 */
	movaps	IV, STATE1
	call	_aesni_inc
	movups	(INP), IN1
	movaps	IV, STATE2
	call	_aesni_inc
	movups	0x10(INP), IN2
	movaps	IV, STATE3
	call	_aesni_inc
	movups	0x20(INP), IN3
	movaps	IV, STATE4
	call	_aesni_inc
	movups	0x30(INP), IN4
	call	_aesni_enc4
	pxor	IN1, STATE1		# keystream ^ input
	movups	STATE1, (OUTP)
	pxor	IN2, STATE2
	movups	STATE2, 0x10(OUTP)
	pxor	IN3, STATE3
	movups	STATE3, 0x20(OUTP)
	pxor	IN4, STATE4
	movups	STATE4, 0x30(OUTP)
	sub	$64, LEN
	add	$64, INP
	add	$64, OUTP
	cmp	$64, LEN
	jae	.Lctr_enc_loop4		# unsigned: at least 4 blocks left
	cmp	$16, LEN
	jb	.Lctr_enc_ret
	.align 4
.Lctr_enc_loop1:
	movaps	IV, STATE
	call	_aesni_inc
	movups	(INP), IN
	call	_aesni_enc1
	pxor	IN, STATE		# keystream ^ input
	movups	STATE, (OUTP)
	sub	$16, LEN
	add	$16, INP
	add	$16, OUTP
	cmp	$16, LEN
	jae	.Lctr_enc_loop1		# unsigned: at least 1 block left
.Lctr_enc_ret:
	movups	IV, (IVP)		# hand the next counter value back
.Lctr_enc_just_ret:
	ret