mirror of
https://github.com/AuxXxilium/linux_dsm_epyc7002.git
synced 2025-01-16 01:16:16 +07:00
16dee78005
We use _GLOBAL so there is no need to do the manual alignment, in fact it causes a build failure. Reported-by: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
3790 lines
91 KiB
Raku
3790 lines
91 KiB
Raku
#! /usr/bin/env perl
|
|
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the OpenSSL license (the "License"). You may not use
|
|
# this file except in compliance with the License. You can obtain a copy
|
|
# in the file LICENSE in the source distribution or at
|
|
# https://www.openssl.org/source/license.html
|
|
|
|
#
|
|
# ====================================================================
|
|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|
# project. The module is, however, dual licensed under OpenSSL and
|
|
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
|
# details see http://www.openssl.org/~appro/cryptogams/.
|
|
# ====================================================================
|
|
#
|
|
# This module implements support for AES instructions as per PowerISA
|
|
# specification version 2.07, first implemented by POWER8 processor.
|
|
# The module is endian-agnostic in sense that it supports both big-
|
|
# and little-endian cases. Data alignment in parallelizable modes is
|
|
# handled with VSX loads and stores, which implies MSR.VSX flag being
|
|
# set. It should also be noted that ISA specification doesn't prohibit
|
|
# alignment exceptions for these instructions on page boundaries.
|
|
# Initially alignment was handled in pure AltiVec/VMX way [when data
|
|
# is aligned programmatically, which in turn guarantees exception-
|
|
# free execution], but it turned to hamper performance when vcipher
|
|
# instructions are interleaved. It's reckoned that eventual
|
|
# misalignment penalties at page boundaries are in average lower
|
|
# than additional overhead in pure AltiVec approach.
|
|
#
|
|
# May 2016
|
|
#
|
|
# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
|
|
# systems were measured.
|
|
#
|
|
######################################################################
|
|
# Current large-block performance in cycles per byte processed with
|
|
# 128-bit key (less is better).
|
|
#
|
|
# CBC en-/decrypt CTR XTS
|
|
# POWER8[le] 3.96/0.72 0.74 1.1
|
|
# POWER8[be] 3.75/0.65 0.66 1.0
|
|
|
|
$flavour = shift;
|
|
|
|
if ($flavour =~ /64/) {
|
|
$SIZE_T =8;
|
|
$LRSAVE =2*$SIZE_T;
|
|
$STU ="stdu";
|
|
$POP ="ld";
|
|
$PUSH ="std";
|
|
$UCMP ="cmpld";
|
|
$SHL ="sldi";
|
|
} elsif ($flavour =~ /32/) {
|
|
$SIZE_T =4;
|
|
$LRSAVE =$SIZE_T;
|
|
$STU ="stwu";
|
|
$POP ="lwz";
|
|
$PUSH ="stw";
|
|
$UCMP ="cmplw";
|
|
$SHL ="slwi";
|
|
} else { die "nonsense $flavour"; }
|
|
|
|
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
|
|
|
|
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
|
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
|
|
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
|
|
die "can't locate ppc-xlate.pl";
|
|
|
|
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
|
|
|
|
$FRAME=8*$SIZE_T;
|
|
$prefix="aes_p8";
|
|
|
|
$sp="r1";
|
|
$vrsave="r12";
|
|
|
|
#########################################################################
|
|
{{{ # Key setup procedures #
|
|
my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
|
|
my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
|
|
my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
|
|
|
|
$code.=<<___;
|
|
.machine "any"
|
|
|
|
.text
|
|
|
|
.align 7
|
|
rcon:
|
|
.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 ?rev
|
|
.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev
|
|
.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev
|
|
.long 0,0,0,0 ?asis
|
|
Lconsts:
|
|
mflr r0
|
|
bcl 20,31,\$+4
|
|
mflr $ptr #vvvvv "distance between . and rcon
|
|
addi $ptr,$ptr,-0x48
|
|
mtlr r0
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x14,0,0,0,0,0
|
|
.asciz "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
|
|
|
|
.globl .${prefix}_set_encrypt_key
|
|
Lset_encrypt_key:
|
|
mflr r11
|
|
$PUSH r11,$LRSAVE($sp)
|
|
|
|
li $ptr,-1
|
|
${UCMP}i $inp,0
|
|
beq- Lenc_key_abort # if ($inp==0) return -1;
|
|
${UCMP}i $out,0
|
|
beq- Lenc_key_abort # if ($out==0) return -1;
|
|
li $ptr,-2
|
|
cmpwi $bits,128
|
|
blt- Lenc_key_abort
|
|
cmpwi $bits,256
|
|
bgt- Lenc_key_abort
|
|
andi. r0,$bits,0x3f
|
|
bne- Lenc_key_abort
|
|
|
|
lis r0,0xfff0
|
|
mfspr $vrsave,256
|
|
mtspr 256,r0
|
|
|
|
bl Lconsts
|
|
mtlr r11
|
|
|
|
neg r9,$inp
|
|
lvx $in0,0,$inp
|
|
addi $inp,$inp,15 # 15 is not typo
|
|
lvsr $key,0,r9 # borrow $key
|
|
li r8,0x20
|
|
cmpwi $bits,192
|
|
lvx $in1,0,$inp
|
|
le?vspltisb $mask,0x0f # borrow $mask
|
|
lvx $rcon,0,$ptr
|
|
le?vxor $key,$key,$mask # adjust for byte swap
|
|
lvx $mask,r8,$ptr
|
|
addi $ptr,$ptr,0x10
|
|
vperm $in0,$in0,$in1,$key # align [and byte swap in LE]
|
|
li $cnt,8
|
|
vxor $zero,$zero,$zero
|
|
mtctr $cnt
|
|
|
|
?lvsr $outperm,0,$out
|
|
vspltisb $outmask,-1
|
|
lvx $outhead,0,$out
|
|
?vperm $outmask,$zero,$outmask,$outperm
|
|
|
|
blt Loop128
|
|
addi $inp,$inp,8
|
|
beq L192
|
|
addi $inp,$inp,8
|
|
b L256
|
|
|
|
.align 4
|
|
Loop128:
|
|
vperm $key,$in0,$in0,$mask # rotate-n-splat
|
|
vsldoi $tmp,$zero,$in0,12 # >>32
|
|
vperm $outtail,$in0,$in0,$outperm # rotate
|
|
vsel $stage,$outhead,$outtail,$outmask
|
|
vmr $outhead,$outtail
|
|
vcipherlast $key,$key,$rcon
|
|
stvx $stage,0,$out
|
|
addi $out,$out,16
|
|
|
|
vxor $in0,$in0,$tmp
|
|
vsldoi $tmp,$zero,$tmp,12 # >>32
|
|
vxor $in0,$in0,$tmp
|
|
vsldoi $tmp,$zero,$tmp,12 # >>32
|
|
vxor $in0,$in0,$tmp
|
|
vadduwm $rcon,$rcon,$rcon
|
|
vxor $in0,$in0,$key
|
|
bdnz Loop128
|
|
|
|
lvx $rcon,0,$ptr # last two round keys
|
|
|
|
vperm $key,$in0,$in0,$mask # rotate-n-splat
|
|
vsldoi $tmp,$zero,$in0,12 # >>32
|
|
vperm $outtail,$in0,$in0,$outperm # rotate
|
|
vsel $stage,$outhead,$outtail,$outmask
|
|
vmr $outhead,$outtail
|
|
vcipherlast $key,$key,$rcon
|
|
stvx $stage,0,$out
|
|
addi $out,$out,16
|
|
|
|
vxor $in0,$in0,$tmp
|
|
vsldoi $tmp,$zero,$tmp,12 # >>32
|
|
vxor $in0,$in0,$tmp
|
|
vsldoi $tmp,$zero,$tmp,12 # >>32
|
|
vxor $in0,$in0,$tmp
|
|
vadduwm $rcon,$rcon,$rcon
|
|
vxor $in0,$in0,$key
|
|
|
|
vperm $key,$in0,$in0,$mask # rotate-n-splat
|
|
vsldoi $tmp,$zero,$in0,12 # >>32
|
|
vperm $outtail,$in0,$in0,$outperm # rotate
|
|
vsel $stage,$outhead,$outtail,$outmask
|
|
vmr $outhead,$outtail
|
|
vcipherlast $key,$key,$rcon
|
|
stvx $stage,0,$out
|
|
addi $out,$out,16
|
|
|
|
vxor $in0,$in0,$tmp
|
|
vsldoi $tmp,$zero,$tmp,12 # >>32
|
|
vxor $in0,$in0,$tmp
|
|
vsldoi $tmp,$zero,$tmp,12 # >>32
|
|
vxor $in0,$in0,$tmp
|
|
vxor $in0,$in0,$key
|
|
vperm $outtail,$in0,$in0,$outperm # rotate
|
|
vsel $stage,$outhead,$outtail,$outmask
|
|
vmr $outhead,$outtail
|
|
stvx $stage,0,$out
|
|
|
|
addi $inp,$out,15 # 15 is not typo
|
|
addi $out,$out,0x50
|
|
|
|
li $rounds,10
|
|
b Ldone
|
|
|
|
.align 4
|
|
L192:
|
|
lvx $tmp,0,$inp
|
|
li $cnt,4
|
|
vperm $outtail,$in0,$in0,$outperm # rotate
|
|
vsel $stage,$outhead,$outtail,$outmask
|
|
vmr $outhead,$outtail
|
|
stvx $stage,0,$out
|
|
addi $out,$out,16
|
|
vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
|
|
vspltisb $key,8 # borrow $key
|
|
mtctr $cnt
|
|
vsububm $mask,$mask,$key # adjust the mask
|
|
|
|
Loop192:
|
|
vperm $key,$in1,$in1,$mask # roate-n-splat
|
|
vsldoi $tmp,$zero,$in0,12 # >>32
|
|
vcipherlast $key,$key,$rcon
|
|
|
|
vxor $in0,$in0,$tmp
|
|
vsldoi $tmp,$zero,$tmp,12 # >>32
|
|
vxor $in0,$in0,$tmp
|
|
vsldoi $tmp,$zero,$tmp,12 # >>32
|
|
vxor $in0,$in0,$tmp
|
|
|
|
vsldoi $stage,$zero,$in1,8
|
|
vspltw $tmp,$in0,3
|
|
vxor $tmp,$tmp,$in1
|
|
vsldoi $in1,$zero,$in1,12 # >>32
|
|
vadduwm $rcon,$rcon,$rcon
|
|
vxor $in1,$in1,$tmp
|
|
vxor $in0,$in0,$key
|
|
vxor $in1,$in1,$key
|
|
vsldoi $stage,$stage,$in0,8
|
|
|
|
vperm $key,$in1,$in1,$mask # rotate-n-splat
|
|
vsldoi $tmp,$zero,$in0,12 # >>32
|
|
vperm $outtail,$stage,$stage,$outperm # rotate
|
|
vsel $stage,$outhead,$outtail,$outmask
|
|
vmr $outhead,$outtail
|
|
vcipherlast $key,$key,$rcon
|
|
stvx $stage,0,$out
|
|
addi $out,$out,16
|
|
|
|
vsldoi $stage,$in0,$in1,8
|
|
vxor $in0,$in0,$tmp
|
|
vsldoi $tmp,$zero,$tmp,12 # >>32
|
|
vperm $outtail,$stage,$stage,$outperm # rotate
|
|
vsel $stage,$outhead,$outtail,$outmask
|
|
vmr $outhead,$outtail
|
|
vxor $in0,$in0,$tmp
|
|
vsldoi $tmp,$zero,$tmp,12 # >>32
|
|
vxor $in0,$in0,$tmp
|
|
stvx $stage,0,$out
|
|
addi $out,$out,16
|
|
|
|
vspltw $tmp,$in0,3
|
|
vxor $tmp,$tmp,$in1
|
|
vsldoi $in1,$zero,$in1,12 # >>32
|
|
vadduwm $rcon,$rcon,$rcon
|
|
vxor $in1,$in1,$tmp
|
|
vxor $in0,$in0,$key
|
|
vxor $in1,$in1,$key
|
|
vperm $outtail,$in0,$in0,$outperm # rotate
|
|
vsel $stage,$outhead,$outtail,$outmask
|
|
vmr $outhead,$outtail
|
|
stvx $stage,0,$out
|
|
addi $inp,$out,15 # 15 is not typo
|
|
addi $out,$out,16
|
|
bdnz Loop192
|
|
|
|
li $rounds,12
|
|
addi $out,$out,0x20
|
|
b Ldone
|
|
|
|
.align 4
|
|
L256:
|
|
lvx $tmp,0,$inp
|
|
li $cnt,7
|
|
li $rounds,14
|
|
vperm $outtail,$in0,$in0,$outperm # rotate
|
|
vsel $stage,$outhead,$outtail,$outmask
|
|
vmr $outhead,$outtail
|
|
stvx $stage,0,$out
|
|
addi $out,$out,16
|
|
vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
|
|
mtctr $cnt
|
|
|
|
Loop256:
|
|
vperm $key,$in1,$in1,$mask # rotate-n-splat
|
|
vsldoi $tmp,$zero,$in0,12 # >>32
|
|
vperm $outtail,$in1,$in1,$outperm # rotate
|
|
vsel $stage,$outhead,$outtail,$outmask
|
|
vmr $outhead,$outtail
|
|
vcipherlast $key,$key,$rcon
|
|
stvx $stage,0,$out
|
|
addi $out,$out,16
|
|
|
|
vxor $in0,$in0,$tmp
|
|
vsldoi $tmp,$zero,$tmp,12 # >>32
|
|
vxor $in0,$in0,$tmp
|
|
vsldoi $tmp,$zero,$tmp,12 # >>32
|
|
vxor $in0,$in0,$tmp
|
|
vadduwm $rcon,$rcon,$rcon
|
|
vxor $in0,$in0,$key
|
|
vperm $outtail,$in0,$in0,$outperm # rotate
|
|
vsel $stage,$outhead,$outtail,$outmask
|
|
vmr $outhead,$outtail
|
|
stvx $stage,0,$out
|
|
addi $inp,$out,15 # 15 is not typo
|
|
addi $out,$out,16
|
|
bdz Ldone
|
|
|
|
vspltw $key,$in0,3 # just splat
|
|
vsldoi $tmp,$zero,$in1,12 # >>32
|
|
vsbox $key,$key
|
|
|
|
vxor $in1,$in1,$tmp
|
|
vsldoi $tmp,$zero,$tmp,12 # >>32
|
|
vxor $in1,$in1,$tmp
|
|
vsldoi $tmp,$zero,$tmp,12 # >>32
|
|
vxor $in1,$in1,$tmp
|
|
|
|
vxor $in1,$in1,$key
|
|
b Loop256
|
|
|
|
.align 4
|
|
Ldone:
|
|
lvx $in1,0,$inp # redundant in aligned case
|
|
vsel $in1,$outhead,$in1,$outmask
|
|
stvx $in1,0,$inp
|
|
li $ptr,0
|
|
mtspr 256,$vrsave
|
|
stw $rounds,0($out)
|
|
|
|
Lenc_key_abort:
|
|
mr r3,$ptr
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x14,1,0,0,3,0
|
|
.long 0
|
|
.size .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
|
|
|
|
.globl .${prefix}_set_decrypt_key
|
|
$STU $sp,-$FRAME($sp)
|
|
mflr r10
|
|
$PUSH r10,$FRAME+$LRSAVE($sp)
|
|
bl Lset_encrypt_key
|
|
mtlr r10
|
|
|
|
cmpwi r3,0
|
|
bne- Ldec_key_abort
|
|
|
|
slwi $cnt,$rounds,4
|
|
subi $inp,$out,240 # first round key
|
|
srwi $rounds,$rounds,1
|
|
add $out,$inp,$cnt # last round key
|
|
mtctr $rounds
|
|
|
|
Ldeckey:
|
|
lwz r0, 0($inp)
|
|
lwz r6, 4($inp)
|
|
lwz r7, 8($inp)
|
|
lwz r8, 12($inp)
|
|
addi $inp,$inp,16
|
|
lwz r9, 0($out)
|
|
lwz r10,4($out)
|
|
lwz r11,8($out)
|
|
lwz r12,12($out)
|
|
stw r0, 0($out)
|
|
stw r6, 4($out)
|
|
stw r7, 8($out)
|
|
stw r8, 12($out)
|
|
subi $out,$out,16
|
|
stw r9, -16($inp)
|
|
stw r10,-12($inp)
|
|
stw r11,-8($inp)
|
|
stw r12,-4($inp)
|
|
bdnz Ldeckey
|
|
|
|
xor r3,r3,r3 # return value
|
|
Ldec_key_abort:
|
|
addi $sp,$sp,$FRAME
|
|
blr
|
|
.long 0
|
|
.byte 0,12,4,1,0x80,0,3,0
|
|
.long 0
|
|
.size .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
|
|
___
|
|
}}}
|
|
#########################################################################
|
|
{{{ # Single block en- and decrypt procedures #
|
|
sub gen_block () {
|
|
my $dir = shift;
|
|
my $n = $dir eq "de" ? "n" : "";
|
|
my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
|
|
|
|
$code.=<<___;
|
|
.globl .${prefix}_${dir}crypt
|
|
lwz $rounds,240($key)
|
|
lis r0,0xfc00
|
|
mfspr $vrsave,256
|
|
li $idx,15 # 15 is not typo
|
|
mtspr 256,r0
|
|
|
|
lvx v0,0,$inp
|
|
neg r11,$out
|
|
lvx v1,$idx,$inp
|
|
lvsl v2,0,$inp # inpperm
|
|
le?vspltisb v4,0x0f
|
|
?lvsl v3,0,r11 # outperm
|
|
le?vxor v2,v2,v4
|
|
li $idx,16
|
|
vperm v0,v0,v1,v2 # align [and byte swap in LE]
|
|
lvx v1,0,$key
|
|
?lvsl v5,0,$key # keyperm
|
|
srwi $rounds,$rounds,1
|
|
lvx v2,$idx,$key
|
|
addi $idx,$idx,16
|
|
subi $rounds,$rounds,1
|
|
?vperm v1,v1,v2,v5 # align round key
|
|
|
|
vxor v0,v0,v1
|
|
lvx v1,$idx,$key
|
|
addi $idx,$idx,16
|
|
mtctr $rounds
|
|
|
|
Loop_${dir}c:
|
|
?vperm v2,v2,v1,v5
|
|
v${n}cipher v0,v0,v2
|
|
lvx v2,$idx,$key
|
|
addi $idx,$idx,16
|
|
?vperm v1,v1,v2,v5
|
|
v${n}cipher v0,v0,v1
|
|
lvx v1,$idx,$key
|
|
addi $idx,$idx,16
|
|
bdnz Loop_${dir}c
|
|
|
|
?vperm v2,v2,v1,v5
|
|
v${n}cipher v0,v0,v2
|
|
lvx v2,$idx,$key
|
|
?vperm v1,v1,v2,v5
|
|
v${n}cipherlast v0,v0,v1
|
|
|
|
vspltisb v2,-1
|
|
vxor v1,v1,v1
|
|
li $idx,15 # 15 is not typo
|
|
?vperm v2,v1,v2,v3 # outmask
|
|
le?vxor v3,v3,v4
|
|
lvx v1,0,$out # outhead
|
|
vperm v0,v0,v0,v3 # rotate [and byte swap in LE]
|
|
vsel v1,v1,v0,v2
|
|
lvx v4,$idx,$out
|
|
stvx v1,0,$out
|
|
vsel v0,v0,v4,v2
|
|
stvx v0,$idx,$out
|
|
|
|
mtspr 256,$vrsave
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x14,0,0,0,3,0
|
|
.long 0
|
|
.size .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
|
|
___
|
|
}
|
|
&gen_block("en");
|
|
&gen_block("de");
|
|
}}}
|
|
#########################################################################
|
|
{{{ # CBC en- and decrypt procedures #
|
|
my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
|
|
my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3));
|
|
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
|
|
map("v$_",(4..10));
|
|
$code.=<<___;
|
|
.globl .${prefix}_cbc_encrypt
|
|
${UCMP}i $len,16
|
|
bltlr-
|
|
|
|
cmpwi $enc,0 # test direction
|
|
lis r0,0xffe0
|
|
mfspr $vrsave,256
|
|
mtspr 256,r0
|
|
|
|
li $idx,15
|
|
vxor $rndkey0,$rndkey0,$rndkey0
|
|
le?vspltisb $tmp,0x0f
|
|
|
|
lvx $ivec,0,$ivp # load [unaligned] iv
|
|
lvsl $inpperm,0,$ivp
|
|
lvx $inptail,$idx,$ivp
|
|
le?vxor $inpperm,$inpperm,$tmp
|
|
vperm $ivec,$ivec,$inptail,$inpperm
|
|
|
|
neg r11,$inp
|
|
?lvsl $keyperm,0,$key # prepare for unaligned key
|
|
lwz $rounds,240($key)
|
|
|
|
lvsr $inpperm,0,r11 # prepare for unaligned load
|
|
lvx $inptail,0,$inp
|
|
addi $inp,$inp,15 # 15 is not typo
|
|
le?vxor $inpperm,$inpperm,$tmp
|
|
|
|
?lvsr $outperm,0,$out # prepare for unaligned store
|
|
vspltisb $outmask,-1
|
|
lvx $outhead,0,$out
|
|
?vperm $outmask,$rndkey0,$outmask,$outperm
|
|
le?vxor $outperm,$outperm,$tmp
|
|
|
|
srwi $rounds,$rounds,1
|
|
li $idx,16
|
|
subi $rounds,$rounds,1
|
|
beq Lcbc_dec
|
|
|
|
Lcbc_enc:
|
|
vmr $inout,$inptail
|
|
lvx $inptail,0,$inp
|
|
addi $inp,$inp,16
|
|
mtctr $rounds
|
|
subi $len,$len,16 # len-=16
|
|
|
|
lvx $rndkey0,0,$key
|
|
vperm $inout,$inout,$inptail,$inpperm
|
|
lvx $rndkey1,$idx,$key
|
|
addi $idx,$idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vxor $inout,$inout,$rndkey0
|
|
lvx $rndkey0,$idx,$key
|
|
addi $idx,$idx,16
|
|
vxor $inout,$inout,$ivec
|
|
|
|
Loop_cbc_enc:
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vcipher $inout,$inout,$rndkey1
|
|
lvx $rndkey1,$idx,$key
|
|
addi $idx,$idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vcipher $inout,$inout,$rndkey0
|
|
lvx $rndkey0,$idx,$key
|
|
addi $idx,$idx,16
|
|
bdnz Loop_cbc_enc
|
|
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vcipher $inout,$inout,$rndkey1
|
|
lvx $rndkey1,$idx,$key
|
|
li $idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vcipherlast $ivec,$inout,$rndkey0
|
|
${UCMP}i $len,16
|
|
|
|
vperm $tmp,$ivec,$ivec,$outperm
|
|
vsel $inout,$outhead,$tmp,$outmask
|
|
vmr $outhead,$tmp
|
|
stvx $inout,0,$out
|
|
addi $out,$out,16
|
|
bge Lcbc_enc
|
|
|
|
b Lcbc_done
|
|
|
|
.align 4
|
|
Lcbc_dec:
|
|
${UCMP}i $len,128
|
|
bge _aesp8_cbc_decrypt8x
|
|
vmr $tmp,$inptail
|
|
lvx $inptail,0,$inp
|
|
addi $inp,$inp,16
|
|
mtctr $rounds
|
|
subi $len,$len,16 # len-=16
|
|
|
|
lvx $rndkey0,0,$key
|
|
vperm $tmp,$tmp,$inptail,$inpperm
|
|
lvx $rndkey1,$idx,$key
|
|
addi $idx,$idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vxor $inout,$tmp,$rndkey0
|
|
lvx $rndkey0,$idx,$key
|
|
addi $idx,$idx,16
|
|
|
|
Loop_cbc_dec:
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vncipher $inout,$inout,$rndkey1
|
|
lvx $rndkey1,$idx,$key
|
|
addi $idx,$idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vncipher $inout,$inout,$rndkey0
|
|
lvx $rndkey0,$idx,$key
|
|
addi $idx,$idx,16
|
|
bdnz Loop_cbc_dec
|
|
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vncipher $inout,$inout,$rndkey1
|
|
lvx $rndkey1,$idx,$key
|
|
li $idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vncipherlast $inout,$inout,$rndkey0
|
|
${UCMP}i $len,16
|
|
|
|
vxor $inout,$inout,$ivec
|
|
vmr $ivec,$tmp
|
|
vperm $tmp,$inout,$inout,$outperm
|
|
vsel $inout,$outhead,$tmp,$outmask
|
|
vmr $outhead,$tmp
|
|
stvx $inout,0,$out
|
|
addi $out,$out,16
|
|
bge Lcbc_dec
|
|
|
|
Lcbc_done:
|
|
addi $out,$out,-1
|
|
lvx $inout,0,$out # redundant in aligned case
|
|
vsel $inout,$outhead,$inout,$outmask
|
|
stvx $inout,0,$out
|
|
|
|
neg $enc,$ivp # write [unaligned] iv
|
|
li $idx,15 # 15 is not typo
|
|
vxor $rndkey0,$rndkey0,$rndkey0
|
|
vspltisb $outmask,-1
|
|
le?vspltisb $tmp,0x0f
|
|
?lvsl $outperm,0,$enc
|
|
?vperm $outmask,$rndkey0,$outmask,$outperm
|
|
le?vxor $outperm,$outperm,$tmp
|
|
lvx $outhead,0,$ivp
|
|
vperm $ivec,$ivec,$ivec,$outperm
|
|
vsel $inout,$outhead,$ivec,$outmask
|
|
lvx $inptail,$idx,$ivp
|
|
stvx $inout,0,$ivp
|
|
vsel $inout,$ivec,$inptail,$outmask
|
|
stvx $inout,$idx,$ivp
|
|
|
|
mtspr 256,$vrsave
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x14,0,0,0,6,0
|
|
.long 0
|
|
___
|
|
#########################################################################
|
|
{{ # Optimized CBC decrypt procedure #
|
|
my $key_="r11";
|
|
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
|
|
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
|
|
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
|
|
my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
|
|
# v26-v31 last 6 round keys
|
|
my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
|
|
|
|
$code.=<<___;
|
|
.align 5
|
|
_aesp8_cbc_decrypt8x:
|
|
$STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
|
|
li r10,`$FRAME+8*16+15`
|
|
li r11,`$FRAME+8*16+31`
|
|
stvx v20,r10,$sp # ABI says so
|
|
addi r10,r10,32
|
|
stvx v21,r11,$sp
|
|
addi r11,r11,32
|
|
stvx v22,r10,$sp
|
|
addi r10,r10,32
|
|
stvx v23,r11,$sp
|
|
addi r11,r11,32
|
|
stvx v24,r10,$sp
|
|
addi r10,r10,32
|
|
stvx v25,r11,$sp
|
|
addi r11,r11,32
|
|
stvx v26,r10,$sp
|
|
addi r10,r10,32
|
|
stvx v27,r11,$sp
|
|
addi r11,r11,32
|
|
stvx v28,r10,$sp
|
|
addi r10,r10,32
|
|
stvx v29,r11,$sp
|
|
addi r11,r11,32
|
|
stvx v30,r10,$sp
|
|
stvx v31,r11,$sp
|
|
li r0,-1
|
|
stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
|
|
li $x10,0x10
|
|
$PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
|
|
li $x20,0x20
|
|
$PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
|
|
li $x30,0x30
|
|
$PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
|
|
li $x40,0x40
|
|
$PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
|
|
li $x50,0x50
|
|
$PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
|
|
li $x60,0x60
|
|
$PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
|
|
li $x70,0x70
|
|
mtspr 256,r0
|
|
|
|
subi $rounds,$rounds,3 # -4 in total
|
|
subi $len,$len,128 # bias
|
|
|
|
lvx $rndkey0,$x00,$key # load key schedule
|
|
lvx v30,$x10,$key
|
|
addi $key,$key,0x20
|
|
lvx v31,$x00,$key
|
|
?vperm $rndkey0,$rndkey0,v30,$keyperm
|
|
addi $key_,$sp,$FRAME+15
|
|
mtctr $rounds
|
|
|
|
Load_cbc_dec_key:
|
|
?vperm v24,v30,v31,$keyperm
|
|
lvx v30,$x10,$key
|
|
addi $key,$key,0x20
|
|
stvx v24,$x00,$key_ # off-load round[1]
|
|
?vperm v25,v31,v30,$keyperm
|
|
lvx v31,$x00,$key
|
|
stvx v25,$x10,$key_ # off-load round[2]
|
|
addi $key_,$key_,0x20
|
|
bdnz Load_cbc_dec_key
|
|
|
|
lvx v26,$x10,$key
|
|
?vperm v24,v30,v31,$keyperm
|
|
lvx v27,$x20,$key
|
|
stvx v24,$x00,$key_ # off-load round[3]
|
|
?vperm v25,v31,v26,$keyperm
|
|
lvx v28,$x30,$key
|
|
stvx v25,$x10,$key_ # off-load round[4]
|
|
addi $key_,$sp,$FRAME+15 # rewind $key_
|
|
?vperm v26,v26,v27,$keyperm
|
|
lvx v29,$x40,$key
|
|
?vperm v27,v27,v28,$keyperm
|
|
lvx v30,$x50,$key
|
|
?vperm v28,v28,v29,$keyperm
|
|
lvx v31,$x60,$key
|
|
?vperm v29,v29,v30,$keyperm
|
|
lvx $out0,$x70,$key # borrow $out0
|
|
?vperm v30,v30,v31,$keyperm
|
|
lvx v24,$x00,$key_ # pre-load round[1]
|
|
?vperm v31,v31,$out0,$keyperm
|
|
lvx v25,$x10,$key_ # pre-load round[2]
|
|
|
|
#lvx $inptail,0,$inp # "caller" already did this
|
|
#addi $inp,$inp,15 # 15 is not typo
|
|
subi $inp,$inp,15 # undo "caller"
|
|
|
|
le?li $idx,8
|
|
lvx_u $in0,$x00,$inp # load first 8 "words"
|
|
le?lvsl $inpperm,0,$idx
|
|
le?vspltisb $tmp,0x0f
|
|
lvx_u $in1,$x10,$inp
|
|
le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u
|
|
lvx_u $in2,$x20,$inp
|
|
le?vperm $in0,$in0,$in0,$inpperm
|
|
lvx_u $in3,$x30,$inp
|
|
le?vperm $in1,$in1,$in1,$inpperm
|
|
lvx_u $in4,$x40,$inp
|
|
le?vperm $in2,$in2,$in2,$inpperm
|
|
vxor $out0,$in0,$rndkey0
|
|
lvx_u $in5,$x50,$inp
|
|
le?vperm $in3,$in3,$in3,$inpperm
|
|
vxor $out1,$in1,$rndkey0
|
|
lvx_u $in6,$x60,$inp
|
|
le?vperm $in4,$in4,$in4,$inpperm
|
|
vxor $out2,$in2,$rndkey0
|
|
lvx_u $in7,$x70,$inp
|
|
addi $inp,$inp,0x80
|
|
le?vperm $in5,$in5,$in5,$inpperm
|
|
vxor $out3,$in3,$rndkey0
|
|
le?vperm $in6,$in6,$in6,$inpperm
|
|
vxor $out4,$in4,$rndkey0
|
|
le?vperm $in7,$in7,$in7,$inpperm
|
|
vxor $out5,$in5,$rndkey0
|
|
vxor $out6,$in6,$rndkey0
|
|
vxor $out7,$in7,$rndkey0
|
|
|
|
mtctr $rounds
|
|
b Loop_cbc_dec8x
|
|
.align 5
|
|
Loop_cbc_dec8x:
|
|
vncipher $out0,$out0,v24
|
|
vncipher $out1,$out1,v24
|
|
vncipher $out2,$out2,v24
|
|
vncipher $out3,$out3,v24
|
|
vncipher $out4,$out4,v24
|
|
vncipher $out5,$out5,v24
|
|
vncipher $out6,$out6,v24
|
|
vncipher $out7,$out7,v24
|
|
lvx v24,$x20,$key_ # round[3]
|
|
addi $key_,$key_,0x20
|
|
|
|
vncipher $out0,$out0,v25
|
|
vncipher $out1,$out1,v25
|
|
vncipher $out2,$out2,v25
|
|
vncipher $out3,$out3,v25
|
|
vncipher $out4,$out4,v25
|
|
vncipher $out5,$out5,v25
|
|
vncipher $out6,$out6,v25
|
|
vncipher $out7,$out7,v25
|
|
lvx v25,$x10,$key_ # round[4]
|
|
bdnz Loop_cbc_dec8x
|
|
|
|
subic $len,$len,128 # $len-=128
|
|
vncipher $out0,$out0,v24
|
|
vncipher $out1,$out1,v24
|
|
vncipher $out2,$out2,v24
|
|
vncipher $out3,$out3,v24
|
|
vncipher $out4,$out4,v24
|
|
vncipher $out5,$out5,v24
|
|
vncipher $out6,$out6,v24
|
|
vncipher $out7,$out7,v24
|
|
|
|
subfe. r0,r0,r0 # borrow?-1:0
|
|
vncipher $out0,$out0,v25
|
|
vncipher $out1,$out1,v25
|
|
vncipher $out2,$out2,v25
|
|
vncipher $out3,$out3,v25
|
|
vncipher $out4,$out4,v25
|
|
vncipher $out5,$out5,v25
|
|
vncipher $out6,$out6,v25
|
|
vncipher $out7,$out7,v25
|
|
|
|
and r0,r0,$len
|
|
vncipher $out0,$out0,v26
|
|
vncipher $out1,$out1,v26
|
|
vncipher $out2,$out2,v26
|
|
vncipher $out3,$out3,v26
|
|
vncipher $out4,$out4,v26
|
|
vncipher $out5,$out5,v26
|
|
vncipher $out6,$out6,v26
|
|
vncipher $out7,$out7,v26
|
|
|
|
add $inp,$inp,r0 # $inp is adjusted in such
|
|
# way that at exit from the
|
|
# loop inX-in7 are loaded
|
|
# with last "words"
|
|
vncipher $out0,$out0,v27
|
|
vncipher $out1,$out1,v27
|
|
vncipher $out2,$out2,v27
|
|
vncipher $out3,$out3,v27
|
|
vncipher $out4,$out4,v27
|
|
vncipher $out5,$out5,v27
|
|
vncipher $out6,$out6,v27
|
|
vncipher $out7,$out7,v27
|
|
|
|
addi $key_,$sp,$FRAME+15 # rewind $key_
|
|
vncipher $out0,$out0,v28
|
|
vncipher $out1,$out1,v28
|
|
vncipher $out2,$out2,v28
|
|
vncipher $out3,$out3,v28
|
|
vncipher $out4,$out4,v28
|
|
vncipher $out5,$out5,v28
|
|
vncipher $out6,$out6,v28
|
|
vncipher $out7,$out7,v28
|
|
lvx v24,$x00,$key_ # re-pre-load round[1]
|
|
|
|
vncipher $out0,$out0,v29
|
|
vncipher $out1,$out1,v29
|
|
vncipher $out2,$out2,v29
|
|
vncipher $out3,$out3,v29
|
|
vncipher $out4,$out4,v29
|
|
vncipher $out5,$out5,v29
|
|
vncipher $out6,$out6,v29
|
|
vncipher $out7,$out7,v29
|
|
lvx v25,$x10,$key_ # re-pre-load round[2]
|
|
|
|
vncipher $out0,$out0,v30
|
|
vxor $ivec,$ivec,v31 # xor with last round key
|
|
vncipher $out1,$out1,v30
|
|
vxor $in0,$in0,v31
|
|
vncipher $out2,$out2,v30
|
|
vxor $in1,$in1,v31
|
|
vncipher $out3,$out3,v30
|
|
vxor $in2,$in2,v31
|
|
vncipher $out4,$out4,v30
|
|
vxor $in3,$in3,v31
|
|
vncipher $out5,$out5,v30
|
|
vxor $in4,$in4,v31
|
|
vncipher $out6,$out6,v30
|
|
vxor $in5,$in5,v31
|
|
vncipher $out7,$out7,v30
|
|
vxor $in6,$in6,v31
|
|
|
|
vncipherlast $out0,$out0,$ivec
|
|
vncipherlast $out1,$out1,$in0
|
|
lvx_u $in0,$x00,$inp # load next input block
|
|
vncipherlast $out2,$out2,$in1
|
|
lvx_u $in1,$x10,$inp
|
|
vncipherlast $out3,$out3,$in2
|
|
le?vperm $in0,$in0,$in0,$inpperm
|
|
lvx_u $in2,$x20,$inp
|
|
vncipherlast $out4,$out4,$in3
|
|
le?vperm $in1,$in1,$in1,$inpperm
|
|
lvx_u $in3,$x30,$inp
|
|
vncipherlast $out5,$out5,$in4
|
|
le?vperm $in2,$in2,$in2,$inpperm
|
|
lvx_u $in4,$x40,$inp
|
|
vncipherlast $out6,$out6,$in5
|
|
le?vperm $in3,$in3,$in3,$inpperm
|
|
lvx_u $in5,$x50,$inp
|
|
vncipherlast $out7,$out7,$in6
|
|
le?vperm $in4,$in4,$in4,$inpperm
|
|
lvx_u $in6,$x60,$inp
|
|
vmr $ivec,$in7
|
|
le?vperm $in5,$in5,$in5,$inpperm
|
|
lvx_u $in7,$x70,$inp
|
|
addi $inp,$inp,0x80
|
|
|
|
le?vperm $out0,$out0,$out0,$inpperm
|
|
le?vperm $out1,$out1,$out1,$inpperm
|
|
stvx_u $out0,$x00,$out
|
|
le?vperm $in6,$in6,$in6,$inpperm
|
|
vxor $out0,$in0,$rndkey0
|
|
le?vperm $out2,$out2,$out2,$inpperm
|
|
stvx_u $out1,$x10,$out
|
|
le?vperm $in7,$in7,$in7,$inpperm
|
|
vxor $out1,$in1,$rndkey0
|
|
le?vperm $out3,$out3,$out3,$inpperm
|
|
stvx_u $out2,$x20,$out
|
|
vxor $out2,$in2,$rndkey0
|
|
le?vperm $out4,$out4,$out4,$inpperm
|
|
stvx_u $out3,$x30,$out
|
|
vxor $out3,$in3,$rndkey0
|
|
le?vperm $out5,$out5,$out5,$inpperm
|
|
stvx_u $out4,$x40,$out
|
|
vxor $out4,$in4,$rndkey0
|
|
le?vperm $out6,$out6,$out6,$inpperm
|
|
stvx_u $out5,$x50,$out
|
|
vxor $out5,$in5,$rndkey0
|
|
le?vperm $out7,$out7,$out7,$inpperm
|
|
stvx_u $out6,$x60,$out
|
|
vxor $out6,$in6,$rndkey0
|
|
stvx_u $out7,$x70,$out
|
|
addi $out,$out,0x80
|
|
vxor $out7,$in7,$rndkey0
|
|
|
|
mtctr $rounds
|
|
beq Loop_cbc_dec8x # did $len-=128 borrow?
|
|
|
|
addic. $len,$len,128
|
|
beq Lcbc_dec8x_done
|
|
nop
|
|
nop
|
|
|
|
Loop_cbc_dec8x_tail: # up to 7 "words" tail...
|
|
vncipher $out1,$out1,v24
|
|
vncipher $out2,$out2,v24
|
|
vncipher $out3,$out3,v24
|
|
vncipher $out4,$out4,v24
|
|
vncipher $out5,$out5,v24
|
|
vncipher $out6,$out6,v24
|
|
vncipher $out7,$out7,v24
|
|
lvx v24,$x20,$key_ # round[3]
|
|
addi $key_,$key_,0x20
|
|
|
|
vncipher $out1,$out1,v25
|
|
vncipher $out2,$out2,v25
|
|
vncipher $out3,$out3,v25
|
|
vncipher $out4,$out4,v25
|
|
vncipher $out5,$out5,v25
|
|
vncipher $out6,$out6,v25
|
|
vncipher $out7,$out7,v25
|
|
lvx v25,$x10,$key_ # round[4]
|
|
bdnz Loop_cbc_dec8x_tail
|
|
|
|
vncipher $out1,$out1,v24
|
|
vncipher $out2,$out2,v24
|
|
vncipher $out3,$out3,v24
|
|
vncipher $out4,$out4,v24
|
|
vncipher $out5,$out5,v24
|
|
vncipher $out6,$out6,v24
|
|
vncipher $out7,$out7,v24
|
|
|
|
vncipher $out1,$out1,v25
|
|
vncipher $out2,$out2,v25
|
|
vncipher $out3,$out3,v25
|
|
vncipher $out4,$out4,v25
|
|
vncipher $out5,$out5,v25
|
|
vncipher $out6,$out6,v25
|
|
vncipher $out7,$out7,v25
|
|
|
|
vncipher $out1,$out1,v26
|
|
vncipher $out2,$out2,v26
|
|
vncipher $out3,$out3,v26
|
|
vncipher $out4,$out4,v26
|
|
vncipher $out5,$out5,v26
|
|
vncipher $out6,$out6,v26
|
|
vncipher $out7,$out7,v26
|
|
|
|
vncipher $out1,$out1,v27
|
|
vncipher $out2,$out2,v27
|
|
vncipher $out3,$out3,v27
|
|
vncipher $out4,$out4,v27
|
|
vncipher $out5,$out5,v27
|
|
vncipher $out6,$out6,v27
|
|
vncipher $out7,$out7,v27
|
|
|
|
vncipher $out1,$out1,v28
|
|
vncipher $out2,$out2,v28
|
|
vncipher $out3,$out3,v28
|
|
vncipher $out4,$out4,v28
|
|
vncipher $out5,$out5,v28
|
|
vncipher $out6,$out6,v28
|
|
vncipher $out7,$out7,v28
|
|
|
|
vncipher $out1,$out1,v29
|
|
vncipher $out2,$out2,v29
|
|
vncipher $out3,$out3,v29
|
|
vncipher $out4,$out4,v29
|
|
vncipher $out5,$out5,v29
|
|
vncipher $out6,$out6,v29
|
|
vncipher $out7,$out7,v29
|
|
|
|
vncipher $out1,$out1,v30
|
|
vxor $ivec,$ivec,v31 # last round key
|
|
vncipher $out2,$out2,v30
|
|
vxor $in1,$in1,v31
|
|
vncipher $out3,$out3,v30
|
|
vxor $in2,$in2,v31
|
|
vncipher $out4,$out4,v30
|
|
vxor $in3,$in3,v31
|
|
vncipher $out5,$out5,v30
|
|
vxor $in4,$in4,v31
|
|
vncipher $out6,$out6,v30
|
|
vxor $in5,$in5,v31
|
|
vncipher $out7,$out7,v30
|
|
vxor $in6,$in6,v31
|
|
|
|
cmplwi $len,32 # switch($len)
|
|
blt Lcbc_dec8x_one
|
|
nop
|
|
beq Lcbc_dec8x_two
|
|
cmplwi $len,64
|
|
blt Lcbc_dec8x_three
|
|
nop
|
|
beq Lcbc_dec8x_four
|
|
cmplwi $len,96
|
|
blt Lcbc_dec8x_five
|
|
nop
|
|
beq Lcbc_dec8x_six
|
|
|
|
Lcbc_dec8x_seven:
|
|
vncipherlast $out1,$out1,$ivec
|
|
vncipherlast $out2,$out2,$in1
|
|
vncipherlast $out3,$out3,$in2
|
|
vncipherlast $out4,$out4,$in3
|
|
vncipherlast $out5,$out5,$in4
|
|
vncipherlast $out6,$out6,$in5
|
|
vncipherlast $out7,$out7,$in6
|
|
vmr $ivec,$in7
|
|
|
|
le?vperm $out1,$out1,$out1,$inpperm
|
|
le?vperm $out2,$out2,$out2,$inpperm
|
|
stvx_u $out1,$x00,$out
|
|
le?vperm $out3,$out3,$out3,$inpperm
|
|
stvx_u $out2,$x10,$out
|
|
le?vperm $out4,$out4,$out4,$inpperm
|
|
stvx_u $out3,$x20,$out
|
|
le?vperm $out5,$out5,$out5,$inpperm
|
|
stvx_u $out4,$x30,$out
|
|
le?vperm $out6,$out6,$out6,$inpperm
|
|
stvx_u $out5,$x40,$out
|
|
le?vperm $out7,$out7,$out7,$inpperm
|
|
stvx_u $out6,$x50,$out
|
|
stvx_u $out7,$x60,$out
|
|
addi $out,$out,0x70
|
|
b Lcbc_dec8x_done
|
|
|
|
.align 5
|
|
Lcbc_dec8x_six:
|
|
vncipherlast $out2,$out2,$ivec
|
|
vncipherlast $out3,$out3,$in2
|
|
vncipherlast $out4,$out4,$in3
|
|
vncipherlast $out5,$out5,$in4
|
|
vncipherlast $out6,$out6,$in5
|
|
vncipherlast $out7,$out7,$in6
|
|
vmr $ivec,$in7
|
|
|
|
le?vperm $out2,$out2,$out2,$inpperm
|
|
le?vperm $out3,$out3,$out3,$inpperm
|
|
stvx_u $out2,$x00,$out
|
|
le?vperm $out4,$out4,$out4,$inpperm
|
|
stvx_u $out3,$x10,$out
|
|
le?vperm $out5,$out5,$out5,$inpperm
|
|
stvx_u $out4,$x20,$out
|
|
le?vperm $out6,$out6,$out6,$inpperm
|
|
stvx_u $out5,$x30,$out
|
|
le?vperm $out7,$out7,$out7,$inpperm
|
|
stvx_u $out6,$x40,$out
|
|
stvx_u $out7,$x50,$out
|
|
addi $out,$out,0x60
|
|
b Lcbc_dec8x_done
|
|
|
|
.align 5
|
|
Lcbc_dec8x_five:
|
|
vncipherlast $out3,$out3,$ivec
|
|
vncipherlast $out4,$out4,$in3
|
|
vncipherlast $out5,$out5,$in4
|
|
vncipherlast $out6,$out6,$in5
|
|
vncipherlast $out7,$out7,$in6
|
|
vmr $ivec,$in7
|
|
|
|
le?vperm $out3,$out3,$out3,$inpperm
|
|
le?vperm $out4,$out4,$out4,$inpperm
|
|
stvx_u $out3,$x00,$out
|
|
le?vperm $out5,$out5,$out5,$inpperm
|
|
stvx_u $out4,$x10,$out
|
|
le?vperm $out6,$out6,$out6,$inpperm
|
|
stvx_u $out5,$x20,$out
|
|
le?vperm $out7,$out7,$out7,$inpperm
|
|
stvx_u $out6,$x30,$out
|
|
stvx_u $out7,$x40,$out
|
|
addi $out,$out,0x50
|
|
b Lcbc_dec8x_done
|
|
|
|
.align 5
|
|
Lcbc_dec8x_four:
|
|
vncipherlast $out4,$out4,$ivec
|
|
vncipherlast $out5,$out5,$in4
|
|
vncipherlast $out6,$out6,$in5
|
|
vncipherlast $out7,$out7,$in6
|
|
vmr $ivec,$in7
|
|
|
|
le?vperm $out4,$out4,$out4,$inpperm
|
|
le?vperm $out5,$out5,$out5,$inpperm
|
|
stvx_u $out4,$x00,$out
|
|
le?vperm $out6,$out6,$out6,$inpperm
|
|
stvx_u $out5,$x10,$out
|
|
le?vperm $out7,$out7,$out7,$inpperm
|
|
stvx_u $out6,$x20,$out
|
|
stvx_u $out7,$x30,$out
|
|
addi $out,$out,0x40
|
|
b Lcbc_dec8x_done
|
|
|
|
.align 5
|
|
Lcbc_dec8x_three:
|
|
vncipherlast $out5,$out5,$ivec
|
|
vncipherlast $out6,$out6,$in5
|
|
vncipherlast $out7,$out7,$in6
|
|
vmr $ivec,$in7
|
|
|
|
le?vperm $out5,$out5,$out5,$inpperm
|
|
le?vperm $out6,$out6,$out6,$inpperm
|
|
stvx_u $out5,$x00,$out
|
|
le?vperm $out7,$out7,$out7,$inpperm
|
|
stvx_u $out6,$x10,$out
|
|
stvx_u $out7,$x20,$out
|
|
addi $out,$out,0x30
|
|
b Lcbc_dec8x_done
|
|
|
|
.align 5
|
|
Lcbc_dec8x_two:
|
|
vncipherlast $out6,$out6,$ivec
|
|
vncipherlast $out7,$out7,$in6
|
|
vmr $ivec,$in7
|
|
|
|
le?vperm $out6,$out6,$out6,$inpperm
|
|
le?vperm $out7,$out7,$out7,$inpperm
|
|
stvx_u $out6,$x00,$out
|
|
stvx_u $out7,$x10,$out
|
|
addi $out,$out,0x20
|
|
b Lcbc_dec8x_done
|
|
|
|
.align 5
|
|
Lcbc_dec8x_one:
|
|
vncipherlast $out7,$out7,$ivec
|
|
vmr $ivec,$in7
|
|
|
|
le?vperm $out7,$out7,$out7,$inpperm
|
|
stvx_u $out7,0,$out
|
|
addi $out,$out,0x10
|
|
|
|
Lcbc_dec8x_done:
|
|
le?vperm $ivec,$ivec,$ivec,$inpperm
|
|
stvx_u $ivec,0,$ivp # write [unaligned] iv
|
|
|
|
li r10,`$FRAME+15`
|
|
li r11,`$FRAME+31`
|
|
stvx $inpperm,r10,$sp # wipe copies of round keys
|
|
addi r10,r10,32
|
|
stvx $inpperm,r11,$sp
|
|
addi r11,r11,32
|
|
stvx $inpperm,r10,$sp
|
|
addi r10,r10,32
|
|
stvx $inpperm,r11,$sp
|
|
addi r11,r11,32
|
|
stvx $inpperm,r10,$sp
|
|
addi r10,r10,32
|
|
stvx $inpperm,r11,$sp
|
|
addi r11,r11,32
|
|
stvx $inpperm,r10,$sp
|
|
addi r10,r10,32
|
|
stvx $inpperm,r11,$sp
|
|
addi r11,r11,32
|
|
|
|
mtspr 256,$vrsave
|
|
lvx v20,r10,$sp # ABI says so
|
|
addi r10,r10,32
|
|
lvx v21,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v22,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v23,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v24,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v25,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v26,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v27,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v28,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v29,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v30,r10,$sp
|
|
lvx v31,r11,$sp
|
|
$POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
|
|
$POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
|
|
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
|
|
$POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
|
|
$POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
|
|
$POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
|
|
addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x14,0,0x80,6,6,0
|
|
.long 0
|
|
.size .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
|
|
___
|
|
}} }}}
|
|
|
|
#########################################################################
|
|
{{{ # CTR procedure[s] #
|
|
my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
|
|
my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3));
|
|
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
|
|
map("v$_",(4..11));
|
|
my $dat=$tmp;
|
|
|
|
$code.=<<___;
|
|
.globl .${prefix}_ctr32_encrypt_blocks
|
|
${UCMP}i $len,1
|
|
bltlr-
|
|
|
|
lis r0,0xfff0
|
|
mfspr $vrsave,256
|
|
mtspr 256,r0
|
|
|
|
li $idx,15
|
|
vxor $rndkey0,$rndkey0,$rndkey0
|
|
le?vspltisb $tmp,0x0f
|
|
|
|
lvx $ivec,0,$ivp # load [unaligned] iv
|
|
lvsl $inpperm,0,$ivp
|
|
lvx $inptail,$idx,$ivp
|
|
vspltisb $one,1
|
|
le?vxor $inpperm,$inpperm,$tmp
|
|
vperm $ivec,$ivec,$inptail,$inpperm
|
|
vsldoi $one,$rndkey0,$one,1
|
|
|
|
neg r11,$inp
|
|
?lvsl $keyperm,0,$key # prepare for unaligned key
|
|
lwz $rounds,240($key)
|
|
|
|
lvsr $inpperm,0,r11 # prepare for unaligned load
|
|
lvx $inptail,0,$inp
|
|
addi $inp,$inp,15 # 15 is not typo
|
|
le?vxor $inpperm,$inpperm,$tmp
|
|
|
|
srwi $rounds,$rounds,1
|
|
li $idx,16
|
|
subi $rounds,$rounds,1
|
|
|
|
${UCMP}i $len,8
|
|
bge _aesp8_ctr32_encrypt8x
|
|
|
|
?lvsr $outperm,0,$out # prepare for unaligned store
|
|
vspltisb $outmask,-1
|
|
lvx $outhead,0,$out
|
|
?vperm $outmask,$rndkey0,$outmask,$outperm
|
|
le?vxor $outperm,$outperm,$tmp
|
|
|
|
lvx $rndkey0,0,$key
|
|
mtctr $rounds
|
|
lvx $rndkey1,$idx,$key
|
|
addi $idx,$idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vxor $inout,$ivec,$rndkey0
|
|
lvx $rndkey0,$idx,$key
|
|
addi $idx,$idx,16
|
|
b Loop_ctr32_enc
|
|
|
|
.align 5
|
|
Loop_ctr32_enc:
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vcipher $inout,$inout,$rndkey1
|
|
lvx $rndkey1,$idx,$key
|
|
addi $idx,$idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vcipher $inout,$inout,$rndkey0
|
|
lvx $rndkey0,$idx,$key
|
|
addi $idx,$idx,16
|
|
bdnz Loop_ctr32_enc
|
|
|
|
vadduwm $ivec,$ivec,$one
|
|
vmr $dat,$inptail
|
|
lvx $inptail,0,$inp
|
|
addi $inp,$inp,16
|
|
subic. $len,$len,1 # blocks--
|
|
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vcipher $inout,$inout,$rndkey1
|
|
lvx $rndkey1,$idx,$key
|
|
vperm $dat,$dat,$inptail,$inpperm
|
|
li $idx,16
|
|
?vperm $rndkey1,$rndkey0,$rndkey1,$keyperm
|
|
lvx $rndkey0,0,$key
|
|
vxor $dat,$dat,$rndkey1 # last round key
|
|
vcipherlast $inout,$inout,$dat
|
|
|
|
lvx $rndkey1,$idx,$key
|
|
addi $idx,$idx,16
|
|
vperm $inout,$inout,$inout,$outperm
|
|
vsel $dat,$outhead,$inout,$outmask
|
|
mtctr $rounds
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vmr $outhead,$inout
|
|
vxor $inout,$ivec,$rndkey0
|
|
lvx $rndkey0,$idx,$key
|
|
addi $idx,$idx,16
|
|
stvx $dat,0,$out
|
|
addi $out,$out,16
|
|
bne Loop_ctr32_enc
|
|
|
|
addi $out,$out,-1
|
|
lvx $inout,0,$out # redundant in aligned case
|
|
vsel $inout,$outhead,$inout,$outmask
|
|
stvx $inout,0,$out
|
|
|
|
mtspr 256,$vrsave
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x14,0,0,0,6,0
|
|
.long 0
|
|
___
|
|
#########################################################################
|
|
{{ # Optimized CTR procedure #
|
|
my $key_="r11";
|
|
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
|
|
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
|
|
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
|
|
my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
|
|
# v26-v31 last 6 round keys
|
|
my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
|
|
my ($two,$three,$four)=($outhead,$outperm,$outmask);
|
|
|
|
$code.=<<___;
|
|
.align 5
|
|
_aesp8_ctr32_encrypt8x:
|
|
$STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
|
|
li r10,`$FRAME+8*16+15`
|
|
li r11,`$FRAME+8*16+31`
|
|
stvx v20,r10,$sp # ABI says so
|
|
addi r10,r10,32
|
|
stvx v21,r11,$sp
|
|
addi r11,r11,32
|
|
stvx v22,r10,$sp
|
|
addi r10,r10,32
|
|
stvx v23,r11,$sp
|
|
addi r11,r11,32
|
|
stvx v24,r10,$sp
|
|
addi r10,r10,32
|
|
stvx v25,r11,$sp
|
|
addi r11,r11,32
|
|
stvx v26,r10,$sp
|
|
addi r10,r10,32
|
|
stvx v27,r11,$sp
|
|
addi r11,r11,32
|
|
stvx v28,r10,$sp
|
|
addi r10,r10,32
|
|
stvx v29,r11,$sp
|
|
addi r11,r11,32
|
|
stvx v30,r10,$sp
|
|
stvx v31,r11,$sp
|
|
li r0,-1
|
|
stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
|
|
li $x10,0x10
|
|
$PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
|
|
li $x20,0x20
|
|
$PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
|
|
li $x30,0x30
|
|
$PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
|
|
li $x40,0x40
|
|
$PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
|
|
li $x50,0x50
|
|
$PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
|
|
li $x60,0x60
|
|
$PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
|
|
li $x70,0x70
|
|
mtspr 256,r0
|
|
|
|
subi $rounds,$rounds,3 # -4 in total
|
|
|
|
lvx $rndkey0,$x00,$key # load key schedule
|
|
lvx v30,$x10,$key
|
|
addi $key,$key,0x20
|
|
lvx v31,$x00,$key
|
|
?vperm $rndkey0,$rndkey0,v30,$keyperm
|
|
addi $key_,$sp,$FRAME+15
|
|
mtctr $rounds
|
|
|
|
Load_ctr32_enc_key:
|
|
?vperm v24,v30,v31,$keyperm
|
|
lvx v30,$x10,$key
|
|
addi $key,$key,0x20
|
|
stvx v24,$x00,$key_ # off-load round[1]
|
|
?vperm v25,v31,v30,$keyperm
|
|
lvx v31,$x00,$key
|
|
stvx v25,$x10,$key_ # off-load round[2]
|
|
addi $key_,$key_,0x20
|
|
bdnz Load_ctr32_enc_key
|
|
|
|
lvx v26,$x10,$key
|
|
?vperm v24,v30,v31,$keyperm
|
|
lvx v27,$x20,$key
|
|
stvx v24,$x00,$key_ # off-load round[3]
|
|
?vperm v25,v31,v26,$keyperm
|
|
lvx v28,$x30,$key
|
|
stvx v25,$x10,$key_ # off-load round[4]
|
|
addi $key_,$sp,$FRAME+15 # rewind $key_
|
|
?vperm v26,v26,v27,$keyperm
|
|
lvx v29,$x40,$key
|
|
?vperm v27,v27,v28,$keyperm
|
|
lvx v30,$x50,$key
|
|
?vperm v28,v28,v29,$keyperm
|
|
lvx v31,$x60,$key
|
|
?vperm v29,v29,v30,$keyperm
|
|
lvx $out0,$x70,$key # borrow $out0
|
|
?vperm v30,v30,v31,$keyperm
|
|
lvx v24,$x00,$key_ # pre-load round[1]
|
|
?vperm v31,v31,$out0,$keyperm
|
|
lvx v25,$x10,$key_ # pre-load round[2]
|
|
|
|
vadduqm $two,$one,$one
|
|
subi $inp,$inp,15 # undo "caller"
|
|
$SHL $len,$len,4
|
|
|
|
vadduqm $out1,$ivec,$one # counter values ...
|
|
vadduqm $out2,$ivec,$two
|
|
vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0]
|
|
le?li $idx,8
|
|
vadduqm $out3,$out1,$two
|
|
vxor $out1,$out1,$rndkey0
|
|
le?lvsl $inpperm,0,$idx
|
|
vadduqm $out4,$out2,$two
|
|
vxor $out2,$out2,$rndkey0
|
|
le?vspltisb $tmp,0x0f
|
|
vadduqm $out5,$out3,$two
|
|
vxor $out3,$out3,$rndkey0
|
|
le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u
|
|
vadduqm $out6,$out4,$two
|
|
vxor $out4,$out4,$rndkey0
|
|
vadduqm $out7,$out5,$two
|
|
vxor $out5,$out5,$rndkey0
|
|
vadduqm $ivec,$out6,$two # next counter value
|
|
vxor $out6,$out6,$rndkey0
|
|
vxor $out7,$out7,$rndkey0
|
|
|
|
mtctr $rounds
|
|
b Loop_ctr32_enc8x
|
|
.align 5
|
|
Loop_ctr32_enc8x:
|
|
vcipher $out0,$out0,v24
|
|
vcipher $out1,$out1,v24
|
|
vcipher $out2,$out2,v24
|
|
vcipher $out3,$out3,v24
|
|
vcipher $out4,$out4,v24
|
|
vcipher $out5,$out5,v24
|
|
vcipher $out6,$out6,v24
|
|
vcipher $out7,$out7,v24
|
|
Loop_ctr32_enc8x_middle:
|
|
lvx v24,$x20,$key_ # round[3]
|
|
addi $key_,$key_,0x20
|
|
|
|
vcipher $out0,$out0,v25
|
|
vcipher $out1,$out1,v25
|
|
vcipher $out2,$out2,v25
|
|
vcipher $out3,$out3,v25
|
|
vcipher $out4,$out4,v25
|
|
vcipher $out5,$out5,v25
|
|
vcipher $out6,$out6,v25
|
|
vcipher $out7,$out7,v25
|
|
lvx v25,$x10,$key_ # round[4]
|
|
bdnz Loop_ctr32_enc8x
|
|
|
|
subic r11,$len,256 # $len-256, borrow $key_
|
|
vcipher $out0,$out0,v24
|
|
vcipher $out1,$out1,v24
|
|
vcipher $out2,$out2,v24
|
|
vcipher $out3,$out3,v24
|
|
vcipher $out4,$out4,v24
|
|
vcipher $out5,$out5,v24
|
|
vcipher $out6,$out6,v24
|
|
vcipher $out7,$out7,v24
|
|
|
|
subfe r0,r0,r0 # borrow?-1:0
|
|
vcipher $out0,$out0,v25
|
|
vcipher $out1,$out1,v25
|
|
vcipher $out2,$out2,v25
|
|
vcipher $out3,$out3,v25
|
|
vcipher $out4,$out4,v25
|
|
vcipher $out5,$out5,v25
|
|
vcipher $out6,$out6,v25
|
|
vcipher $out7,$out7,v25
|
|
|
|
and r0,r0,r11
|
|
addi $key_,$sp,$FRAME+15 # rewind $key_
|
|
vcipher $out0,$out0,v26
|
|
vcipher $out1,$out1,v26
|
|
vcipher $out2,$out2,v26
|
|
vcipher $out3,$out3,v26
|
|
vcipher $out4,$out4,v26
|
|
vcipher $out5,$out5,v26
|
|
vcipher $out6,$out6,v26
|
|
vcipher $out7,$out7,v26
|
|
lvx v24,$x00,$key_ # re-pre-load round[1]
|
|
|
|
subic $len,$len,129 # $len-=129
|
|
vcipher $out0,$out0,v27
|
|
addi $len,$len,1 # $len-=128 really
|
|
vcipher $out1,$out1,v27
|
|
vcipher $out2,$out2,v27
|
|
vcipher $out3,$out3,v27
|
|
vcipher $out4,$out4,v27
|
|
vcipher $out5,$out5,v27
|
|
vcipher $out6,$out6,v27
|
|
vcipher $out7,$out7,v27
|
|
lvx v25,$x10,$key_ # re-pre-load round[2]
|
|
|
|
vcipher $out0,$out0,v28
|
|
lvx_u $in0,$x00,$inp # load input
|
|
vcipher $out1,$out1,v28
|
|
lvx_u $in1,$x10,$inp
|
|
vcipher $out2,$out2,v28
|
|
lvx_u $in2,$x20,$inp
|
|
vcipher $out3,$out3,v28
|
|
lvx_u $in3,$x30,$inp
|
|
vcipher $out4,$out4,v28
|
|
lvx_u $in4,$x40,$inp
|
|
vcipher $out5,$out5,v28
|
|
lvx_u $in5,$x50,$inp
|
|
vcipher $out6,$out6,v28
|
|
lvx_u $in6,$x60,$inp
|
|
vcipher $out7,$out7,v28
|
|
lvx_u $in7,$x70,$inp
|
|
addi $inp,$inp,0x80
|
|
|
|
vcipher $out0,$out0,v29
|
|
le?vperm $in0,$in0,$in0,$inpperm
|
|
vcipher $out1,$out1,v29
|
|
le?vperm $in1,$in1,$in1,$inpperm
|
|
vcipher $out2,$out2,v29
|
|
le?vperm $in2,$in2,$in2,$inpperm
|
|
vcipher $out3,$out3,v29
|
|
le?vperm $in3,$in3,$in3,$inpperm
|
|
vcipher $out4,$out4,v29
|
|
le?vperm $in4,$in4,$in4,$inpperm
|
|
vcipher $out5,$out5,v29
|
|
le?vperm $in5,$in5,$in5,$inpperm
|
|
vcipher $out6,$out6,v29
|
|
le?vperm $in6,$in6,$in6,$inpperm
|
|
vcipher $out7,$out7,v29
|
|
le?vperm $in7,$in7,$in7,$inpperm
|
|
|
|
add $inp,$inp,r0 # $inp is adjusted in such
|
|
# way that at exit from the
|
|
# loop inX-in7 are loaded
|
|
# with last "words"
|
|
subfe. r0,r0,r0 # borrow?-1:0
|
|
vcipher $out0,$out0,v30
|
|
vxor $in0,$in0,v31 # xor with last round key
|
|
vcipher $out1,$out1,v30
|
|
vxor $in1,$in1,v31
|
|
vcipher $out2,$out2,v30
|
|
vxor $in2,$in2,v31
|
|
vcipher $out3,$out3,v30
|
|
vxor $in3,$in3,v31
|
|
vcipher $out4,$out4,v30
|
|
vxor $in4,$in4,v31
|
|
vcipher $out5,$out5,v30
|
|
vxor $in5,$in5,v31
|
|
vcipher $out6,$out6,v30
|
|
vxor $in6,$in6,v31
|
|
vcipher $out7,$out7,v30
|
|
vxor $in7,$in7,v31
|
|
|
|
bne Lctr32_enc8x_break # did $len-129 borrow?
|
|
|
|
vcipherlast $in0,$out0,$in0
|
|
vcipherlast $in1,$out1,$in1
|
|
vadduqm $out1,$ivec,$one # counter values ...
|
|
vcipherlast $in2,$out2,$in2
|
|
vadduqm $out2,$ivec,$two
|
|
vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0]
|
|
vcipherlast $in3,$out3,$in3
|
|
vadduqm $out3,$out1,$two
|
|
vxor $out1,$out1,$rndkey0
|
|
vcipherlast $in4,$out4,$in4
|
|
vadduqm $out4,$out2,$two
|
|
vxor $out2,$out2,$rndkey0
|
|
vcipherlast $in5,$out5,$in5
|
|
vadduqm $out5,$out3,$two
|
|
vxor $out3,$out3,$rndkey0
|
|
vcipherlast $in6,$out6,$in6
|
|
vadduqm $out6,$out4,$two
|
|
vxor $out4,$out4,$rndkey0
|
|
vcipherlast $in7,$out7,$in7
|
|
vadduqm $out7,$out5,$two
|
|
vxor $out5,$out5,$rndkey0
|
|
le?vperm $in0,$in0,$in0,$inpperm
|
|
vadduqm $ivec,$out6,$two # next counter value
|
|
vxor $out6,$out6,$rndkey0
|
|
le?vperm $in1,$in1,$in1,$inpperm
|
|
vxor $out7,$out7,$rndkey0
|
|
mtctr $rounds
|
|
|
|
vcipher $out0,$out0,v24
|
|
stvx_u $in0,$x00,$out
|
|
le?vperm $in2,$in2,$in2,$inpperm
|
|
vcipher $out1,$out1,v24
|
|
stvx_u $in1,$x10,$out
|
|
le?vperm $in3,$in3,$in3,$inpperm
|
|
vcipher $out2,$out2,v24
|
|
stvx_u $in2,$x20,$out
|
|
le?vperm $in4,$in4,$in4,$inpperm
|
|
vcipher $out3,$out3,v24
|
|
stvx_u $in3,$x30,$out
|
|
le?vperm $in5,$in5,$in5,$inpperm
|
|
vcipher $out4,$out4,v24
|
|
stvx_u $in4,$x40,$out
|
|
le?vperm $in6,$in6,$in6,$inpperm
|
|
vcipher $out5,$out5,v24
|
|
stvx_u $in5,$x50,$out
|
|
le?vperm $in7,$in7,$in7,$inpperm
|
|
vcipher $out6,$out6,v24
|
|
stvx_u $in6,$x60,$out
|
|
vcipher $out7,$out7,v24
|
|
stvx_u $in7,$x70,$out
|
|
addi $out,$out,0x80
|
|
|
|
b Loop_ctr32_enc8x_middle
|
|
|
|
.align 5
|
|
Lctr32_enc8x_break:
|
|
cmpwi $len,-0x60
|
|
blt Lctr32_enc8x_one
|
|
nop
|
|
beq Lctr32_enc8x_two
|
|
cmpwi $len,-0x40
|
|
blt Lctr32_enc8x_three
|
|
nop
|
|
beq Lctr32_enc8x_four
|
|
cmpwi $len,-0x20
|
|
blt Lctr32_enc8x_five
|
|
nop
|
|
beq Lctr32_enc8x_six
|
|
cmpwi $len,0x00
|
|
blt Lctr32_enc8x_seven
|
|
|
|
Lctr32_enc8x_eight:
|
|
vcipherlast $out0,$out0,$in0
|
|
vcipherlast $out1,$out1,$in1
|
|
vcipherlast $out2,$out2,$in2
|
|
vcipherlast $out3,$out3,$in3
|
|
vcipherlast $out4,$out4,$in4
|
|
vcipherlast $out5,$out5,$in5
|
|
vcipherlast $out6,$out6,$in6
|
|
vcipherlast $out7,$out7,$in7
|
|
|
|
le?vperm $out0,$out0,$out0,$inpperm
|
|
le?vperm $out1,$out1,$out1,$inpperm
|
|
stvx_u $out0,$x00,$out
|
|
le?vperm $out2,$out2,$out2,$inpperm
|
|
stvx_u $out1,$x10,$out
|
|
le?vperm $out3,$out3,$out3,$inpperm
|
|
stvx_u $out2,$x20,$out
|
|
le?vperm $out4,$out4,$out4,$inpperm
|
|
stvx_u $out3,$x30,$out
|
|
le?vperm $out5,$out5,$out5,$inpperm
|
|
stvx_u $out4,$x40,$out
|
|
le?vperm $out6,$out6,$out6,$inpperm
|
|
stvx_u $out5,$x50,$out
|
|
le?vperm $out7,$out7,$out7,$inpperm
|
|
stvx_u $out6,$x60,$out
|
|
stvx_u $out7,$x70,$out
|
|
addi $out,$out,0x80
|
|
b Lctr32_enc8x_done
|
|
|
|
.align 5
|
|
Lctr32_enc8x_seven:
|
|
vcipherlast $out0,$out0,$in1
|
|
vcipherlast $out1,$out1,$in2
|
|
vcipherlast $out2,$out2,$in3
|
|
vcipherlast $out3,$out3,$in4
|
|
vcipherlast $out4,$out4,$in5
|
|
vcipherlast $out5,$out5,$in6
|
|
vcipherlast $out6,$out6,$in7
|
|
|
|
le?vperm $out0,$out0,$out0,$inpperm
|
|
le?vperm $out1,$out1,$out1,$inpperm
|
|
stvx_u $out0,$x00,$out
|
|
le?vperm $out2,$out2,$out2,$inpperm
|
|
stvx_u $out1,$x10,$out
|
|
le?vperm $out3,$out3,$out3,$inpperm
|
|
stvx_u $out2,$x20,$out
|
|
le?vperm $out4,$out4,$out4,$inpperm
|
|
stvx_u $out3,$x30,$out
|
|
le?vperm $out5,$out5,$out5,$inpperm
|
|
stvx_u $out4,$x40,$out
|
|
le?vperm $out6,$out6,$out6,$inpperm
|
|
stvx_u $out5,$x50,$out
|
|
stvx_u $out6,$x60,$out
|
|
addi $out,$out,0x70
|
|
b Lctr32_enc8x_done
|
|
|
|
.align 5
|
|
Lctr32_enc8x_six:
|
|
vcipherlast $out0,$out0,$in2
|
|
vcipherlast $out1,$out1,$in3
|
|
vcipherlast $out2,$out2,$in4
|
|
vcipherlast $out3,$out3,$in5
|
|
vcipherlast $out4,$out4,$in6
|
|
vcipherlast $out5,$out5,$in7
|
|
|
|
le?vperm $out0,$out0,$out0,$inpperm
|
|
le?vperm $out1,$out1,$out1,$inpperm
|
|
stvx_u $out0,$x00,$out
|
|
le?vperm $out2,$out2,$out2,$inpperm
|
|
stvx_u $out1,$x10,$out
|
|
le?vperm $out3,$out3,$out3,$inpperm
|
|
stvx_u $out2,$x20,$out
|
|
le?vperm $out4,$out4,$out4,$inpperm
|
|
stvx_u $out3,$x30,$out
|
|
le?vperm $out5,$out5,$out5,$inpperm
|
|
stvx_u $out4,$x40,$out
|
|
stvx_u $out5,$x50,$out
|
|
addi $out,$out,0x60
|
|
b Lctr32_enc8x_done
|
|
|
|
.align 5
|
|
Lctr32_enc8x_five:
|
|
vcipherlast $out0,$out0,$in3
|
|
vcipherlast $out1,$out1,$in4
|
|
vcipherlast $out2,$out2,$in5
|
|
vcipherlast $out3,$out3,$in6
|
|
vcipherlast $out4,$out4,$in7
|
|
|
|
le?vperm $out0,$out0,$out0,$inpperm
|
|
le?vperm $out1,$out1,$out1,$inpperm
|
|
stvx_u $out0,$x00,$out
|
|
le?vperm $out2,$out2,$out2,$inpperm
|
|
stvx_u $out1,$x10,$out
|
|
le?vperm $out3,$out3,$out3,$inpperm
|
|
stvx_u $out2,$x20,$out
|
|
le?vperm $out4,$out4,$out4,$inpperm
|
|
stvx_u $out3,$x30,$out
|
|
stvx_u $out4,$x40,$out
|
|
addi $out,$out,0x50
|
|
b Lctr32_enc8x_done
|
|
|
|
.align 5
|
|
Lctr32_enc8x_four:
|
|
vcipherlast $out0,$out0,$in4
|
|
vcipherlast $out1,$out1,$in5
|
|
vcipherlast $out2,$out2,$in6
|
|
vcipherlast $out3,$out3,$in7
|
|
|
|
le?vperm $out0,$out0,$out0,$inpperm
|
|
le?vperm $out1,$out1,$out1,$inpperm
|
|
stvx_u $out0,$x00,$out
|
|
le?vperm $out2,$out2,$out2,$inpperm
|
|
stvx_u $out1,$x10,$out
|
|
le?vperm $out3,$out3,$out3,$inpperm
|
|
stvx_u $out2,$x20,$out
|
|
stvx_u $out3,$x30,$out
|
|
addi $out,$out,0x40
|
|
b Lctr32_enc8x_done
|
|
|
|
.align 5
|
|
Lctr32_enc8x_three:
|
|
vcipherlast $out0,$out0,$in5
|
|
vcipherlast $out1,$out1,$in6
|
|
vcipherlast $out2,$out2,$in7
|
|
|
|
le?vperm $out0,$out0,$out0,$inpperm
|
|
le?vperm $out1,$out1,$out1,$inpperm
|
|
stvx_u $out0,$x00,$out
|
|
le?vperm $out2,$out2,$out2,$inpperm
|
|
stvx_u $out1,$x10,$out
|
|
stvx_u $out2,$x20,$out
|
|
addi $out,$out,0x30
|
|
b Lcbc_dec8x_done
|
|
|
|
.align 5
|
|
Lctr32_enc8x_two:
|
|
vcipherlast $out0,$out0,$in6
|
|
vcipherlast $out1,$out1,$in7
|
|
|
|
le?vperm $out0,$out0,$out0,$inpperm
|
|
le?vperm $out1,$out1,$out1,$inpperm
|
|
stvx_u $out0,$x00,$out
|
|
stvx_u $out1,$x10,$out
|
|
addi $out,$out,0x20
|
|
b Lcbc_dec8x_done
|
|
|
|
.align 5
|
|
Lctr32_enc8x_one:
|
|
vcipherlast $out0,$out0,$in7
|
|
|
|
le?vperm $out0,$out0,$out0,$inpperm
|
|
stvx_u $out0,0,$out
|
|
addi $out,$out,0x10
|
|
|
|
Lctr32_enc8x_done:
|
|
li r10,`$FRAME+15`
|
|
li r11,`$FRAME+31`
|
|
stvx $inpperm,r10,$sp # wipe copies of round keys
|
|
addi r10,r10,32
|
|
stvx $inpperm,r11,$sp
|
|
addi r11,r11,32
|
|
stvx $inpperm,r10,$sp
|
|
addi r10,r10,32
|
|
stvx $inpperm,r11,$sp
|
|
addi r11,r11,32
|
|
stvx $inpperm,r10,$sp
|
|
addi r10,r10,32
|
|
stvx $inpperm,r11,$sp
|
|
addi r11,r11,32
|
|
stvx $inpperm,r10,$sp
|
|
addi r10,r10,32
|
|
stvx $inpperm,r11,$sp
|
|
addi r11,r11,32
|
|
|
|
mtspr 256,$vrsave
|
|
lvx v20,r10,$sp # ABI says so
|
|
addi r10,r10,32
|
|
lvx v21,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v22,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v23,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v24,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v25,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v26,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v27,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v28,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v29,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v30,r10,$sp
|
|
lvx v31,r11,$sp
|
|
$POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
|
|
$POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
|
|
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
|
|
$POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
|
|
$POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
|
|
$POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
|
|
addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x14,0,0x80,6,6,0
|
|
.long 0
|
|
.size .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
|
|
___
|
|
}} }}}
|
|
|
|
#########################################################################
|
|
{{{ # XTS procedures #
|
|
# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len, #
|
|
# const AES_KEY *key1, const AES_KEY *key2, #
|
|
# [const] unsigned char iv[16]); #
|
|
# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which #
|
|
# input tweak value is assumed to be encrypted already, and last tweak #
|
|
# value, one suitable for consecutive call on same chunk of data, is #
|
|
# written back to original buffer. In addition, in "tweak chaining" #
|
|
# mode only complete input blocks are processed. #
|
|
|
|
my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) = map("r$_",(3..10));
|
|
my ($rndkey0,$rndkey1,$inout) = map("v$_",(0..2));
|
|
my ($output,$inptail,$inpperm,$leperm,$keyperm) = map("v$_",(3..7));
|
|
my ($tweak,$seven,$eighty7,$tmp,$tweak1) = map("v$_",(8..12));
|
|
my $taillen = $key2;
|
|
|
|
($inp,$idx) = ($idx,$inp); # reassign
|
|
|
|
$code.=<<___;
|
|
.globl .${prefix}_xts_encrypt
|
|
mr $inp,r3 # reassign
|
|
li r3,-1
|
|
${UCMP}i $len,16
|
|
bltlr-
|
|
|
|
lis r0,0xfff0
|
|
mfspr r12,256 # save vrsave
|
|
li r11,0
|
|
mtspr 256,r0
|
|
|
|
vspltisb $seven,0x07 # 0x070707..07
|
|
le?lvsl $leperm,r11,r11
|
|
le?vspltisb $tmp,0x0f
|
|
le?vxor $leperm,$leperm,$seven
|
|
|
|
li $idx,15
|
|
lvx $tweak,0,$ivp # load [unaligned] iv
|
|
lvsl $inpperm,0,$ivp
|
|
lvx $inptail,$idx,$ivp
|
|
le?vxor $inpperm,$inpperm,$tmp
|
|
vperm $tweak,$tweak,$inptail,$inpperm
|
|
|
|
neg r11,$inp
|
|
lvsr $inpperm,0,r11 # prepare for unaligned load
|
|
lvx $inout,0,$inp
|
|
addi $inp,$inp,15 # 15 is not typo
|
|
le?vxor $inpperm,$inpperm,$tmp
|
|
|
|
${UCMP}i $key2,0 # key2==NULL?
|
|
beq Lxts_enc_no_key2
|
|
|
|
?lvsl $keyperm,0,$key2 # prepare for unaligned key
|
|
lwz $rounds,240($key2)
|
|
srwi $rounds,$rounds,1
|
|
subi $rounds,$rounds,1
|
|
li $idx,16
|
|
|
|
lvx $rndkey0,0,$key2
|
|
lvx $rndkey1,$idx,$key2
|
|
addi $idx,$idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vxor $tweak,$tweak,$rndkey0
|
|
lvx $rndkey0,$idx,$key2
|
|
addi $idx,$idx,16
|
|
mtctr $rounds
|
|
|
|
Ltweak_xts_enc:
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vcipher $tweak,$tweak,$rndkey1
|
|
lvx $rndkey1,$idx,$key2
|
|
addi $idx,$idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vcipher $tweak,$tweak,$rndkey0
|
|
lvx $rndkey0,$idx,$key2
|
|
addi $idx,$idx,16
|
|
bdnz Ltweak_xts_enc
|
|
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vcipher $tweak,$tweak,$rndkey1
|
|
lvx $rndkey1,$idx,$key2
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vcipherlast $tweak,$tweak,$rndkey0
|
|
|
|
li $ivp,0 # don't chain the tweak
|
|
b Lxts_enc
|
|
|
|
Lxts_enc_no_key2:
|
|
li $idx,-16
|
|
and $len,$len,$idx # in "tweak chaining"
|
|
# mode only complete
|
|
# blocks are processed
|
|
Lxts_enc:
|
|
lvx $inptail,0,$inp
|
|
addi $inp,$inp,16
|
|
|
|
?lvsl $keyperm,0,$key1 # prepare for unaligned key
|
|
lwz $rounds,240($key1)
|
|
srwi $rounds,$rounds,1
|
|
subi $rounds,$rounds,1
|
|
li $idx,16
|
|
|
|
vslb $eighty7,$seven,$seven # 0x808080..80
|
|
vor $eighty7,$eighty7,$seven # 0x878787..87
|
|
vspltisb $tmp,1 # 0x010101..01
|
|
vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01
|
|
|
|
${UCMP}i $len,96
|
|
bge _aesp8_xts_encrypt6x
|
|
|
|
andi. $taillen,$len,15
|
|
subic r0,$len,32
|
|
subi $taillen,$taillen,16
|
|
subfe r0,r0,r0
|
|
and r0,r0,$taillen
|
|
add $inp,$inp,r0
|
|
|
|
lvx $rndkey0,0,$key1
|
|
lvx $rndkey1,$idx,$key1
|
|
addi $idx,$idx,16
|
|
vperm $inout,$inout,$inptail,$inpperm
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vxor $inout,$inout,$tweak
|
|
vxor $inout,$inout,$rndkey0
|
|
lvx $rndkey0,$idx,$key1
|
|
addi $idx,$idx,16
|
|
mtctr $rounds
|
|
b Loop_xts_enc
|
|
|
|
.align 5
|
|
Loop_xts_enc:
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vcipher $inout,$inout,$rndkey1
|
|
lvx $rndkey1,$idx,$key1
|
|
addi $idx,$idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vcipher $inout,$inout,$rndkey0
|
|
lvx $rndkey0,$idx,$key1
|
|
addi $idx,$idx,16
|
|
bdnz Loop_xts_enc
|
|
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vcipher $inout,$inout,$rndkey1
|
|
lvx $rndkey1,$idx,$key1
|
|
li $idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vxor $rndkey0,$rndkey0,$tweak
|
|
vcipherlast $output,$inout,$rndkey0
|
|
|
|
le?vperm $tmp,$output,$output,$leperm
|
|
be?nop
|
|
le?stvx_u $tmp,0,$out
|
|
be?stvx_u $output,0,$out
|
|
addi $out,$out,16
|
|
|
|
subic. $len,$len,16
|
|
beq Lxts_enc_done
|
|
|
|
vmr $inout,$inptail
|
|
lvx $inptail,0,$inp
|
|
addi $inp,$inp,16
|
|
lvx $rndkey0,0,$key1
|
|
lvx $rndkey1,$idx,$key1
|
|
addi $idx,$idx,16
|
|
|
|
subic r0,$len,32
|
|
subfe r0,r0,r0
|
|
and r0,r0,$taillen
|
|
add $inp,$inp,r0
|
|
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
vperm $inout,$inout,$inptail,$inpperm
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vxor $inout,$inout,$tweak
|
|
vxor $output,$output,$rndkey0 # just in case $len<16
|
|
vxor $inout,$inout,$rndkey0
|
|
lvx $rndkey0,$idx,$key1
|
|
addi $idx,$idx,16
|
|
|
|
mtctr $rounds
|
|
${UCMP}i $len,16
|
|
bge Loop_xts_enc
|
|
|
|
vxor $output,$output,$tweak
|
|
lvsr $inpperm,0,$len # $inpperm is no longer needed
|
|
vxor $inptail,$inptail,$inptail # $inptail is no longer needed
|
|
vspltisb $tmp,-1
|
|
vperm $inptail,$inptail,$tmp,$inpperm
|
|
vsel $inout,$inout,$output,$inptail
|
|
|
|
subi r11,$out,17
|
|
subi $out,$out,16
|
|
mtctr $len
|
|
li $len,16
|
|
Loop_xts_enc_steal:
|
|
lbzu r0,1(r11)
|
|
stb r0,16(r11)
|
|
bdnz Loop_xts_enc_steal
|
|
|
|
mtctr $rounds
|
|
b Loop_xts_enc # one more time...
|
|
|
|
Lxts_enc_done:
|
|
${UCMP}i $ivp,0
|
|
beq Lxts_enc_ret
|
|
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
le?vperm $tweak,$tweak,$tweak,$leperm
|
|
stvx_u $tweak,0,$ivp
|
|
|
|
Lxts_enc_ret:
|
|
mtspr 256,r12 # restore vrsave
|
|
li r3,0
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x04,0,0x80,6,6,0
|
|
.long 0
|
|
.size .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
|
|
|
|
.globl .${prefix}_xts_decrypt
|
|
mr $inp,r3 # reassign
|
|
li r3,-1
|
|
${UCMP}i $len,16
|
|
bltlr-
|
|
|
|
lis r0,0xfff8
|
|
mfspr r12,256 # save vrsave
|
|
li r11,0
|
|
mtspr 256,r0
|
|
|
|
andi. r0,$len,15
|
|
neg r0,r0
|
|
andi. r0,r0,16
|
|
sub $len,$len,r0
|
|
|
|
vspltisb $seven,0x07 # 0x070707..07
|
|
le?lvsl $leperm,r11,r11
|
|
le?vspltisb $tmp,0x0f
|
|
le?vxor $leperm,$leperm,$seven
|
|
|
|
li $idx,15
|
|
lvx $tweak,0,$ivp # load [unaligned] iv
|
|
lvsl $inpperm,0,$ivp
|
|
lvx $inptail,$idx,$ivp
|
|
le?vxor $inpperm,$inpperm,$tmp
|
|
vperm $tweak,$tweak,$inptail,$inpperm
|
|
|
|
neg r11,$inp
|
|
lvsr $inpperm,0,r11 # prepare for unaligned load
|
|
lvx $inout,0,$inp
|
|
addi $inp,$inp,15 # 15 is not typo
|
|
le?vxor $inpperm,$inpperm,$tmp
|
|
|
|
${UCMP}i $key2,0 # key2==NULL?
|
|
beq Lxts_dec_no_key2
|
|
|
|
?lvsl $keyperm,0,$key2 # prepare for unaligned key
|
|
lwz $rounds,240($key2)
|
|
srwi $rounds,$rounds,1
|
|
subi $rounds,$rounds,1
|
|
li $idx,16
|
|
|
|
lvx $rndkey0,0,$key2
|
|
lvx $rndkey1,$idx,$key2
|
|
addi $idx,$idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vxor $tweak,$tweak,$rndkey0
|
|
lvx $rndkey0,$idx,$key2
|
|
addi $idx,$idx,16
|
|
mtctr $rounds
|
|
|
|
Ltweak_xts_dec:
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vcipher $tweak,$tweak,$rndkey1
|
|
lvx $rndkey1,$idx,$key2
|
|
addi $idx,$idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vcipher $tweak,$tweak,$rndkey0
|
|
lvx $rndkey0,$idx,$key2
|
|
addi $idx,$idx,16
|
|
bdnz Ltweak_xts_dec
|
|
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vcipher $tweak,$tweak,$rndkey1
|
|
lvx $rndkey1,$idx,$key2
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vcipherlast $tweak,$tweak,$rndkey0
|
|
|
|
li $ivp,0 # don't chain the tweak
|
|
b Lxts_dec
|
|
|
|
Lxts_dec_no_key2:
|
|
neg $idx,$len
|
|
andi. $idx,$idx,15
|
|
add $len,$len,$idx # in "tweak chaining"
|
|
# mode only complete
|
|
# blocks are processed
|
|
Lxts_dec:
|
|
lvx $inptail,0,$inp
|
|
addi $inp,$inp,16
|
|
|
|
?lvsl $keyperm,0,$key1 # prepare for unaligned key
|
|
lwz $rounds,240($key1)
|
|
srwi $rounds,$rounds,1
|
|
subi $rounds,$rounds,1
|
|
li $idx,16
|
|
|
|
vslb $eighty7,$seven,$seven # 0x808080..80
|
|
vor $eighty7,$eighty7,$seven # 0x878787..87
|
|
vspltisb $tmp,1 # 0x010101..01
|
|
vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01
|
|
|
|
${UCMP}i $len,96
|
|
bge _aesp8_xts_decrypt6x
|
|
|
|
lvx $rndkey0,0,$key1
|
|
lvx $rndkey1,$idx,$key1
|
|
addi $idx,$idx,16
|
|
vperm $inout,$inout,$inptail,$inpperm
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vxor $inout,$inout,$tweak
|
|
vxor $inout,$inout,$rndkey0
|
|
lvx $rndkey0,$idx,$key1
|
|
addi $idx,$idx,16
|
|
mtctr $rounds
|
|
|
|
${UCMP}i $len,16
|
|
blt Ltail_xts_dec
|
|
be?b Loop_xts_dec
|
|
|
|
.align 5
|
|
Loop_xts_dec:
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vncipher $inout,$inout,$rndkey1
|
|
lvx $rndkey1,$idx,$key1
|
|
addi $idx,$idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vncipher $inout,$inout,$rndkey0
|
|
lvx $rndkey0,$idx,$key1
|
|
addi $idx,$idx,16
|
|
bdnz Loop_xts_dec
|
|
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vncipher $inout,$inout,$rndkey1
|
|
lvx $rndkey1,$idx,$key1
|
|
li $idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vxor $rndkey0,$rndkey0,$tweak
|
|
vncipherlast $output,$inout,$rndkey0
|
|
|
|
le?vperm $tmp,$output,$output,$leperm
|
|
be?nop
|
|
le?stvx_u $tmp,0,$out
|
|
be?stvx_u $output,0,$out
|
|
addi $out,$out,16
|
|
|
|
subic. $len,$len,16
|
|
beq Lxts_dec_done
|
|
|
|
vmr $inout,$inptail
|
|
lvx $inptail,0,$inp
|
|
addi $inp,$inp,16
|
|
lvx $rndkey0,0,$key1
|
|
lvx $rndkey1,$idx,$key1
|
|
addi $idx,$idx,16
|
|
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
vperm $inout,$inout,$inptail,$inpperm
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vxor $inout,$inout,$tweak
|
|
vxor $inout,$inout,$rndkey0
|
|
lvx $rndkey0,$idx,$key1
|
|
addi $idx,$idx,16
|
|
|
|
mtctr $rounds
|
|
${UCMP}i $len,16
|
|
bge Loop_xts_dec
|
|
|
|
Ltail_xts_dec:
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak1,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $tweak1,$tweak1,$tmp
|
|
|
|
subi $inp,$inp,16
|
|
add $inp,$inp,$len
|
|
|
|
vxor $inout,$inout,$tweak # :-(
|
|
vxor $inout,$inout,$tweak1 # :-)
|
|
|
|
Loop_xts_dec_short:
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vncipher $inout,$inout,$rndkey1
|
|
lvx $rndkey1,$idx,$key1
|
|
addi $idx,$idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vncipher $inout,$inout,$rndkey0
|
|
lvx $rndkey0,$idx,$key1
|
|
addi $idx,$idx,16
|
|
bdnz Loop_xts_dec_short
|
|
|
|
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
|
|
vncipher $inout,$inout,$rndkey1
|
|
lvx $rndkey1,$idx,$key1
|
|
li $idx,16
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
vxor $rndkey0,$rndkey0,$tweak1
|
|
vncipherlast $output,$inout,$rndkey0
|
|
|
|
le?vperm $tmp,$output,$output,$leperm
|
|
be?nop
|
|
le?stvx_u $tmp,0,$out
|
|
be?stvx_u $output,0,$out
|
|
|
|
vmr $inout,$inptail
|
|
lvx $inptail,0,$inp
|
|
#addi $inp,$inp,16
|
|
lvx $rndkey0,0,$key1
|
|
lvx $rndkey1,$idx,$key1
|
|
addi $idx,$idx,16
|
|
vperm $inout,$inout,$inptail,$inpperm
|
|
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
|
|
|
|
lvsr $inpperm,0,$len # $inpperm is no longer needed
|
|
vxor $inptail,$inptail,$inptail # $inptail is no longer needed
|
|
vspltisb $tmp,-1
|
|
vperm $inptail,$inptail,$tmp,$inpperm
|
|
vsel $inout,$inout,$output,$inptail
|
|
|
|
vxor $rndkey0,$rndkey0,$tweak
|
|
vxor $inout,$inout,$rndkey0
|
|
lvx $rndkey0,$idx,$key1
|
|
addi $idx,$idx,16
|
|
|
|
subi r11,$out,1
|
|
mtctr $len
|
|
li $len,16
|
|
Loop_xts_dec_steal:
|
|
lbzu r0,1(r11)
|
|
stb r0,16(r11)
|
|
bdnz Loop_xts_dec_steal
|
|
|
|
mtctr $rounds
|
|
b Loop_xts_dec # one more time...
|
|
|
|
Lxts_dec_done:
|
|
${UCMP}i $ivp,0
|
|
beq Lxts_dec_ret
|
|
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
le?vperm $tweak,$tweak,$tweak,$leperm
|
|
stvx_u $tweak,0,$ivp
|
|
|
|
Lxts_dec_ret:
|
|
mtspr 256,r12 # restore vrsave
|
|
li r3,0
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x04,0,0x80,6,6,0
|
|
.long 0
|
|
.size .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
|
|
___
|
|
#########################################################################
|
|
{{ # Optimized XTS procedures #
|
|
my $key_=$key2;
|
|
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
|
|
$x00=0 if ($flavour =~ /osx/);
|
|
my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5));
|
|
my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
|
|
my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
|
|
my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
|
|
# v26-v31 last 6 round keys
|
|
my ($keyperm)=($out0); # aliases with "caller", redundant assignment
|
|
my $taillen=$x70;
|
|
|
|
$code.=<<___;
|
|
.align 5
|
|
_aesp8_xts_encrypt6x:
|
|
$STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
|
|
mflr r11
|
|
li r7,`$FRAME+8*16+15`
|
|
li r3,`$FRAME+8*16+31`
|
|
$PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
|
|
stvx v20,r7,$sp # ABI says so
|
|
addi r7,r7,32
|
|
stvx v21,r3,$sp
|
|
addi r3,r3,32
|
|
stvx v22,r7,$sp
|
|
addi r7,r7,32
|
|
stvx v23,r3,$sp
|
|
addi r3,r3,32
|
|
stvx v24,r7,$sp
|
|
addi r7,r7,32
|
|
stvx v25,r3,$sp
|
|
addi r3,r3,32
|
|
stvx v26,r7,$sp
|
|
addi r7,r7,32
|
|
stvx v27,r3,$sp
|
|
addi r3,r3,32
|
|
stvx v28,r7,$sp
|
|
addi r7,r7,32
|
|
stvx v29,r3,$sp
|
|
addi r3,r3,32
|
|
stvx v30,r7,$sp
|
|
stvx v31,r3,$sp
|
|
li r0,-1
|
|
stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
|
|
li $x10,0x10
|
|
$PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
|
|
li $x20,0x20
|
|
$PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
|
|
li $x30,0x30
|
|
$PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
|
|
li $x40,0x40
|
|
$PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
|
|
li $x50,0x50
|
|
$PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
|
|
li $x60,0x60
|
|
$PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
|
|
li $x70,0x70
|
|
mtspr 256,r0
|
|
|
|
subi $rounds,$rounds,3 # -4 in total
|
|
|
|
lvx $rndkey0,$x00,$key1 # load key schedule
|
|
lvx v30,$x10,$key1
|
|
addi $key1,$key1,0x20
|
|
lvx v31,$x00,$key1
|
|
?vperm $rndkey0,$rndkey0,v30,$keyperm
|
|
addi $key_,$sp,$FRAME+15
|
|
mtctr $rounds
|
|
|
|
Load_xts_enc_key:
|
|
?vperm v24,v30,v31,$keyperm
|
|
lvx v30,$x10,$key1
|
|
addi $key1,$key1,0x20
|
|
stvx v24,$x00,$key_ # off-load round[1]
|
|
?vperm v25,v31,v30,$keyperm
|
|
lvx v31,$x00,$key1
|
|
stvx v25,$x10,$key_ # off-load round[2]
|
|
addi $key_,$key_,0x20
|
|
bdnz Load_xts_enc_key
|
|
|
|
lvx v26,$x10,$key1
|
|
?vperm v24,v30,v31,$keyperm
|
|
lvx v27,$x20,$key1
|
|
stvx v24,$x00,$key_ # off-load round[3]
|
|
?vperm v25,v31,v26,$keyperm
|
|
lvx v28,$x30,$key1
|
|
stvx v25,$x10,$key_ # off-load round[4]
|
|
addi $key_,$sp,$FRAME+15 # rewind $key_
|
|
?vperm v26,v26,v27,$keyperm
|
|
lvx v29,$x40,$key1
|
|
?vperm v27,v27,v28,$keyperm
|
|
lvx v30,$x50,$key1
|
|
?vperm v28,v28,v29,$keyperm
|
|
lvx v31,$x60,$key1
|
|
?vperm v29,v29,v30,$keyperm
|
|
lvx $twk5,$x70,$key1 # borrow $twk5
|
|
?vperm v30,v30,v31,$keyperm
|
|
lvx v24,$x00,$key_ # pre-load round[1]
|
|
?vperm v31,v31,$twk5,$keyperm
|
|
lvx v25,$x10,$key_ # pre-load round[2]
|
|
|
|
vperm $in0,$inout,$inptail,$inpperm
|
|
subi $inp,$inp,31 # undo "caller"
|
|
vxor $twk0,$tweak,$rndkey0
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $out0,$in0,$twk0
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
lvx_u $in1,$x10,$inp
|
|
vxor $twk1,$tweak,$rndkey0
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
le?vperm $in1,$in1,$in1,$leperm
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $out1,$in1,$twk1
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
lvx_u $in2,$x20,$inp
|
|
andi. $taillen,$len,15
|
|
vxor $twk2,$tweak,$rndkey0
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
le?vperm $in2,$in2,$in2,$leperm
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $out2,$in2,$twk2
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
lvx_u $in3,$x30,$inp
|
|
sub $len,$len,$taillen
|
|
vxor $twk3,$tweak,$rndkey0
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
le?vperm $in3,$in3,$in3,$leperm
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $out3,$in3,$twk3
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
lvx_u $in4,$x40,$inp
|
|
subi $len,$len,0x60
|
|
vxor $twk4,$tweak,$rndkey0
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
le?vperm $in4,$in4,$in4,$leperm
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $out4,$in4,$twk4
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
lvx_u $in5,$x50,$inp
|
|
addi $inp,$inp,0x60
|
|
vxor $twk5,$tweak,$rndkey0
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
le?vperm $in5,$in5,$in5,$leperm
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $out5,$in5,$twk5
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
vxor v31,v31,$rndkey0
|
|
mtctr $rounds
|
|
b Loop_xts_enc6x
|
|
|
|
.align 5
|
|
Loop_xts_enc6x:
|
|
vcipher $out0,$out0,v24
|
|
vcipher $out1,$out1,v24
|
|
vcipher $out2,$out2,v24
|
|
vcipher $out3,$out3,v24
|
|
vcipher $out4,$out4,v24
|
|
vcipher $out5,$out5,v24
|
|
lvx v24,$x20,$key_ # round[3]
|
|
addi $key_,$key_,0x20
|
|
|
|
vcipher $out0,$out0,v25
|
|
vcipher $out1,$out1,v25
|
|
vcipher $out2,$out2,v25
|
|
vcipher $out3,$out3,v25
|
|
vcipher $out4,$out4,v25
|
|
vcipher $out5,$out5,v25
|
|
lvx v25,$x10,$key_ # round[4]
|
|
bdnz Loop_xts_enc6x
|
|
|
|
subic $len,$len,96 # $len-=96
|
|
vxor $in0,$twk0,v31 # xor with last round key
|
|
vcipher $out0,$out0,v24
|
|
vcipher $out1,$out1,v24
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vxor $twk0,$tweak,$rndkey0
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vcipher $out2,$out2,v24
|
|
vcipher $out3,$out3,v24
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vcipher $out4,$out4,v24
|
|
vcipher $out5,$out5,v24
|
|
|
|
subfe. r0,r0,r0 # borrow?-1:0
|
|
vand $tmp,$tmp,$eighty7
|
|
vcipher $out0,$out0,v25
|
|
vcipher $out1,$out1,v25
|
|
vxor $tweak,$tweak,$tmp
|
|
vcipher $out2,$out2,v25
|
|
vcipher $out3,$out3,v25
|
|
vxor $in1,$twk1,v31
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vxor $twk1,$tweak,$rndkey0
|
|
vcipher $out4,$out4,v25
|
|
vcipher $out5,$out5,v25
|
|
|
|
and r0,r0,$len
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vcipher $out0,$out0,v26
|
|
vcipher $out1,$out1,v26
|
|
vand $tmp,$tmp,$eighty7
|
|
vcipher $out2,$out2,v26
|
|
vcipher $out3,$out3,v26
|
|
vxor $tweak,$tweak,$tmp
|
|
vcipher $out4,$out4,v26
|
|
vcipher $out5,$out5,v26
|
|
|
|
add $inp,$inp,r0 # $inp is adjusted in such
|
|
# way that at exit from the
|
|
# loop inX-in5 are loaded
|
|
# with last "words"
|
|
vxor $in2,$twk2,v31
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vxor $twk2,$tweak,$rndkey0
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vcipher $out0,$out0,v27
|
|
vcipher $out1,$out1,v27
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vcipher $out2,$out2,v27
|
|
vcipher $out3,$out3,v27
|
|
vand $tmp,$tmp,$eighty7
|
|
vcipher $out4,$out4,v27
|
|
vcipher $out5,$out5,v27
|
|
|
|
addi $key_,$sp,$FRAME+15 # rewind $key_
|
|
vxor $tweak,$tweak,$tmp
|
|
vcipher $out0,$out0,v28
|
|
vcipher $out1,$out1,v28
|
|
vxor $in3,$twk3,v31
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vxor $twk3,$tweak,$rndkey0
|
|
vcipher $out2,$out2,v28
|
|
vcipher $out3,$out3,v28
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vcipher $out4,$out4,v28
|
|
vcipher $out5,$out5,v28
|
|
lvx v24,$x00,$key_ # re-pre-load round[1]
|
|
vand $tmp,$tmp,$eighty7
|
|
|
|
vcipher $out0,$out0,v29
|
|
vcipher $out1,$out1,v29
|
|
vxor $tweak,$tweak,$tmp
|
|
vcipher $out2,$out2,v29
|
|
vcipher $out3,$out3,v29
|
|
vxor $in4,$twk4,v31
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vxor $twk4,$tweak,$rndkey0
|
|
vcipher $out4,$out4,v29
|
|
vcipher $out5,$out5,v29
|
|
lvx v25,$x10,$key_ # re-pre-load round[2]
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
|
|
vcipher $out0,$out0,v30
|
|
vcipher $out1,$out1,v30
|
|
vand $tmp,$tmp,$eighty7
|
|
vcipher $out2,$out2,v30
|
|
vcipher $out3,$out3,v30
|
|
vxor $tweak,$tweak,$tmp
|
|
vcipher $out4,$out4,v30
|
|
vcipher $out5,$out5,v30
|
|
vxor $in5,$twk5,v31
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vxor $twk5,$tweak,$rndkey0
|
|
|
|
vcipherlast $out0,$out0,$in0
|
|
lvx_u $in0,$x00,$inp # load next input block
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vcipherlast $out1,$out1,$in1
|
|
lvx_u $in1,$x10,$inp
|
|
vcipherlast $out2,$out2,$in2
|
|
le?vperm $in0,$in0,$in0,$leperm
|
|
lvx_u $in2,$x20,$inp
|
|
vand $tmp,$tmp,$eighty7
|
|
vcipherlast $out3,$out3,$in3
|
|
le?vperm $in1,$in1,$in1,$leperm
|
|
lvx_u $in3,$x30,$inp
|
|
vcipherlast $out4,$out4,$in4
|
|
le?vperm $in2,$in2,$in2,$leperm
|
|
lvx_u $in4,$x40,$inp
|
|
vxor $tweak,$tweak,$tmp
|
|
vcipherlast $tmp,$out5,$in5 # last block might be needed
|
|
# in stealing mode
|
|
le?vperm $in3,$in3,$in3,$leperm
|
|
lvx_u $in5,$x50,$inp
|
|
addi $inp,$inp,0x60
|
|
le?vperm $in4,$in4,$in4,$leperm
|
|
le?vperm $in5,$in5,$in5,$leperm
|
|
|
|
le?vperm $out0,$out0,$out0,$leperm
|
|
le?vperm $out1,$out1,$out1,$leperm
|
|
stvx_u $out0,$x00,$out # store output
|
|
vxor $out0,$in0,$twk0
|
|
le?vperm $out2,$out2,$out2,$leperm
|
|
stvx_u $out1,$x10,$out
|
|
vxor $out1,$in1,$twk1
|
|
le?vperm $out3,$out3,$out3,$leperm
|
|
stvx_u $out2,$x20,$out
|
|
vxor $out2,$in2,$twk2
|
|
le?vperm $out4,$out4,$out4,$leperm
|
|
stvx_u $out3,$x30,$out
|
|
vxor $out3,$in3,$twk3
|
|
le?vperm $out5,$tmp,$tmp,$leperm
|
|
stvx_u $out4,$x40,$out
|
|
vxor $out4,$in4,$twk4
|
|
le?stvx_u $out5,$x50,$out
|
|
be?stvx_u $tmp, $x50,$out
|
|
vxor $out5,$in5,$twk5
|
|
addi $out,$out,0x60
|
|
|
|
mtctr $rounds
|
|
beq Loop_xts_enc6x # did $len-=96 borrow?
|
|
|
|
addic. $len,$len,0x60
|
|
beq Lxts_enc6x_zero
|
|
cmpwi $len,0x20
|
|
blt Lxts_enc6x_one
|
|
nop
|
|
beq Lxts_enc6x_two
|
|
cmpwi $len,0x40
|
|
blt Lxts_enc6x_three
|
|
nop
|
|
beq Lxts_enc6x_four
|
|
|
|
Lxts_enc6x_five:
|
|
vxor $out0,$in1,$twk0
|
|
vxor $out1,$in2,$twk1
|
|
vxor $out2,$in3,$twk2
|
|
vxor $out3,$in4,$twk3
|
|
vxor $out4,$in5,$twk4
|
|
|
|
bl _aesp8_xts_enc5x
|
|
|
|
le?vperm $out0,$out0,$out0,$leperm
|
|
vmr $twk0,$twk5 # unused tweak
|
|
le?vperm $out1,$out1,$out1,$leperm
|
|
stvx_u $out0,$x00,$out # store output
|
|
le?vperm $out2,$out2,$out2,$leperm
|
|
stvx_u $out1,$x10,$out
|
|
le?vperm $out3,$out3,$out3,$leperm
|
|
stvx_u $out2,$x20,$out
|
|
vxor $tmp,$out4,$twk5 # last block prep for stealing
|
|
le?vperm $out4,$out4,$out4,$leperm
|
|
stvx_u $out3,$x30,$out
|
|
stvx_u $out4,$x40,$out
|
|
addi $out,$out,0x50
|
|
bne Lxts_enc6x_steal
|
|
b Lxts_enc6x_done
|
|
|
|
.align 4
|
|
Lxts_enc6x_four:
|
|
vxor $out0,$in2,$twk0
|
|
vxor $out1,$in3,$twk1
|
|
vxor $out2,$in4,$twk2
|
|
vxor $out3,$in5,$twk3
|
|
vxor $out4,$out4,$out4
|
|
|
|
bl _aesp8_xts_enc5x
|
|
|
|
le?vperm $out0,$out0,$out0,$leperm
|
|
vmr $twk0,$twk4 # unused tweak
|
|
le?vperm $out1,$out1,$out1,$leperm
|
|
stvx_u $out0,$x00,$out # store output
|
|
le?vperm $out2,$out2,$out2,$leperm
|
|
stvx_u $out1,$x10,$out
|
|
vxor $tmp,$out3,$twk4 # last block prep for stealing
|
|
le?vperm $out3,$out3,$out3,$leperm
|
|
stvx_u $out2,$x20,$out
|
|
stvx_u $out3,$x30,$out
|
|
addi $out,$out,0x40
|
|
bne Lxts_enc6x_steal
|
|
b Lxts_enc6x_done
|
|
|
|
.align 4
|
|
Lxts_enc6x_three:
|
|
vxor $out0,$in3,$twk0
|
|
vxor $out1,$in4,$twk1
|
|
vxor $out2,$in5,$twk2
|
|
vxor $out3,$out3,$out3
|
|
vxor $out4,$out4,$out4
|
|
|
|
bl _aesp8_xts_enc5x
|
|
|
|
le?vperm $out0,$out0,$out0,$leperm
|
|
vmr $twk0,$twk3 # unused tweak
|
|
le?vperm $out1,$out1,$out1,$leperm
|
|
stvx_u $out0,$x00,$out # store output
|
|
vxor $tmp,$out2,$twk3 # last block prep for stealing
|
|
le?vperm $out2,$out2,$out2,$leperm
|
|
stvx_u $out1,$x10,$out
|
|
stvx_u $out2,$x20,$out
|
|
addi $out,$out,0x30
|
|
bne Lxts_enc6x_steal
|
|
b Lxts_enc6x_done
|
|
|
|
.align 4
|
|
Lxts_enc6x_two:
|
|
vxor $out0,$in4,$twk0
|
|
vxor $out1,$in5,$twk1
|
|
vxor $out2,$out2,$out2
|
|
vxor $out3,$out3,$out3
|
|
vxor $out4,$out4,$out4
|
|
|
|
bl _aesp8_xts_enc5x
|
|
|
|
le?vperm $out0,$out0,$out0,$leperm
|
|
vmr $twk0,$twk2 # unused tweak
|
|
vxor $tmp,$out1,$twk2 # last block prep for stealing
|
|
le?vperm $out1,$out1,$out1,$leperm
|
|
stvx_u $out0,$x00,$out # store output
|
|
stvx_u $out1,$x10,$out
|
|
addi $out,$out,0x20
|
|
bne Lxts_enc6x_steal
|
|
b Lxts_enc6x_done
|
|
|
|
.align 4
|
|
Lxts_enc6x_one:
|
|
vxor $out0,$in5,$twk0
|
|
nop
|
|
Loop_xts_enc1x:
|
|
vcipher $out0,$out0,v24
|
|
lvx v24,$x20,$key_ # round[3]
|
|
addi $key_,$key_,0x20
|
|
|
|
vcipher $out0,$out0,v25
|
|
lvx v25,$x10,$key_ # round[4]
|
|
bdnz Loop_xts_enc1x
|
|
|
|
add $inp,$inp,$taillen
|
|
cmpwi $taillen,0
|
|
vcipher $out0,$out0,v24
|
|
|
|
subi $inp,$inp,16
|
|
vcipher $out0,$out0,v25
|
|
|
|
lvsr $inpperm,0,$taillen
|
|
vcipher $out0,$out0,v26
|
|
|
|
lvx_u $in0,0,$inp
|
|
vcipher $out0,$out0,v27
|
|
|
|
addi $key_,$sp,$FRAME+15 # rewind $key_
|
|
vcipher $out0,$out0,v28
|
|
lvx v24,$x00,$key_ # re-pre-load round[1]
|
|
|
|
vcipher $out0,$out0,v29
|
|
lvx v25,$x10,$key_ # re-pre-load round[2]
|
|
vxor $twk0,$twk0,v31
|
|
|
|
le?vperm $in0,$in0,$in0,$leperm
|
|
vcipher $out0,$out0,v30
|
|
|
|
vperm $in0,$in0,$in0,$inpperm
|
|
vcipherlast $out0,$out0,$twk0
|
|
|
|
vmr $twk0,$twk1 # unused tweak
|
|
vxor $tmp,$out0,$twk1 # last block prep for stealing
|
|
le?vperm $out0,$out0,$out0,$leperm
|
|
stvx_u $out0,$x00,$out # store output
|
|
addi $out,$out,0x10
|
|
bne Lxts_enc6x_steal
|
|
b Lxts_enc6x_done
|
|
|
|
.align 4
|
|
Lxts_enc6x_zero:
|
|
cmpwi $taillen,0
|
|
beq Lxts_enc6x_done
|
|
|
|
add $inp,$inp,$taillen
|
|
subi $inp,$inp,16
|
|
lvx_u $in0,0,$inp
|
|
lvsr $inpperm,0,$taillen # $in5 is no more
|
|
le?vperm $in0,$in0,$in0,$leperm
|
|
vperm $in0,$in0,$in0,$inpperm
|
|
vxor $tmp,$tmp,$twk0
|
|
Lxts_enc6x_steal:
|
|
vxor $in0,$in0,$twk0
|
|
vxor $out0,$out0,$out0
|
|
vspltisb $out1,-1
|
|
vperm $out0,$out0,$out1,$inpperm
|
|
vsel $out0,$in0,$tmp,$out0 # $tmp is last block, remember?
|
|
|
|
subi r30,$out,17
|
|
subi $out,$out,16
|
|
mtctr $taillen
|
|
Loop_xts_enc6x_steal:
|
|
lbzu r0,1(r30)
|
|
stb r0,16(r30)
|
|
bdnz Loop_xts_enc6x_steal
|
|
|
|
li $taillen,0
|
|
mtctr $rounds
|
|
b Loop_xts_enc1x # one more time...
|
|
|
|
.align 4
|
|
Lxts_enc6x_done:
|
|
${UCMP}i $ivp,0
|
|
beq Lxts_enc6x_ret
|
|
|
|
vxor $tweak,$twk0,$rndkey0
|
|
le?vperm $tweak,$tweak,$tweak,$leperm
|
|
stvx_u $tweak,0,$ivp
|
|
|
|
Lxts_enc6x_ret:
|
|
mtlr r11
|
|
li r10,`$FRAME+15`
|
|
li r11,`$FRAME+31`
|
|
stvx $seven,r10,$sp # wipe copies of round keys
|
|
addi r10,r10,32
|
|
stvx $seven,r11,$sp
|
|
addi r11,r11,32
|
|
stvx $seven,r10,$sp
|
|
addi r10,r10,32
|
|
stvx $seven,r11,$sp
|
|
addi r11,r11,32
|
|
stvx $seven,r10,$sp
|
|
addi r10,r10,32
|
|
stvx $seven,r11,$sp
|
|
addi r11,r11,32
|
|
stvx $seven,r10,$sp
|
|
addi r10,r10,32
|
|
stvx $seven,r11,$sp
|
|
addi r11,r11,32
|
|
|
|
mtspr 256,$vrsave
|
|
lvx v20,r10,$sp # ABI says so
|
|
addi r10,r10,32
|
|
lvx v21,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v22,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v23,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v24,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v25,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v26,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v27,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v28,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v29,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v30,r10,$sp
|
|
lvx v31,r11,$sp
|
|
$POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
|
|
$POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
|
|
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
|
|
$POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
|
|
$POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
|
|
$POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
|
|
addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x04,1,0x80,6,6,0
|
|
.long 0
|
|
|
|
.align 5
|
|
_aesp8_xts_enc5x:
|
|
vcipher $out0,$out0,v24
|
|
vcipher $out1,$out1,v24
|
|
vcipher $out2,$out2,v24
|
|
vcipher $out3,$out3,v24
|
|
vcipher $out4,$out4,v24
|
|
lvx v24,$x20,$key_ # round[3]
|
|
addi $key_,$key_,0x20
|
|
|
|
vcipher $out0,$out0,v25
|
|
vcipher $out1,$out1,v25
|
|
vcipher $out2,$out2,v25
|
|
vcipher $out3,$out3,v25
|
|
vcipher $out4,$out4,v25
|
|
lvx v25,$x10,$key_ # round[4]
|
|
bdnz _aesp8_xts_enc5x
|
|
|
|
add $inp,$inp,$taillen
|
|
cmpwi $taillen,0
|
|
vcipher $out0,$out0,v24
|
|
vcipher $out1,$out1,v24
|
|
vcipher $out2,$out2,v24
|
|
vcipher $out3,$out3,v24
|
|
vcipher $out4,$out4,v24
|
|
|
|
subi $inp,$inp,16
|
|
vcipher $out0,$out0,v25
|
|
vcipher $out1,$out1,v25
|
|
vcipher $out2,$out2,v25
|
|
vcipher $out3,$out3,v25
|
|
vcipher $out4,$out4,v25
|
|
vxor $twk0,$twk0,v31
|
|
|
|
vcipher $out0,$out0,v26
|
|
lvsr $inpperm,r0,$taillen # $in5 is no more
|
|
vcipher $out1,$out1,v26
|
|
vcipher $out2,$out2,v26
|
|
vcipher $out3,$out3,v26
|
|
vcipher $out4,$out4,v26
|
|
vxor $in1,$twk1,v31
|
|
|
|
vcipher $out0,$out0,v27
|
|
lvx_u $in0,0,$inp
|
|
vcipher $out1,$out1,v27
|
|
vcipher $out2,$out2,v27
|
|
vcipher $out3,$out3,v27
|
|
vcipher $out4,$out4,v27
|
|
vxor $in2,$twk2,v31
|
|
|
|
addi $key_,$sp,$FRAME+15 # rewind $key_
|
|
vcipher $out0,$out0,v28
|
|
vcipher $out1,$out1,v28
|
|
vcipher $out2,$out2,v28
|
|
vcipher $out3,$out3,v28
|
|
vcipher $out4,$out4,v28
|
|
lvx v24,$x00,$key_ # re-pre-load round[1]
|
|
vxor $in3,$twk3,v31
|
|
|
|
vcipher $out0,$out0,v29
|
|
le?vperm $in0,$in0,$in0,$leperm
|
|
vcipher $out1,$out1,v29
|
|
vcipher $out2,$out2,v29
|
|
vcipher $out3,$out3,v29
|
|
vcipher $out4,$out4,v29
|
|
lvx v25,$x10,$key_ # re-pre-load round[2]
|
|
vxor $in4,$twk4,v31
|
|
|
|
vcipher $out0,$out0,v30
|
|
vperm $in0,$in0,$in0,$inpperm
|
|
vcipher $out1,$out1,v30
|
|
vcipher $out2,$out2,v30
|
|
vcipher $out3,$out3,v30
|
|
vcipher $out4,$out4,v30
|
|
|
|
vcipherlast $out0,$out0,$twk0
|
|
vcipherlast $out1,$out1,$in1
|
|
vcipherlast $out2,$out2,$in2
|
|
vcipherlast $out3,$out3,$in3
|
|
vcipherlast $out4,$out4,$in4
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x14,0,0,0,0,0
|
|
|
|
.align 5
|
|
_aesp8_xts_decrypt6x:
|
|
$STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
|
|
mflr r11
|
|
li r7,`$FRAME+8*16+15`
|
|
li r3,`$FRAME+8*16+31`
|
|
$PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
|
|
stvx v20,r7,$sp # ABI says so
|
|
addi r7,r7,32
|
|
stvx v21,r3,$sp
|
|
addi r3,r3,32
|
|
stvx v22,r7,$sp
|
|
addi r7,r7,32
|
|
stvx v23,r3,$sp
|
|
addi r3,r3,32
|
|
stvx v24,r7,$sp
|
|
addi r7,r7,32
|
|
stvx v25,r3,$sp
|
|
addi r3,r3,32
|
|
stvx v26,r7,$sp
|
|
addi r7,r7,32
|
|
stvx v27,r3,$sp
|
|
addi r3,r3,32
|
|
stvx v28,r7,$sp
|
|
addi r7,r7,32
|
|
stvx v29,r3,$sp
|
|
addi r3,r3,32
|
|
stvx v30,r7,$sp
|
|
stvx v31,r3,$sp
|
|
li r0,-1
|
|
stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
|
|
li $x10,0x10
|
|
$PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
|
|
li $x20,0x20
|
|
$PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
|
|
li $x30,0x30
|
|
$PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
|
|
li $x40,0x40
|
|
$PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
|
|
li $x50,0x50
|
|
$PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
|
|
li $x60,0x60
|
|
$PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
|
|
li $x70,0x70
|
|
mtspr 256,r0
|
|
|
|
subi $rounds,$rounds,3 # -4 in total
|
|
|
|
lvx $rndkey0,$x00,$key1 # load key schedule
|
|
lvx v30,$x10,$key1
|
|
addi $key1,$key1,0x20
|
|
lvx v31,$x00,$key1
|
|
?vperm $rndkey0,$rndkey0,v30,$keyperm
|
|
addi $key_,$sp,$FRAME+15
|
|
mtctr $rounds
|
|
|
|
Load_xts_dec_key:
|
|
?vperm v24,v30,v31,$keyperm
|
|
lvx v30,$x10,$key1
|
|
addi $key1,$key1,0x20
|
|
stvx v24,$x00,$key_ # off-load round[1]
|
|
?vperm v25,v31,v30,$keyperm
|
|
lvx v31,$x00,$key1
|
|
stvx v25,$x10,$key_ # off-load round[2]
|
|
addi $key_,$key_,0x20
|
|
bdnz Load_xts_dec_key
|
|
|
|
lvx v26,$x10,$key1
|
|
?vperm v24,v30,v31,$keyperm
|
|
lvx v27,$x20,$key1
|
|
stvx v24,$x00,$key_ # off-load round[3]
|
|
?vperm v25,v31,v26,$keyperm
|
|
lvx v28,$x30,$key1
|
|
stvx v25,$x10,$key_ # off-load round[4]
|
|
addi $key_,$sp,$FRAME+15 # rewind $key_
|
|
?vperm v26,v26,v27,$keyperm
|
|
lvx v29,$x40,$key1
|
|
?vperm v27,v27,v28,$keyperm
|
|
lvx v30,$x50,$key1
|
|
?vperm v28,v28,v29,$keyperm
|
|
lvx v31,$x60,$key1
|
|
?vperm v29,v29,v30,$keyperm
|
|
lvx $twk5,$x70,$key1 # borrow $twk5
|
|
?vperm v30,v30,v31,$keyperm
|
|
lvx v24,$x00,$key_ # pre-load round[1]
|
|
?vperm v31,v31,$twk5,$keyperm
|
|
lvx v25,$x10,$key_ # pre-load round[2]
|
|
|
|
vperm $in0,$inout,$inptail,$inpperm
|
|
subi $inp,$inp,31 # undo "caller"
|
|
vxor $twk0,$tweak,$rndkey0
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $out0,$in0,$twk0
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
lvx_u $in1,$x10,$inp
|
|
vxor $twk1,$tweak,$rndkey0
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
le?vperm $in1,$in1,$in1,$leperm
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $out1,$in1,$twk1
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
lvx_u $in2,$x20,$inp
|
|
andi. $taillen,$len,15
|
|
vxor $twk2,$tweak,$rndkey0
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
le?vperm $in2,$in2,$in2,$leperm
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $out2,$in2,$twk2
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
lvx_u $in3,$x30,$inp
|
|
sub $len,$len,$taillen
|
|
vxor $twk3,$tweak,$rndkey0
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
le?vperm $in3,$in3,$in3,$leperm
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $out3,$in3,$twk3
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
lvx_u $in4,$x40,$inp
|
|
subi $len,$len,0x60
|
|
vxor $twk4,$tweak,$rndkey0
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
le?vperm $in4,$in4,$in4,$leperm
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $out4,$in4,$twk4
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
lvx_u $in5,$x50,$inp
|
|
addi $inp,$inp,0x60
|
|
vxor $twk5,$tweak,$rndkey0
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
le?vperm $in5,$in5,$in5,$leperm
|
|
vand $tmp,$tmp,$eighty7
|
|
vxor $out5,$in5,$twk5
|
|
vxor $tweak,$tweak,$tmp
|
|
|
|
vxor v31,v31,$rndkey0
|
|
mtctr $rounds
|
|
b Loop_xts_dec6x
|
|
|
|
.align 5
|
|
Loop_xts_dec6x:
|
|
vncipher $out0,$out0,v24
|
|
vncipher $out1,$out1,v24
|
|
vncipher $out2,$out2,v24
|
|
vncipher $out3,$out3,v24
|
|
vncipher $out4,$out4,v24
|
|
vncipher $out5,$out5,v24
|
|
lvx v24,$x20,$key_ # round[3]
|
|
addi $key_,$key_,0x20
|
|
|
|
vncipher $out0,$out0,v25
|
|
vncipher $out1,$out1,v25
|
|
vncipher $out2,$out2,v25
|
|
vncipher $out3,$out3,v25
|
|
vncipher $out4,$out4,v25
|
|
vncipher $out5,$out5,v25
|
|
lvx v25,$x10,$key_ # round[4]
|
|
bdnz Loop_xts_dec6x
|
|
|
|
subic $len,$len,96 # $len-=96
|
|
vxor $in0,$twk0,v31 # xor with last round key
|
|
vncipher $out0,$out0,v24
|
|
vncipher $out1,$out1,v24
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vxor $twk0,$tweak,$rndkey0
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vncipher $out2,$out2,v24
|
|
vncipher $out3,$out3,v24
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vncipher $out4,$out4,v24
|
|
vncipher $out5,$out5,v24
|
|
|
|
subfe. r0,r0,r0 # borrow?-1:0
|
|
vand $tmp,$tmp,$eighty7
|
|
vncipher $out0,$out0,v25
|
|
vncipher $out1,$out1,v25
|
|
vxor $tweak,$tweak,$tmp
|
|
vncipher $out2,$out2,v25
|
|
vncipher $out3,$out3,v25
|
|
vxor $in1,$twk1,v31
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vxor $twk1,$tweak,$rndkey0
|
|
vncipher $out4,$out4,v25
|
|
vncipher $out5,$out5,v25
|
|
|
|
and r0,r0,$len
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vncipher $out0,$out0,v26
|
|
vncipher $out1,$out1,v26
|
|
vand $tmp,$tmp,$eighty7
|
|
vncipher $out2,$out2,v26
|
|
vncipher $out3,$out3,v26
|
|
vxor $tweak,$tweak,$tmp
|
|
vncipher $out4,$out4,v26
|
|
vncipher $out5,$out5,v26
|
|
|
|
add $inp,$inp,r0 # $inp is adjusted in such
|
|
# way that at exit from the
|
|
# loop inX-in5 are loaded
|
|
# with last "words"
|
|
vxor $in2,$twk2,v31
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vxor $twk2,$tweak,$rndkey0
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vncipher $out0,$out0,v27
|
|
vncipher $out1,$out1,v27
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vncipher $out2,$out2,v27
|
|
vncipher $out3,$out3,v27
|
|
vand $tmp,$tmp,$eighty7
|
|
vncipher $out4,$out4,v27
|
|
vncipher $out5,$out5,v27
|
|
|
|
addi $key_,$sp,$FRAME+15 # rewind $key_
|
|
vxor $tweak,$tweak,$tmp
|
|
vncipher $out0,$out0,v28
|
|
vncipher $out1,$out1,v28
|
|
vxor $in3,$twk3,v31
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vxor $twk3,$tweak,$rndkey0
|
|
vncipher $out2,$out2,v28
|
|
vncipher $out3,$out3,v28
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vncipher $out4,$out4,v28
|
|
vncipher $out5,$out5,v28
|
|
lvx v24,$x00,$key_ # re-pre-load round[1]
|
|
vand $tmp,$tmp,$eighty7
|
|
|
|
vncipher $out0,$out0,v29
|
|
vncipher $out1,$out1,v29
|
|
vxor $tweak,$tweak,$tmp
|
|
vncipher $out2,$out2,v29
|
|
vncipher $out3,$out3,v29
|
|
vxor $in4,$twk4,v31
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vxor $twk4,$tweak,$rndkey0
|
|
vncipher $out4,$out4,v29
|
|
vncipher $out5,$out5,v29
|
|
lvx v25,$x10,$key_ # re-pre-load round[2]
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
|
|
vncipher $out0,$out0,v30
|
|
vncipher $out1,$out1,v30
|
|
vand $tmp,$tmp,$eighty7
|
|
vncipher $out2,$out2,v30
|
|
vncipher $out3,$out3,v30
|
|
vxor $tweak,$tweak,$tmp
|
|
vncipher $out4,$out4,v30
|
|
vncipher $out5,$out5,v30
|
|
vxor $in5,$twk5,v31
|
|
vsrab $tmp,$tweak,$seven # next tweak value
|
|
vxor $twk5,$tweak,$rndkey0
|
|
|
|
vncipherlast $out0,$out0,$in0
|
|
lvx_u $in0,$x00,$inp # load next input block
|
|
vaddubm $tweak,$tweak,$tweak
|
|
vsldoi $tmp,$tmp,$tmp,15
|
|
vncipherlast $out1,$out1,$in1
|
|
lvx_u $in1,$x10,$inp
|
|
vncipherlast $out2,$out2,$in2
|
|
le?vperm $in0,$in0,$in0,$leperm
|
|
lvx_u $in2,$x20,$inp
|
|
vand $tmp,$tmp,$eighty7
|
|
vncipherlast $out3,$out3,$in3
|
|
le?vperm $in1,$in1,$in1,$leperm
|
|
lvx_u $in3,$x30,$inp
|
|
vncipherlast $out4,$out4,$in4
|
|
le?vperm $in2,$in2,$in2,$leperm
|
|
lvx_u $in4,$x40,$inp
|
|
vxor $tweak,$tweak,$tmp
|
|
vncipherlast $out5,$out5,$in5
|
|
le?vperm $in3,$in3,$in3,$leperm
|
|
lvx_u $in5,$x50,$inp
|
|
addi $inp,$inp,0x60
|
|
le?vperm $in4,$in4,$in4,$leperm
|
|
le?vperm $in5,$in5,$in5,$leperm
|
|
|
|
le?vperm $out0,$out0,$out0,$leperm
|
|
le?vperm $out1,$out1,$out1,$leperm
|
|
stvx_u $out0,$x00,$out # store output
|
|
vxor $out0,$in0,$twk0
|
|
le?vperm $out2,$out2,$out2,$leperm
|
|
stvx_u $out1,$x10,$out
|
|
vxor $out1,$in1,$twk1
|
|
le?vperm $out3,$out3,$out3,$leperm
|
|
stvx_u $out2,$x20,$out
|
|
vxor $out2,$in2,$twk2
|
|
le?vperm $out4,$out4,$out4,$leperm
|
|
stvx_u $out3,$x30,$out
|
|
vxor $out3,$in3,$twk3
|
|
le?vperm $out5,$out5,$out5,$leperm
|
|
stvx_u $out4,$x40,$out
|
|
vxor $out4,$in4,$twk4
|
|
stvx_u $out5,$x50,$out
|
|
vxor $out5,$in5,$twk5
|
|
addi $out,$out,0x60
|
|
|
|
mtctr $rounds
|
|
beq Loop_xts_dec6x # did $len-=96 borrow?
|
|
|
|
addic. $len,$len,0x60
|
|
beq Lxts_dec6x_zero
|
|
cmpwi $len,0x20
|
|
blt Lxts_dec6x_one
|
|
nop
|
|
beq Lxts_dec6x_two
|
|
cmpwi $len,0x40
|
|
blt Lxts_dec6x_three
|
|
nop
|
|
beq Lxts_dec6x_four
|
|
|
|
Lxts_dec6x_five:
|
|
vxor $out0,$in1,$twk0
|
|
vxor $out1,$in2,$twk1
|
|
vxor $out2,$in3,$twk2
|
|
vxor $out3,$in4,$twk3
|
|
vxor $out4,$in5,$twk4
|
|
|
|
bl _aesp8_xts_dec5x
|
|
|
|
le?vperm $out0,$out0,$out0,$leperm
|
|
vmr $twk0,$twk5 # unused tweak
|
|
vxor $twk1,$tweak,$rndkey0
|
|
le?vperm $out1,$out1,$out1,$leperm
|
|
stvx_u $out0,$x00,$out # store output
|
|
vxor $out0,$in0,$twk1
|
|
le?vperm $out2,$out2,$out2,$leperm
|
|
stvx_u $out1,$x10,$out
|
|
le?vperm $out3,$out3,$out3,$leperm
|
|
stvx_u $out2,$x20,$out
|
|
le?vperm $out4,$out4,$out4,$leperm
|
|
stvx_u $out3,$x30,$out
|
|
stvx_u $out4,$x40,$out
|
|
addi $out,$out,0x50
|
|
bne Lxts_dec6x_steal
|
|
b Lxts_dec6x_done
|
|
|
|
.align 4
|
|
Lxts_dec6x_four:
|
|
vxor $out0,$in2,$twk0
|
|
vxor $out1,$in3,$twk1
|
|
vxor $out2,$in4,$twk2
|
|
vxor $out3,$in5,$twk3
|
|
vxor $out4,$out4,$out4
|
|
|
|
bl _aesp8_xts_dec5x
|
|
|
|
le?vperm $out0,$out0,$out0,$leperm
|
|
vmr $twk0,$twk4 # unused tweak
|
|
vmr $twk1,$twk5
|
|
le?vperm $out1,$out1,$out1,$leperm
|
|
stvx_u $out0,$x00,$out # store output
|
|
vxor $out0,$in0,$twk5
|
|
le?vperm $out2,$out2,$out2,$leperm
|
|
stvx_u $out1,$x10,$out
|
|
le?vperm $out3,$out3,$out3,$leperm
|
|
stvx_u $out2,$x20,$out
|
|
stvx_u $out3,$x30,$out
|
|
addi $out,$out,0x40
|
|
bne Lxts_dec6x_steal
|
|
b Lxts_dec6x_done
|
|
|
|
.align 4
|
|
Lxts_dec6x_three:
|
|
vxor $out0,$in3,$twk0
|
|
vxor $out1,$in4,$twk1
|
|
vxor $out2,$in5,$twk2
|
|
vxor $out3,$out3,$out3
|
|
vxor $out4,$out4,$out4
|
|
|
|
bl _aesp8_xts_dec5x
|
|
|
|
le?vperm $out0,$out0,$out0,$leperm
|
|
vmr $twk0,$twk3 # unused tweak
|
|
vmr $twk1,$twk4
|
|
le?vperm $out1,$out1,$out1,$leperm
|
|
stvx_u $out0,$x00,$out # store output
|
|
vxor $out0,$in0,$twk4
|
|
le?vperm $out2,$out2,$out2,$leperm
|
|
stvx_u $out1,$x10,$out
|
|
stvx_u $out2,$x20,$out
|
|
addi $out,$out,0x30
|
|
bne Lxts_dec6x_steal
|
|
b Lxts_dec6x_done
|
|
|
|
.align 4
|
|
Lxts_dec6x_two:
|
|
vxor $out0,$in4,$twk0
|
|
vxor $out1,$in5,$twk1
|
|
vxor $out2,$out2,$out2
|
|
vxor $out3,$out3,$out3
|
|
vxor $out4,$out4,$out4
|
|
|
|
bl _aesp8_xts_dec5x
|
|
|
|
le?vperm $out0,$out0,$out0,$leperm
|
|
vmr $twk0,$twk2 # unused tweak
|
|
vmr $twk1,$twk3
|
|
le?vperm $out1,$out1,$out1,$leperm
|
|
stvx_u $out0,$x00,$out # store output
|
|
vxor $out0,$in0,$twk3
|
|
stvx_u $out1,$x10,$out
|
|
addi $out,$out,0x20
|
|
bne Lxts_dec6x_steal
|
|
b Lxts_dec6x_done
|
|
|
|
.align 4
|
|
Lxts_dec6x_one:
|
|
vxor $out0,$in5,$twk0
|
|
nop
|
|
Loop_xts_dec1x:
|
|
vncipher $out0,$out0,v24
|
|
lvx v24,$x20,$key_ # round[3]
|
|
addi $key_,$key_,0x20
|
|
|
|
vncipher $out0,$out0,v25
|
|
lvx v25,$x10,$key_ # round[4]
|
|
bdnz Loop_xts_dec1x
|
|
|
|
subi r0,$taillen,1
|
|
vncipher $out0,$out0,v24
|
|
|
|
andi. r0,r0,16
|
|
cmpwi $taillen,0
|
|
vncipher $out0,$out0,v25
|
|
|
|
sub $inp,$inp,r0
|
|
vncipher $out0,$out0,v26
|
|
|
|
lvx_u $in0,0,$inp
|
|
vncipher $out0,$out0,v27
|
|
|
|
addi $key_,$sp,$FRAME+15 # rewind $key_
|
|
vncipher $out0,$out0,v28
|
|
lvx v24,$x00,$key_ # re-pre-load round[1]
|
|
|
|
vncipher $out0,$out0,v29
|
|
lvx v25,$x10,$key_ # re-pre-load round[2]
|
|
vxor $twk0,$twk0,v31
|
|
|
|
le?vperm $in0,$in0,$in0,$leperm
|
|
vncipher $out0,$out0,v30
|
|
|
|
mtctr $rounds
|
|
vncipherlast $out0,$out0,$twk0
|
|
|
|
vmr $twk0,$twk1 # unused tweak
|
|
vmr $twk1,$twk2
|
|
le?vperm $out0,$out0,$out0,$leperm
|
|
stvx_u $out0,$x00,$out # store output
|
|
addi $out,$out,0x10
|
|
vxor $out0,$in0,$twk2
|
|
bne Lxts_dec6x_steal
|
|
b Lxts_dec6x_done
|
|
|
|
.align 4
|
|
Lxts_dec6x_zero:
|
|
cmpwi $taillen,0
|
|
beq Lxts_dec6x_done
|
|
|
|
lvx_u $in0,0,$inp
|
|
le?vperm $in0,$in0,$in0,$leperm
|
|
vxor $out0,$in0,$twk1
|
|
Lxts_dec6x_steal:
|
|
vncipher $out0,$out0,v24
|
|
lvx v24,$x20,$key_ # round[3]
|
|
addi $key_,$key_,0x20
|
|
|
|
vncipher $out0,$out0,v25
|
|
lvx v25,$x10,$key_ # round[4]
|
|
bdnz Lxts_dec6x_steal
|
|
|
|
add $inp,$inp,$taillen
|
|
vncipher $out0,$out0,v24
|
|
|
|
cmpwi $taillen,0
|
|
vncipher $out0,$out0,v25
|
|
|
|
lvx_u $in0,0,$inp
|
|
vncipher $out0,$out0,v26
|
|
|
|
lvsr $inpperm,0,$taillen # $in5 is no more
|
|
vncipher $out0,$out0,v27
|
|
|
|
addi $key_,$sp,$FRAME+15 # rewind $key_
|
|
vncipher $out0,$out0,v28
|
|
lvx v24,$x00,$key_ # re-pre-load round[1]
|
|
|
|
vncipher $out0,$out0,v29
|
|
lvx v25,$x10,$key_ # re-pre-load round[2]
|
|
vxor $twk1,$twk1,v31
|
|
|
|
le?vperm $in0,$in0,$in0,$leperm
|
|
vncipher $out0,$out0,v30
|
|
|
|
vperm $in0,$in0,$in0,$inpperm
|
|
vncipherlast $tmp,$out0,$twk1
|
|
|
|
le?vperm $out0,$tmp,$tmp,$leperm
|
|
le?stvx_u $out0,0,$out
|
|
be?stvx_u $tmp,0,$out
|
|
|
|
vxor $out0,$out0,$out0
|
|
vspltisb $out1,-1
|
|
vperm $out0,$out0,$out1,$inpperm
|
|
vsel $out0,$in0,$tmp,$out0
|
|
vxor $out0,$out0,$twk0
|
|
|
|
subi r30,$out,1
|
|
mtctr $taillen
|
|
Loop_xts_dec6x_steal:
|
|
lbzu r0,1(r30)
|
|
stb r0,16(r30)
|
|
bdnz Loop_xts_dec6x_steal
|
|
|
|
li $taillen,0
|
|
mtctr $rounds
|
|
b Loop_xts_dec1x # one more time...
|
|
|
|
.align 4
|
|
Lxts_dec6x_done:
|
|
${UCMP}i $ivp,0
|
|
beq Lxts_dec6x_ret
|
|
|
|
vxor $tweak,$twk0,$rndkey0
|
|
le?vperm $tweak,$tweak,$tweak,$leperm
|
|
stvx_u $tweak,0,$ivp
|
|
|
|
Lxts_dec6x_ret:
|
|
mtlr r11
|
|
li r10,`$FRAME+15`
|
|
li r11,`$FRAME+31`
|
|
stvx $seven,r10,$sp # wipe copies of round keys
|
|
addi r10,r10,32
|
|
stvx $seven,r11,$sp
|
|
addi r11,r11,32
|
|
stvx $seven,r10,$sp
|
|
addi r10,r10,32
|
|
stvx $seven,r11,$sp
|
|
addi r11,r11,32
|
|
stvx $seven,r10,$sp
|
|
addi r10,r10,32
|
|
stvx $seven,r11,$sp
|
|
addi r11,r11,32
|
|
stvx $seven,r10,$sp
|
|
addi r10,r10,32
|
|
stvx $seven,r11,$sp
|
|
addi r11,r11,32
|
|
|
|
mtspr 256,$vrsave
|
|
lvx v20,r10,$sp # ABI says so
|
|
addi r10,r10,32
|
|
lvx v21,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v22,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v23,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v24,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v25,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v26,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v27,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v28,r10,$sp
|
|
addi r10,r10,32
|
|
lvx v29,r11,$sp
|
|
addi r11,r11,32
|
|
lvx v30,r10,$sp
|
|
lvx v31,r11,$sp
|
|
$POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
|
|
$POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
|
|
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
|
|
$POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
|
|
$POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
|
|
$POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
|
|
addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x04,1,0x80,6,6,0
|
|
.long 0
|
|
|
|
.align 5
|
|
_aesp8_xts_dec5x:
|
|
vncipher $out0,$out0,v24
|
|
vncipher $out1,$out1,v24
|
|
vncipher $out2,$out2,v24
|
|
vncipher $out3,$out3,v24
|
|
vncipher $out4,$out4,v24
|
|
lvx v24,$x20,$key_ # round[3]
|
|
addi $key_,$key_,0x20
|
|
|
|
vncipher $out0,$out0,v25
|
|
vncipher $out1,$out1,v25
|
|
vncipher $out2,$out2,v25
|
|
vncipher $out3,$out3,v25
|
|
vncipher $out4,$out4,v25
|
|
lvx v25,$x10,$key_ # round[4]
|
|
bdnz _aesp8_xts_dec5x
|
|
|
|
subi r0,$taillen,1
|
|
vncipher $out0,$out0,v24
|
|
vncipher $out1,$out1,v24
|
|
vncipher $out2,$out2,v24
|
|
vncipher $out3,$out3,v24
|
|
vncipher $out4,$out4,v24
|
|
|
|
andi. r0,r0,16
|
|
cmpwi $taillen,0
|
|
vncipher $out0,$out0,v25
|
|
vncipher $out1,$out1,v25
|
|
vncipher $out2,$out2,v25
|
|
vncipher $out3,$out3,v25
|
|
vncipher $out4,$out4,v25
|
|
vxor $twk0,$twk0,v31
|
|
|
|
sub $inp,$inp,r0
|
|
vncipher $out0,$out0,v26
|
|
vncipher $out1,$out1,v26
|
|
vncipher $out2,$out2,v26
|
|
vncipher $out3,$out3,v26
|
|
vncipher $out4,$out4,v26
|
|
vxor $in1,$twk1,v31
|
|
|
|
vncipher $out0,$out0,v27
|
|
lvx_u $in0,0,$inp
|
|
vncipher $out1,$out1,v27
|
|
vncipher $out2,$out2,v27
|
|
vncipher $out3,$out3,v27
|
|
vncipher $out4,$out4,v27
|
|
vxor $in2,$twk2,v31
|
|
|
|
addi $key_,$sp,$FRAME+15 # rewind $key_
|
|
vncipher $out0,$out0,v28
|
|
vncipher $out1,$out1,v28
|
|
vncipher $out2,$out2,v28
|
|
vncipher $out3,$out3,v28
|
|
vncipher $out4,$out4,v28
|
|
lvx v24,$x00,$key_ # re-pre-load round[1]
|
|
vxor $in3,$twk3,v31
|
|
|
|
vncipher $out0,$out0,v29
|
|
le?vperm $in0,$in0,$in0,$leperm
|
|
vncipher $out1,$out1,v29
|
|
vncipher $out2,$out2,v29
|
|
vncipher $out3,$out3,v29
|
|
vncipher $out4,$out4,v29
|
|
lvx v25,$x10,$key_ # re-pre-load round[2]
|
|
vxor $in4,$twk4,v31
|
|
|
|
vncipher $out0,$out0,v30
|
|
vncipher $out1,$out1,v30
|
|
vncipher $out2,$out2,v30
|
|
vncipher $out3,$out3,v30
|
|
vncipher $out4,$out4,v30
|
|
|
|
vncipherlast $out0,$out0,$twk0
|
|
vncipherlast $out1,$out1,$in1
|
|
vncipherlast $out2,$out2,$in2
|
|
vncipherlast $out3,$out3,$in3
|
|
vncipherlast $out4,$out4,$in4
|
|
mtctr $rounds
|
|
blr
|
|
.long 0
|
|
.byte 0,12,0x14,0,0,0,0,0
|
|
___
|
|
}} }}}
|
|
|
|
my $consts=1;
|
|
foreach(split("\n",$code)) {
|
|
s/\`([^\`]*)\`/eval($1)/geo;
|
|
|
|
# constants table endian-specific conversion
|
|
if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
|
|
my $conv=$3;
|
|
my @bytes=();
|
|
|
|
# convert to endian-agnostic format
|
|
if ($1 eq "long") {
|
|
foreach (split(/,\s*/,$2)) {
|
|
my $l = /^0/?oct:int;
|
|
push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
|
|
}
|
|
} else {
|
|
@bytes = map(/^0/?oct:int,split(/,\s*/,$2));
|
|
}
|
|
|
|
# little-endian conversion
|
|
if ($flavour =~ /le$/o) {
|
|
SWITCH: for($conv) {
|
|
/\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
|
|
/\?rev/ && do { @bytes=reverse(@bytes); last; };
|
|
}
|
|
}
|
|
|
|
#emit
|
|
print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
|
|
next;
|
|
}
|
|
$consts=0 if (m/Lconsts:/o); # end of table
|
|
|
|
# instructions prefixed with '?' are endian-specific and need
|
|
# to be adjusted accordingly...
|
|
if ($flavour =~ /le$/o) { # little-endian
|
|
s/le\?//o or
|
|
s/be\?/#be#/o or
|
|
s/\?lvsr/lvsl/o or
|
|
s/\?lvsl/lvsr/o or
|
|
s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
|
|
s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
|
|
s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
|
|
} else { # big-endian
|
|
s/le\?/#le#/o or
|
|
s/be\?//o or
|
|
s/\?([a-z]+)/$1/o;
|
|
}
|
|
|
|
print $_,"\n";
|
|
}
|
|
|
|
close STDOUT;
|