Optimize ChaCha20 on AArch64 with SVE2

This patch improves the existing ChaCha20 SVE implementation by using
SVE2, an optional architecture feature of AArch64, whose XAR
(exclusive-OR and rotate) instruction combines the XOR and rotate steps
of the quarter round into a single operation, improving ChaCha20
performance.
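
For reference, the scalar shape of the ChaCha20 quarter round is sketched below in C (illustrative only, not part of the patch; rotl32 and chacha_qr are names invented here). Each "xor, then rotate left by n" pair is what one XAR instruction computes across all vector lanes: XAR XORs two vectors and rotates each element right by an immediate, and on 32-bit lanes a right rotation by 32 - n equals a left rotation by n, so the SVE path's separate EOR and rotate steps collapse into one instruction.

    #include <stdint.h>

    /* Rotate a 32-bit word left by n bits (0 < n < 32). */
    static inline uint32_t rotl32(uint32_t x, unsigned n)
    {
        return (x << n) | (x >> (32 - n));
    }

    /*
     * One ChaCha20 quarter round. On the SVE2 path of this patch, the
     * "d ^= a; d = rotl32(d, 16)" and "d ^= a; d = rotl32(d, 8)" pairs
     * are each emitted as a single XAR instruction (the perlasm helper
     * is invoked as SVE2_XAR(16, ...) and SVE2_XAR(8, ...) with the
     * left-rotate amount).
     */
    void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
        *a += *b; *d ^= *a; *d = rotl32(*d, 16); /* XAR on SVE2 */
        *c += *d; *b ^= *c; *b = rotl32(*b, 12);
        *a += *b; *d ^= *a; *d = rotl32(*d, 8);  /* XAR on SVE2 */
        *c += *d; *b ^= *c; *b = rotl32(*b, 7);
    }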

Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/18522)
Daniel Hu 2022-05-25 10:23:40 +01:00 committed by Pauli
parent b147b9daf1
commit bcb52bcc9f
1 changed file with 237 additions and 164 deletions

@@ -31,17 +31,25 @@ sub AUTOLOAD() # thunk [simplified] x86-style perlasm
 }
 my ($outp,$inp,$len,$key,$ctr) = map("x$_",(0..4));
-my ($state) = ("x5");
-my ($veclen_w,$veclen,$blocks) = ("w6","x6","x7");
-my ($saved_outp) = ("x8");
-my ($wctr, $xctr) = ("w9", "x9");
-my @mx=map("z$_",(0..7,16..23));
+my ($veclen_w,$veclen,$blocks) = ("w5","x5","x6");
+my ($sve2flag) = ("x7");
+my ($wctr, $xctr) = ("w8", "x8");
+my ($tmpw0,$tmp0,$tmpw1,$tmp1) = ("w9","x9", "w10","x10");
+my ($tmp,$tmpw) = ("x10", "w10");
+my ($counter) = ("x11");
+my @K=map("x$_",(12..15,19..22));
+my @KL=map("w$_",(12..15,19..22));
+my @mx=map("z$_",(0..15));
 my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
     $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @mx;
-my @xt=map("z$_",(24..31,8..11));
-my ($rot8) = ("z12");
-my ($zctr) = ("z13");
-my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7,$xt8,$xt9,$xt10,$xt11)=@xt;
+my ($zctr) = ("z16");
+my @xt=map("z$_",(17..24));
+my @perm=map("z$_",(25..30));
+my ($rot8) = ("z31");
+my ($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7)=@xt;
+# in SVE mode we can only use bak0 ~ bak9 (the rest are used as scratch registers)
+# in SVE2 we use all 15 backup registers
+my ($bak0,$bak1,$bak2,$bak3,$bak4,$bak5,$bak6,$bak7,$bak8,$bak9,$bak10,$bak11,$bak13,$bak14,$bak15)=(@perm[0],@perm[1],@perm[2],@perm[3],@perm[4],@perm[5],$xt4,$xt5,$xt6,$xt7,$xt0,$xt1,$xt2,$xt3,$rot8);
 my $debug_encoder=0;

 sub SVE_ADD() {
@@ -148,8 +156,12 @@ sub SVE_QR_GROUP() {
     my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$a3,$b3,$c3,$d3) = @_;

     &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
-    &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
-    &SVE_REV16($d0,$d1,$d2,$d3);
+    if ($have_sve2 == 0) {
+        &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+        &SVE_REV16($d0,$d1,$d2,$d3);
+    } else {
+        &SVE2_XAR(16,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+    }
     &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
     if ($have_sve2 == 0) {
@@ -162,8 +174,12 @@ sub SVE_QR_GROUP() {
     }

     &SVE_ADD($a0,$b0,$a1,$b1,$a2,$b2,$a3,$b3);
-    &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
-    &SVE_ROT8($d0,$d1,$d2,$d3);
+    if ($have_sve2 == 0) {
+        &SVE_EOR($d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+        &SVE_ROT8($d0,$d1,$d2,$d3);
+    } else {
+        &SVE2_XAR(8,$d0,$a0,$d1,$a1,$d2,$a2,$d3,$a3);
+    }
     &SVE_ADD($c0,$d0,$c1,$d1,$c2,$d2,$c3,$d3);
     if ($have_sve2 == 0) {
@@ -178,26 +194,31 @@ sub SVE_QR_GROUP() {
 sub SVE_INNER_BLOCK() {
     $code.=<<___;
-    //cbnz $sve2flag, 10f
+    mov $counter,#10
+1:
+.align 5
 ___
     &SVE_QR_GROUP(0,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
     &SVE_QR_GROUP(0,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
     $code.=<<___;
-    // SVE 2 not enabled until hardware available
-#if 0
-    b 11f
-10:
-___
-#    &SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
-#    &SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
-$code.=<<___;
-11:
-#endif
+    subs $counter,$counter,1
+    b.ne 1b
 ___
 }

-{{{
-    my ($dlen,$rsize,$tmp) = ("x10","x11","x12");
+sub SVE2_INNER_BLOCK() {
+    $code.=<<___;
+    mov $counter,#10
+1:
+.align 5
+___
+    &SVE_QR_GROUP(1,0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15);
+    &SVE_QR_GROUP(1,0,5,10,15,1,6,11,12,2,7,8,13,3,4,9,14);
+    $code.=<<___;
+    subs $counter,$counter,1
+    b.ne 1b
+___
+}

 sub load() {
     my $x0 = shift;
@@ -252,72 +273,75 @@ sub transpose() {
     my $xd = shift;
     $code.=<<___;
-    zip1 $xt8.s,$xa.s,$xb.s
-    zip2 $xt9.s,$xa.s,$xb.s
-    zip1 $xt10.s,$xc.s,$xd.s
-    zip2 $xt11.s,$xc.s,$xd.s
-    zip1 $xa.d,$xt8.d,$xt10.d
-    zip2 $xb.d,$xt8.d,$xt10.d
-    zip1 $xc.d,$xt9.d,$xt11.d
-    zip2 $xd.d,$xt9.d,$xt11.d
+    zip1 $xt0.s,$xa.s,$xb.s
+    zip2 $xt1.s,$xa.s,$xb.s
+    zip1 $xt2.s,$xc.s,$xd.s
+    zip2 $xt3.s,$xc.s,$xd.s
+    zip1 $xa.d,$xt0.d,$xt2.d
+    zip2 $xb.d,$xt0.d,$xt2.d
+    zip1 $xc.d,$xt1.d,$xt3.d
+    zip2 $xd.d,$xt1.d,$xt3.d
 ___
 }

-sub add_states() {
-    my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
+sub SVE_ADD_STATES() {
     $code.=<<___;
-    ldp $tmpw0,$tmpw1,[$state]
-    ldp $tmpw2,$tmpw3,[$state,#8]
-    dup $xt0.s,$tmpw0
-    dup $xt1.s,$tmpw1
-    dup $xt2.s,$tmpw2
-    dup $xt3.s,$tmpw3
-    ldp $tmpw0,$tmpw1,[$state,#16]
-    ldp $tmpw2,$tmpw3,[$state,#24]
-    add @mx[0].s,@mx[0].s,$xt0.s
-    add @mx[1].s,@mx[1].s,$xt1.s
-    add @mx[2].s,@mx[2].s,$xt2.s
-    add @mx[3].s,@mx[3].s,$xt3.s
-    dup $xt4.s,$tmpw0
-    dup $xt5.s,$tmpw1
-    dup $xt6.s,$tmpw2
-    dup $xt7.s,$tmpw3
-    ldp $tmpw0,$tmpw1,[$state,#32]
-    ldp $tmpw2,$tmpw3,[$state,#40]
-    add @mx[4].s,@mx[4].s,$xt4.s
-    add @mx[5].s,@mx[5].s,$xt5.s
-    add @mx[6].s,@mx[6].s,$xt6.s
-    add @mx[7].s,@mx[7].s,$xt7.s
-    dup $xt0.s,$tmpw0
-    dup $xt1.s,$tmpw1
-    dup $xt2.s,$tmpw2
-    dup $xt3.s,$tmpw3
-    ldp $tmpw0,$tmpw1,[$state,#48]
-    ldp $tmpw2,$tmpw3,[$state,#56]
-    add @mx[8].s,@mx[8].s,$xt0.s
-    add @mx[9].s,@mx[9].s,$xt1.s
-    add @mx[10].s,@mx[10].s,$xt2.s
-    add @mx[11].s,@mx[11].s,$xt3.s
-    dup $xt5.s,$tmpw1
-    dup $xt6.s,$tmpw2
-    dup $xt7.s,$tmpw3
-    add @mx[12].s,@mx[12].s,$zctr.s
-    add @mx[13].s,@mx[13].s,$xt5.s
-    add @mx[14].s,@mx[14].s,$xt6.s
-    add @mx[15].s,@mx[15].s,$xt7.s
+    lsr $tmp1,@K[5],#32
+    dup $xt0.s,@KL[5]
+    dup $xt1.s,$tmpw1
+    add @mx[0].s,@mx[0].s,$bak0.s
+    add @mx[1].s,@mx[1].s,$bak1.s
+    add @mx[2].s,@mx[2].s,$bak2.s
+    add @mx[3].s,@mx[3].s,$bak3.s
+    add @mx[4].s,@mx[4].s,$bak4.s
+    add @mx[5].s,@mx[5].s,$bak5.s
+    add @mx[6].s,@mx[6].s,$bak6.s
+    add @mx[7].s,@mx[7].s,$bak7.s
+    add @mx[8].s,@mx[8].s,$bak8.s
+    add @mx[9].s,@mx[9].s,$bak9.s
+    lsr $tmp0,@K[6],#32
+    dup $xt4.s,$tmpw0
+    lsr $tmp1,@K[7],#32
+    dup $xt5.s,@KL[7]
+    dup $xt6.s,$tmpw1
+    add @mx[10].s,@mx[10].s,$xt0.s
+    add @mx[11].s,@mx[11].s,$xt1.s
+    add @mx[12].s,@mx[12].s,$zctr.s
+    add @mx[13].s,@mx[13].s,$xt4.s
+    add @mx[14].s,@mx[14].s,$xt5.s
+    add @mx[15].s,@mx[15].s,$xt6.s
 ___
 }

+sub SVE2_ADD_STATES() {
+    $code.=<<___;
+    add @mx[0].s,@mx[0].s,$bak0.s
+    add @mx[1].s,@mx[1].s,$bak1.s
+    add @mx[2].s,@mx[2].s,$bak2.s
+    add @mx[3].s,@mx[3].s,$bak3.s
+    add @mx[4].s,@mx[4].s,$bak4.s
+    add @mx[5].s,@mx[5].s,$bak5.s
+    add @mx[6].s,@mx[6].s,$bak6.s
+    add @mx[7].s,@mx[7].s,$bak7.s
+    add @mx[8].s,@mx[8].s,$bak8.s
+    add @mx[9].s,@mx[9].s,$bak9.s
+    add @mx[10].s,@mx[10].s,$bak10.s
+    add @mx[11].s,@mx[11].s,$bak11.s
+    add @mx[12].s,@mx[12].s,$zctr.s
+    add @mx[13].s,@mx[13].s,$bak13.s
+    add @mx[14].s,@mx[14].s,$bak14.s
+    add @mx[15].s,@mx[15].s,$bak15.s
+___
+}

 sub SVE_TRANSFORMS() {
-    &add_states();
     &transpose($xa0,$xb0,$xc0,$xd0);
     &transpose($xa1,$xb1,$xc1,$xd1);
     &transpose($xa2,$xb2,$xc2,$xd2);
     &transpose($xa3,$xb3,$xc3,$xd3);
-    &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
     &transpose($xa0,$xa1,$xa2,$xa3);
     &transpose($xb0,$xb1,$xb2,$xb3);
+    &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
     $code.=<<___;
     eor $xa0.d,$xa0.d,$xt0.d
     eor $xa1.d,$xa1.d,$xt1.d
@@ -330,8 +354,8 @@ $code.=<<___;
 ___
     &transpose($xc0,$xc1,$xc2,$xc3);
     &store($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3);
-    &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
     &transpose($xd0,$xd1,$xd2,$xd3);
+    &load($xt0,$xt1,$xt2,$xt3,$xt4,$xt5,$xt6,$xt7);
     $code.=<<___;
     eor $xc0.d,$xc0.d,$xt0.d
     eor $xc1.d,$xc1.d,$xt1.d
@@ -348,73 +372,111 @@ $code.=<<___;
     incw $zctr.s, ALL, MUL #1
 ___
 }
-}}}

 sub SVE_LOAD_STATES() {
-    my ($tmpw0,$tmpw1,$tmpw2,$tmpw3) = ("w10","w11","w12","w13");
     $code.=<<___;
-    // FIXME following code are not functionally necessary
-    // but appear to enhance performance
-#if 1
-    ptrues p2.s,ALL
-    ptrues p2.s,ALL
-    ptrues p2.s,ALL
-    ptrues p2.s,ALL
-    ptrues p2.s,ALL
-    ptrues p2.s,ALL
-#endif
+    lsr $tmp0,@K[0],#32
+    dup @mx[0].s,@KL[0]
+    dup $bak0.s,@KL[0]
+    dup @mx[1].s,$tmpw0
+    dup $bak1.s,$tmpw0
+    lsr $tmp1,@K[1],#32
+    dup @mx[2].s,@KL[1]
+    dup $bak2.s,@KL[1]
+    dup @mx[3].s,$tmpw1
+    dup $bak3.s,$tmpw1
+    lsr $tmp0,@K[2],#32
+    dup @mx[4].s,@KL[2]
+    dup $bak4.s,@KL[2]
+    dup @mx[5].s,$tmpw0
+    dup $bak5.s,$tmpw0
+    lsr $tmp1,@K[3],#32
+    dup @mx[6].s,@KL[3]
+    dup $bak6.s,@KL[3]
+    dup @mx[7].s,$tmpw1
+    dup $bak7.s,$tmpw1
+    lsr $tmp0,@K[4],#32
+    dup @mx[8].s,@KL[4]
+    dup $bak8.s,@KL[4]
+    dup @mx[9].s,$tmpw0
+    dup $bak9.s,$tmpw0
+    lsr $tmp1,@K[5],#32
+    dup @mx[10].s,@KL[5]
+    dup @mx[11].s,$tmpw1
+    orr @mx[12].d,$zctr.d,$zctr.d
+    lsr $tmp0,@K[6],#32
+    dup @mx[13].s,$tmpw0
+    lsr $tmp1,@K[7],#32
+    dup @mx[14].s,@KL[7]
+    dup @mx[15].s,$tmpw1
 ___
+}
+
+sub SVE2_LOAD_STATES() {
     $code.=<<___;
-    ldp $tmpw0,$tmpw1,[$state]
-    ldp $tmpw2,$tmpw3,[$state,#8]
-    dup @mx[0].s,$tmpw0
-    dup @mx[1].s,$tmpw1
-    dup @mx[2].s,$tmpw2
-    dup @mx[3].s,$tmpw3
-    ldp $tmpw0,$tmpw1,[$state,#16]
-    ldp $tmpw2,$tmpw3,[$state,#24]
-    dup @mx[4].s,$tmpw0
-    dup @mx[5].s,$tmpw1
-    dup @mx[6].s,$tmpw2
-    dup @mx[7].s,$tmpw3
-    ldp $tmpw0,$tmpw1,[$state,#32]
-    ldp $tmpw2,$tmpw3,[$state,#40]
-    dup @mx[8].s,$tmpw0
-    dup @mx[9].s,$tmpw1
-    dup @mx[10].s,$tmpw2
-    dup @mx[11].s,$tmpw3
-    ldp $tmpw0,$tmpw1,[$state, #48]
-    ldp $tmpw2,$tmpw3,[$state,#56]
-    mov @mx[12].s,p0/m,$zctr.s
-    dup @mx[13].s,$tmpw1
-    dup @mx[14].s,$tmpw2
-    dup @mx[15].s,$tmpw3
+    lsr $tmp0,@K[0],#32
+    dup @mx[0].s,@KL[0]
+    dup $bak0.s,@KL[0]
+    dup @mx[1].s,$tmpw0
+    dup $bak1.s,$tmpw0
+    lsr $tmp1,@K[1],#32
+    dup @mx[2].s,@KL[1]
+    dup $bak2.s,@KL[1]
+    dup @mx[3].s,$tmpw1
+    dup $bak3.s,$tmpw1
+    lsr $tmp0,@K[2],#32
+    dup @mx[4].s,@KL[2]
+    dup $bak4.s,@KL[2]
+    dup @mx[5].s,$tmpw0
+    dup $bak5.s,$tmpw0
+    lsr $tmp1,@K[3],#32
+    dup @mx[6].s,@KL[3]
+    dup $bak6.s,@KL[3]
+    dup @mx[7].s,$tmpw1
+    dup $bak7.s,$tmpw1
+    lsr $tmp0,@K[4],#32
+    dup @mx[8].s,@KL[4]
+    dup $bak8.s,@KL[4]
+    dup @mx[9].s,$tmpw0
+    dup $bak9.s,$tmpw0
+    lsr $tmp1,@K[5],#32
+    dup @mx[10].s,@KL[5]
+    dup $bak10.s,@KL[5]
+    dup @mx[11].s,$tmpw1
+    dup $bak11.s,$tmpw1
+    orr @mx[12].d,$zctr.d,$zctr.d
+    lsr $tmp0,@K[6],#32
+    dup @mx[13].s,$tmpw0
+    dup $bak13.s,$tmpw0
+    lsr $tmp1,@K[7],#32
+    dup @mx[14].s,@KL[7]
+    dup $bak14.s,@KL[7]
+    dup @mx[15].s,$tmpw1
+    dup $bak15.s,$tmpw1
 ___
 }

 sub sve_handle_blocks() {
-    my ($counter) = ("x10");
-    &SVE_LOAD_STATES();
     $code.=<<___;
-    mov $counter,#10
-.align 5
-1:
+    cbz $sve2flag,.sve_inner
 ___
-    &SVE_INNER_BLOCK();
+    &SVE2_LOAD_STATES();
+    &SVE2_INNER_BLOCK();
+    &SVE2_ADD_STATES();
     $code.=<<___;
-    subs $counter,$counter,1
-    b.ne 1b
+    b .fini_inner
+.sve_inner:
+___
+    &SVE_LOAD_STATES();
+    &SVE_INNER_BLOCK();
+    &SVE_ADD_STATES();
+    $code.=<<___;
+.fini_inner:
 ___
     &SVE_TRANSFORMS();
 }

 sub chacha20_process() {
-    my ($counter) = ("x10");
-    my ($tmpw) = ("w11");
     $code.=<<___;
 .align 5
 .Loop:
@@ -430,27 +492,18 @@ ___
 }
 {{{
-    my ($tmp,$tmpw) = ("x10", "w10");
-    my ($tmpw0,$tmpw1) = ("w11", "w12");
-    my ($ptr) = ("x13");
 $code.=<<___;
 #include "arm_arch.h"

 .arch armv8-a

-#if 0
 .extern OPENSSL_armcap_P
 .hidden OPENSSL_armcap_P
-#endif

 .text
 .align 5
 .Lchacha20_consts:
-.word 0x61707865
-.word 0x3320646e
-.word 0x79622d32
-.word 0x6b206574
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
 .Lrot8:
 .word 0x02010003,0x04040404,0x02010003,0x04040404
 .globl ChaCha20_ctr32_sve
@@ -458,49 +511,55 @@ $code.=<<___;
 .align 5
 ChaCha20_ctr32_sve:
     AARCH64_VALID_CALL_TARGET
-    mov $tmp, #64
-    whilelo p0.s,xzr,$tmp
-    cntp $veclen,p0,p0.s
-    // run Neon if we only have 128-bit SVE
-    // in the future, we need to check SVE2
-    cmp $veclen,4
-    b.le .Lreturn
+    cntw $veclen, ALL, MUL #1
     lsr $blocks,$len,#6
     cmp $blocks,$veclen
     b.lt .Lreturn
-    stp d8,d9,[sp,-48]!
-    stp d10,d11,[sp,16]
-    stp d12,d13,[sp,32]
-    sub sp,sp,#64
-    adr $tmp,.Lchacha20_consts
-    ld1 {v0.4s},[$tmp]
-    adr $tmp,.Lrot8
-    ldp $tmpw0,$tmpw1,[$tmp]
-    ld1 {v1.4s,v2.4s},[$key]
-    ld1 {v3.4s},[$ctr]
-    ldr $wctr,[$ctr]
-    index $zctr.s,$wctr,1
-    index $rot8.s,$tmpw0,$tmpw1
-    st1 {v0.4s,v1.4s,v2.4s,v3.4s},[sp]
-    mov $state,sp
-#if 0
-    // SVE2 code not enabled until we have hardware
-    // for verification
     mov $sve2flag,0
     adrp $tmp,OPENSSL_armcap_P
     ldr $tmpw,[$tmp,#:lo12:OPENSSL_armcap_P]
     tst $tmpw,#ARMV8_SVE2
     b.eq 1f
     mov $sve2flag,1
+    b 2f
 1:
+    cmp $veclen,4
+    b.le .Lreturn
+    adr $tmp,.Lrot8
+    ldp $tmpw0,$tmpw1,[$tmp]
+    index $rot8.s,$tmpw0,$tmpw1
+2:
+    stp d8,d9,[sp,-96]!
+    stp d10,d11,[sp,16]
+    stp d12,d13,[sp,32]
+    stp d14,d15,[sp,48]
+    stp x19,x20,[sp,64]
+    stp x21,x22,[sp,80]
+    adr $tmp,.Lchacha20_consts
+    ldp @K[0],@K[1],[$tmp]
+    ldp @K[2],@K[3],[$key]
+    ldp @K[4],@K[5],[$key, 16]
+    ldp @K[6],@K[7],[$ctr]
+    ldr $wctr,[$ctr]
+    index $zctr.s,$wctr,1
+    ptrues p0.s,ALL
+#ifdef __AARCH64EB__
+    ror @K[2],@K[2],#32
+    ror @K[3],@K[3],#32
+    ror @K[4],@K[4],#32
+    ror @K[5],@K[5],#32
+    ror @K[6],@K[6],#32
+    ror @K[7],@K[7],#32
 #endif
 ___
 &chacha20_process();
 $code.=<<___;
-    add sp,sp,#64
     ldp d10,d11,[sp,16]
     ldp d12,d13,[sp,32]
-    ldp d8,d9,[sp],48
+    ldp d14,d15,[sp,48]
+    ldp x19,x20,[sp,64]
+    ldp x21,x22,[sp,80]
+    ldp d8,d9,[sp],96
     str $wctr,[$ctr]
     and $len,$len,#63
     add $len,$len,$blocks,lsl #6
@@ -514,6 +573,7 @@ ___
 ########################################
 {
 my %opcode_unpred = (
+    "movprfx" => 0x0420BC00,
     "eor" => 0x04a03000,
     "add" => 0x04200000,
     "orr" => 0x04603000,
@@ -528,6 +588,7 @@ my %opcode_unpred = (
     "index" => 0x04204C00,
     "mov" => 0x05203800,
     "dup" => 0x05203800,
+    "cntw" => 0x04A0E000,
     "tbl" => 0x05203000);

 my %opcode_imm_unpred = (
@@ -564,6 +625,7 @@ my %opcode_pred = (
     "st4w" => 0xE570E000,
     "st1w" => 0xE500E000,
     "ld1w" => 0xA540A000,
+    "ld1rw" => 0x8540C000,
     "revh" => 0x05258000);

 my %tsize = (
@@ -740,6 +802,10 @@ sub sve_pred {
     if ($addr =~ m/x([0-9]+)\s*/o) {
         $xn = $1;
     }
+
+    if ($mnemonic =~m/ld1r[bhwd]/o) {
+        $size = 0;
+    }
     if ($addr =~ m/\w+\s*,\s*x([0-9]+),.*/o) {
         return &verify_inst($opcode_scalar_pred{$mnemonic}|($size<<21)|$zt|($pg<<10)|($1<<16)|($xn<<5),$inst);
     } elsif ($addr =~ m/\w+\s*,\s*z([0-9]+)\.s,\s*([US]\w+)/o) {
@@ -810,8 +876,14 @@ sub sve_other {
         } elsif ($arg =~ m/x([0-9]+)/o) {
             return &verify_inst($opcode_unpred{$mnemonic}|$1|(31<<5)|(0<<16), $inst);
         }
+    } elsif ($mnemonic =~ /cnt[bhdw]/) {
+        if ($arg =~ m/x([0-9]+)[^,]*,\s*(\w+)[^,]*,\s*MUL\s*#?([0-9]+)/o) {
+            return &verify_inst($opcode_unpred{$mnemonic}|$1|($pattern{$2}<<5)|(($3 - 1)<<16), $inst);
+        }
     } elsif ($arg =~ m/x([0-9]+)[^,]*,\s*x([0-9]+)[^,]*,\s*#?([0-9]+)/o) {
         return &verify_inst($opcode_pred{$mnemonic}|$1|($2<<16)|($3<<5), $inst);
+    } elsif ($arg =~ m/z([0-9]+)[^,]*,\s*z([0-9]+)/o) {
+        return &verify_inst($opcode_unpred{$mnemonic}|$1|($2<<5), $inst);
     }
     sprintf "%s // fail to parse", $inst;
 }
@@ -834,9 +906,10 @@ foreach(split("\n",$code)) {
     s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*[#zwx]?[0-9]+.*)/sve_unpred($1,$2)/ge;
     s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*\{.*\},\s*z[0-9]+.*)/sve_unpred($1,$2)/ge;
     s/\b(\w+)\s+(z[0-9]+\.[bhsd],\s*p[0-9].*)/sve_pred($1,$2)/ge;
+    s/\b(\w+[1-4]r[bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
     s/\b(\w+[1-4][bhwd])\s+(\{\s*z[0-9]+.*\},\s*p[0-9]+.*)/sve_pred($1,$2)/ge;
     s/\b(\w+)\s+(p[0-9]+\.[bhsd].*)/sve_pred($1,$2)/ge;
-    s/\b(cntp|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
+    s/\b(movprfx|cntp|cnt[bhdw]|addvl|inc[bhdw])\s+((x|z).*)/sve_other($1,$2)/ge;
     print $_,"\n";
 }