Fixed incorrect usage of vshuf.b instruction

The latest revision of the LoongArch64 vector instruction manual states
explicitly that the undefined upper three bits of each control byte of the
vshuf.b instruction must not be used; if they are set, the result is
unpredictable. The existing use of vshuf.b in vpaes-loongarch64.pl therefore
has to be corrected so that it cannot produce wrong results on future
LoongArch64 processors.

Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/21530)
zhuchen 2023-07-24 16:03:29 +08:00 committed by Tomas Mraz
parent 160f48941d
commit 780ce3849f
1 changed file with 70 additions and 69 deletions

@@ -62,14 +62,14 @@ _vpaes_encrypt_core:
     ld.w $t5,$a2,240
     vori.b $vr1,$vr9,0
     la.local $t0,Lk_ipt
     vld $vr2,$t0,0 # iptlo
     vandn.v $vr1,$vr1,$vr0
     vld $vr5,$a5,0 # round0 key
     vsrli.w $vr1,$vr1,4
     vand.v $vr0,$vr0,$vr9
-    vshuf.b $vr2,$vr0,$vr2,$vr0
+    vshuf.b $vr2,$vr18,$vr2,$vr0
     vld $vr0,$t0,16 # ipthi
-    vshuf.b $vr0,$vr1,$vr0,$vr1
+    vshuf.b $vr0,$vr18,$vr0,$vr1
     vxor.v $vr2,$vr2,$vr5
     addi.d $a5,$a5,16
     vxor.v $vr0,$vr0,$vr2
@@ -81,26 +81,26 @@ _vpaes_encrypt_core:
     # middle of middle round
     vori.b $vr4,$vr13,0 # 4 : sb1u
     vori.b $vr0,$vr12,0 # 0 : sb1t
-    vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sb1u
-    vshuf.b $vr0,$vr3,$vr0,$vr3 # 0 = sb1t
+    vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sb1u
+    vshuf.b $vr0,$vr18,$vr0,$vr3 # 0 = sb1t
     vxor.v $vr4,$vr4,$vr5 # 4 = sb1u + k
     vori.b $vr5,$vr15,0 # 4 : sb2u
     vxor.v $vr0,$vr0,$vr4 # 0 = A
     add.d $t0,$a7,$a6 # Lk_mc_forward[]
     vld $vr1,$t0,-0x40
-    vshuf.b $vr5,$vr2,$vr5,$vr2 # 4 = sb2u
+    vshuf.b $vr5,$vr18,$vr5,$vr2 # 4 = sb2u
     vld $vr4,$t0,0 # Lk_mc_backward[]
     vori.b $vr2,$vr14,0 # 2 : sb2t
-    vshuf.b $vr2,$vr3,$vr2,$vr3 # 2 = sb2t
+    vshuf.b $vr2,$vr18,$vr2,$vr3 # 2 = sb2t
     vori.b $vr3,$vr0,0 # 3 = A
     vxor.v $vr2,$vr5,$vr2 # 2 = 2A
-    vshuf.b $vr0,$vr1,$vr0,$vr1 # 0 = B
+    vshuf.b $vr0,$vr18,$vr0,$vr1 # 0 = B
     addi.d $a5,$a5,16 # next key
     vxor.v $vr0,$vr0,$vr2 # 0 = 2A+B
-    vshuf.b $vr3,$vr4,$vr3,$vr4 # 3 = D
+    vshuf.b $vr3,$vr18,$vr3,$vr4 # 3 = D
     addi.d $a7,$a7,16 # next mc
     vxor.v $vr3,$vr3,$vr0 # 3 = 2A+B+D
-    vshuf.b $vr0,$vr1,$vr0,$vr1 # 0 = 2B+C
+    vshuf.b $vr0,$vr18,$vr0,$vr1 # 0 = 2B+C
     andi $a7,$a7,0x30 # ... mod 4
     addi.d $t5,$t5,-1 # nr--
     vxor.v $vr0,$vr0,$vr3 # 0 = 2A+3B+C+D
@@ -112,33 +112,33 @@ _vpaes_encrypt_core:
     vandn.v $vr1,$vr1,$vr0 # 1 = i<<4
     vsrli.w $vr1,$vr1,4 # 1 = i
     vand.v $vr0,$vr0,$vr9 # 0 = k
-    vshuf.b $vr5,$vr0,$vr5,$vr0 # 2 = a/k
+    vshuf.b $vr5,$vr18,$vr5,$vr0 # 2 = a/k
     vori.b $vr3,$vr10,0 # 3 : 1/i
     vxor.v $vr0,$vr0,$vr1 # 0 = j
-    vshuf.b $vr3,$vr1,$vr3,$vr1 # 3 = 1/i
+    vshuf.b $vr3,$vr18,$vr3,$vr1 # 3 = 1/i
     vori.b $vr4,$vr10,0 # 4 : 1/j
     vxor.v $vr3,$vr3,$vr5 # 3 = iak = 1/i + a/k
-    vshuf.b $vr4,$vr0,$vr4,$vr0 # 4 = 1/j
+    vshuf.b $vr4,$vr18,$vr4,$vr0 # 4 = 1/j
     vori.b $vr2,$vr10,0 # 2 : 1/iak
     vxor.v $vr4,$vr4,$vr5 # 4 = jak = 1/j + a/k
-    vshuf.b $vr2,$vr3,$vr2,$vr3 # 2 = 1/iak
+    vshuf.b $vr2,$vr18,$vr2,$vr3 # 2 = 1/iak
     vori.b $vr3,$vr10,0 # 3 : 1/jak
     vxor.v $vr2,$vr2,$vr0 # 2 = io
-    vshuf.b $vr3,$vr4,$vr3,$vr4 # 3 = 1/jak
-    vld $vr5,$a5, 0
+    vshuf.b $vr3,$vr18,$vr3,$vr4 # 3 = 1/jak
+    vld $vr5,$a5,0
     vxor.v $vr3,$vr3,$vr1 # 3 = jo
     bnez $t5,.Lenc_loop
     # middle of last round
     vld $vr4,$a6, -0x60 # 3 : sbou Lk_sbo
     vld $vr0,$a6, -0x50 # 0 : sbot Lk_sbo+16
-    vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbou
+    vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbou
     vxor.v $vr4,$vr4,$vr5 # 4 = sb1u + k
-    vshuf.b $vr0,$vr3,$vr0,$vr3 # 0 = sb1t
+    vshuf.b $vr0,$vr18,$vr0,$vr3 # 0 = sb1t
     add.d $t0,$a7,$a6 # Lk_sr[]
-    vld $vr1,$t0, 0x40
+    vld $vr1,$t0,0x40
     vxor.v $vr0,$vr0,$vr4 # 0 = A
-    vshuf.b $vr0,$vr1,$vr0,$vr1
+    vshuf.b $vr0,$vr18,$vr0,$vr1
     jr $ra
     .cfi_endproc
     .size _vpaes_encrypt_core,.-_vpaes_encrypt_core
@@ -163,11 +163,11 @@ _vpaes_decrypt_core:
     vld $vr5,$a5,0 # round0 key
     slli.d $a7,$a7,4
     vand.v $vr0,$vr9,$vr0
-    vshuf.b $vr2,$vr0,$vr2,$vr0
+    vshuf.b $vr2,$vr18,$vr2,$vr0
     vld $vr0,$t0,16 # ipthi
     xori $a7,$a7,0x30
     la.local $a6,Lk_dsbd
-    vshuf.b $vr0,$vr1,$vr0,$vr1
+    vshuf.b $vr0,$vr18,$vr0,$vr1
     andi $a7,$a7,0x30
     vxor.v $vr2,$vr2,$vr5
     la.local $t0,Lk_mc_forward
@@ -184,29 +184,29 @@ _vpaes_decrypt_core:
     ##
     vld $vr4,$a6,-0x20 # 4 : sb9u
     vld $vr1,$a6,-0x10 # 0 : sb9t
-    vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sb9u
-    vshuf.b $vr1,$vr3,$vr1,$vr3 # 0 = sb9t
+    vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sb9u
+    vshuf.b $vr1,$vr18,$vr1,$vr3 # 0 = sb9t
     vxor.v $vr0,$vr0,$vr4
     vld $vr4,$a6,0x0 # 4 : sbdu
     vxor.v $vr0,$vr0,$vr1 # 0 = ch
     vld $vr1,$a6,0x10 # 0 : sbdt
-    vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbdu
-    vshuf.b $vr0,$vr5,$vr0,$vr5 # MC ch
-    vshuf.b $vr1,$vr3,$vr1,$vr3 # 0 = sbdt
+    vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbdu
+    vshuf.b $vr0,$vr18,$vr0,$vr5 # MC ch
+    vshuf.b $vr1,$vr18,$vr1,$vr3 # 0 = sbdt
     vxor.v $vr0,$vr0,$vr4 # 4 = ch
     vld $vr4,$a6,0x20 # 4 : sbbu
     vxor.v $vr0,$vr0,$vr1 # 0 = ch
     vld $vr1,$a6,0x30 # 0 : sbbt
-    vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbbu
-    vshuf.b $vr0,$vr5,$vr0,$vr5 # MC ch
-    vshuf.b $vr1,$vr3,$vr1,$vr3 # 0 = sbbt
+    vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbbu
+    vshuf.b $vr0,$vr18,$vr0,$vr5 # MC ch
+    vshuf.b $vr1,$vr18,$vr1,$vr3 # 0 = sbbt
     vxor.v $vr0,$vr0,$vr4 # 4 = ch
     vld $vr4,$a6,0x40 # 4 : sbeu
     vxor.v $vr0,$vr0,$vr1 # 0 = ch
     vld $vr1,$a6,0x50 # 0 : sbet
-    vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbeu
-    vshuf.b $vr0,$vr5,$vr0,$vr5 # MC ch
-    vshuf.b $vr1,$vr3,$vr1,$vr3 # 0 = sbet
+    vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbeu
+    vshuf.b $vr0,$vr18,$vr0,$vr5 # MC ch
+    vshuf.b $vr1,$vr18,$vr1,$vr3 # 0 = sbet
     vxor.v $vr0,$vr0,$vr4 # 4 = ch
     addi.d $a5,$a5, 16 # next round key
     vbsrl.v $vr16,$vr5,0xc
@@ -222,32 +222,32 @@ _vpaes_decrypt_core:
     vori.b $vr2,$vr11,0 # 2 : a/k
     vsrli.w $vr1,$vr1,4 # 1 = i
     vand.v $vr0,$vr0,$vr9 # 0 = k
-    vshuf.b $vr2,$vr0,$vr2,$vr0 # 2 = a/k
+    vshuf.b $vr2,$vr18,$vr2,$vr0 # 2 = a/k
     vori.b $vr3,$vr10,0 # 3 : 1/i
     vxor.v $vr0,$vr0,$vr1 # 0 = j
-    vshuf.b $vr3,$vr1,$vr3,$vr1 # 3 = 1/i
+    vshuf.b $vr3,$vr18,$vr3,$vr1 # 3 = 1/i
     vori.b $vr4,$vr10,0 # 4 : 1/j
     vxor.v $vr3,$vr3,$vr2 # 3 = iak = 1/i + a/k
-    vshuf.b $vr4,$vr0,$vr4,$vr0 # 4 = 1/j
+    vshuf.b $vr4,$vr18,$vr4,$vr0 # 4 = 1/j
     vxor.v $vr4,$vr4,$vr2 # 4 = jak = 1/j + a/k
     vori.b $vr2,$vr10,0 # 2 : 1/iak
-    vshuf.b $vr2,$vr3,$vr2,$vr3 # 2 = 1/iak
+    vshuf.b $vr2,$vr18,$vr2,$vr3 # 2 = 1/iak
     vori.b $vr3,$vr10,0 # 3 : 1/jak
     vxor.v $vr2,$vr2,$vr0 # 2 = io
-    vshuf.b $vr3,$vr4,$vr3,$vr4 # 3 = 1/jak
+    vshuf.b $vr3,$vr18,$vr3,$vr4 # 3 = 1/jak
     vld $vr0,$a5,0
     vxor.v $vr3,$vr3,$vr1 # 3 = jo
     bnez $t5,.Ldec_loop
     # middle of last round
     vld $vr4,$a6,0x60 # 3 : sbou
-    vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbou
+    vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbou
     vxor.v $vr4,$vr4,$vr0 # 4 = sb1u + k
     vld $vr0,$a6,0x70 # 0 : sbot
     vld $vr2,$a7,-0x160 # Lk_sr-.Lk_dsbd=-0x160
-    vshuf.b $vr0,$vr3,$vr0,$vr3 # 0 = sb1t
+    vshuf.b $vr0,$vr18,$vr0,$vr3 # 0 = sb1t
     vxor.v $vr0,$vr0,$vr4 # 0 = A
-    vshuf.b $vr0,$vr2,$vr0,$vr2
+    vshuf.b $vr0,$vr18,$vr0,$vr2
     jr $ra
     .cfi_endproc
     .size _vpaes_decrypt_core,.-_vpaes_decrypt_core
@@ -292,7 +292,7 @@ _vpaes_schedule_core:
     # decrypting, output zeroth round key after shiftrows
     add.d $t2,$a4,$a6
     vld $vr1,$t2,0
-    vshuf.b $vr3,$vr1,$vr3,$vr1
+    vshuf.b $vr3,$vr18,$vr3,$vr1
     vst $vr3,$a2,0
     xori $a4,$a4,0x30
@@ -415,7 +415,7 @@ _vpaes_schedule_core:
     # encrypting
     add.d $t0,$a4,$a6
     vld $vr1,$t0,0
-    vshuf.b $vr0,$vr1,$vr0,$vr1 # output permute
+    vshuf.b $vr0,$vr18,$vr0,$vr1 # output permute
     la.local $a7,Lk_opt # prepare to output transform
     addi.d $a2,$a2,32
@@ -530,24 +530,24 @@ _vpaes_schedule_low_round:
     vsrli.w $vr1,$vr1,0x4 # 1 = i
     vand.v $vr0,$vr0,$vr9 # 0 = k
     vaddi.du $vr2,$vr11,0x0 # 2 : a/k
-    vshuf.b $vr2,$vr0,$vr2,$vr0 # 2 = a/k
+    vshuf.b $vr2,$vr18,$vr2,$vr0 # 2 = a/k
     vxor.v $vr0,$vr0,$vr1 # 0 = j
     vaddi.du $vr3,$vr10,0x0 # 3 : 1/i
-    vshuf.b $vr3,$vr1,$vr3,$vr1 # 3 = 1/i
+    vshuf.b $vr3,$vr18,$vr3,$vr1 # 3 = 1/i
     vxor.v $vr3,$vr3,$vr2 # 3 = iak = 1/i + a/k
     vaddi.du $vr4,$vr10,0x0 # 4 : 1/j
-    vshuf.b $vr4,$vr0,$vr4,$vr0 # 4 = 1/j
+    vshuf.b $vr4,$vr18,$vr4,$vr0 # 4 = 1/j
     vxor.v $vr4,$vr4,$vr2 # 4 = jak = 1/j + a/k
     vaddi.du $vr2,$vr10,0x0 # 2 : 1/iak
-    vshuf.b $vr2,$vr3,$vr2,$vr3 # 2 = 1/iak
+    vshuf.b $vr2,$vr18,$vr2,$vr3 # 2 = 1/iak
     vxor.v $vr2,$vr2,$vr0 # 2 = io
     vaddi.du $vr3,$vr10,0x0 # 3 : 1/jak
-    vshuf.b $vr3,$vr4,$vr3,$vr4 # 3 = 1/jak
+    vshuf.b $vr3,$vr18,$vr3,$vr4 # 3 = 1/jak
     vxor.v $vr3,$vr3,$vr1 # 3 = jo
     vaddi.du $vr4,$vr13,0x0 # 4 : sbou
-    vshuf.b $vr4,$vr2,$vr4,$vr2 # 4 = sbou
+    vshuf.b $vr4,$vr18,$vr4,$vr2 # 4 = sbou
     vaddi.du $vr0,$vr12,0x0 # 0 : sbot
-    vshuf.b $vr0,$vr3,$vr0,$vr3 # 0 = sb1t
+    vshuf.b $vr0,$vr18,$vr0,$vr3 # 0 = sb1t
     vxor.v $vr0,$vr0,$vr4 # 0 = sbox output
     # add in smeared stuff
@@ -575,9 +575,9 @@ _vpaes_schedule_transform:
     vsrli.w $vr1,$vr1,4
     vand.v $vr0,$vr0,$vr9
     vld $vr2,$a7,0 # lo
-    vshuf.b $vr2,$vr0,$vr2,$vr0
+    vshuf.b $vr2,$vr18,$vr2,$vr0
     vld $vr0,$a7,16 # hi
-    vshuf.b $vr0,$vr1,$vr0,$vr1
+    vshuf.b $vr0,$vr18,$vr0,$vr1
     vxor.v $vr0,$vr0,$vr2
     jr $ra
     .cfi_endproc
@@ -620,11 +620,11 @@ _vpaes_schedule_mangle:
     la.local $t0,Lk_s63
     vld $vr16,$t0,0
     vxor.v $vr4,$vr4,$vr16
-    vshuf.b $vr4,$vr5,$vr4,$vr5
+    vshuf.b $vr4,$vr18,$vr4,$vr5
     vori.b $vr3,$vr4,0
-    vshuf.b $vr4,$vr5,$vr4,$vr5
+    vshuf.b $vr4,$vr18,$vr4,$vr5
     vxor.v $vr3,$vr3,$vr4
-    vshuf.b $vr4,$vr5,$vr4,$vr5
+    vshuf.b $vr4,$vr18,$vr4,$vr5
     vxor.v $vr3,$vr3,$vr4
     b .Lschedule_mangle_both
@@ -638,33 +638,33 @@ _vpaes_schedule_mangle:
     vand.v $vr4,$vr4,$vr9 # 4 = lo
     vld $vr2,$a7,0
-    vshuf.b $vr2,$vr4,$vr2,$vr4
+    vshuf.b $vr2,$vr18,$vr2,$vr4
     vld $vr3,$a7,0x10
-    vshuf.b $vr3,$vr1,$vr3,$vr1
+    vshuf.b $vr3,$vr18,$vr3,$vr1
     vxor.v $vr3,$vr3,$vr2
-    vshuf.b $vr3,$vr5,$vr3,$vr5
+    vshuf.b $vr3,$vr18,$vr3,$vr5
     vld $vr2,$a7,0x20
-    vshuf.b $vr2,$vr4,$vr2,$vr4
+    vshuf.b $vr2,$vr18,$vr2,$vr4
     vxor.v $vr2,$vr2,$vr3
     vld $vr3,$a7,0x30
-    vshuf.b $vr3,$vr1,$vr3,$vr1
+    vshuf.b $vr3,$vr18,$vr3,$vr1
     vxor.v $vr3,$vr3,$vr2
-    vshuf.b $vr3,$vr5,$vr3,$vr5
+    vshuf.b $vr3,$vr18,$vr3,$vr5
     vld $vr2,$a7,0x40
-    vshuf.b $vr2,$vr4,$vr2,$vr4
+    vshuf.b $vr2,$vr18,$vr2,$vr4
     vxor.v $vr2,$vr2,$vr3
     vld $vr3,$a7,0x50
-    vshuf.b $vr3,$vr1,$vr3,$vr1
+    vshuf.b $vr3,$vr18,$vr3,$vr1
     vxor.v $vr3,$vr3,$vr2
-    vshuf.b $vr3,$vr5,$vr3,$vr5
+    vshuf.b $vr3,$vr18,$vr3,$vr5
     vld $vr2,$a7,0x60
-    vshuf.b $vr2,$vr4,$vr2,$vr4
+    vshuf.b $vr2,$vr18,$vr2,$vr4
     vxor.v $vr2,$vr2,$vr3
     vld $vr3,$a7,0x70
-    vshuf.b $vr3,$vr1,$vr3,$vr1
+    vshuf.b $vr3,$vr18,$vr3,$vr1
     vxor.v $vr3,$vr3,$vr2
     addi.d $a2,$a2,-16
@@ -672,7 +672,7 @@ _vpaes_schedule_mangle:
 .Lschedule_mangle_both:
     add.d $t2,$a4,$a6
    vld $vr1,$t2,0
-    vshuf.b $vr3,$vr1,$vr3,$vr1
+    vshuf.b $vr3,$vr18,$vr3,$vr1
     addi.d $a4,$a4,-16
     andi $a4,$a4,0x30
     vst $vr3,$a2,0
@@ -885,6 +885,7 @@ _vpaes_preheat:
     vld $vr12,$a6,0x40 # Lk_sb1+16
     vld $vr15,$a6,0x50 # Lk_sb2
     vld $vr14,$a6,0x60 # Lk_sb2+16
+    vldi $vr18,0 # $vr18 in this program is equal to 0
     jirl $zero,$ra,0
     .cfi_endproc
     .size _vpaes_preheat,.-_vpaes_preheat
@@ -899,8 +900,8 @@ $code.=<<___;
 .section .rodata
 .align 6
 Lk_inv: # inv, inva
-    .quad 0x0E05060F0D080180, 0x040703090A0B0C02
-    .quad 0x01040A060F0B0780, 0x030D0E0C02050809
+    .quad 0x0E05060F0D080110, 0x040703090A0B0C02
+    .quad 0x01040A060F0B0710, 0x030D0E0C02050809
 Lk_s0F: # s0F
     .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F