mirror of https://github.com/openssl/openssl.git
				
				
				
			Fixed incorrect usage of vshuf.b instruction
The latest revised LoongArch64 vector instruction manual clearly states that the undefined upper three bits of each byte in the control register of the vshuf.b instruction must not be used; otherwise, uncertain results may be obtained. Therefore, the use of the vshuf.b instruction in the existing vpaes-loongarch64.pl code must be corrected to avoid erroneous calculation results on future LoongArch64 processors. Reviewed-by: Paul Dale <pauli@openssl.org> Reviewed-by: Tomas Mraz <tomas@openssl.org> (Merged from https://github.com/openssl/openssl/pull/21530)
This commit is contained in:
		
							parent
							
								
									160f48941d
								
							
						
					
					
						commit
						780ce3849f
					
				|  | @ -62,14 +62,14 @@ _vpaes_encrypt_core: | ||||||
|     ld.w      $t5,$a2,240 |     ld.w      $t5,$a2,240 | ||||||
|     vori.b    $vr1,$vr9,0 |     vori.b    $vr1,$vr9,0 | ||||||
|     la.local  $t0,Lk_ipt |     la.local  $t0,Lk_ipt | ||||||
|     vld	      $vr2,$t0,0    # iptlo |     vld       $vr2,$t0,0    # iptlo | ||||||
|     vandn.v   $vr1,$vr1,$vr0 |     vandn.v   $vr1,$vr1,$vr0 | ||||||
|     vld	      $vr5,$a5,0    # round0 key |     vld	      $vr5,$a5,0    # round0 key | ||||||
|     vsrli.w   $vr1,$vr1,4 |     vsrli.w   $vr1,$vr1,4 | ||||||
|     vand.v    $vr0,$vr0,$vr9 |     vand.v    $vr0,$vr0,$vr9 | ||||||
|     vshuf.b   $vr2,$vr0,$vr2,$vr0 |     vshuf.b   $vr2,$vr18,$vr2,$vr0 | ||||||
|     vld       $vr0,$t0,16   # ipthi |     vld       $vr0,$t0,16   # ipthi | ||||||
|     vshuf.b   $vr0,$vr1,$vr0,$vr1 |     vshuf.b   $vr0,$vr18,$vr0,$vr1 | ||||||
|     vxor.v    $vr2,$vr2,$vr5 |     vxor.v    $vr2,$vr2,$vr5 | ||||||
|     addi.d    $a5,$a5,16 |     addi.d    $a5,$a5,16 | ||||||
|     vxor.v    $vr0,$vr0,$vr2 |     vxor.v    $vr0,$vr0,$vr2 | ||||||
|  | @ -81,26 +81,26 @@ _vpaes_encrypt_core: | ||||||
|     # middle of middle round |     # middle of middle round | ||||||
|     vori.b    $vr4,$vr13,0           # 4 : sb1u |     vori.b    $vr4,$vr13,0           # 4 : sb1u | ||||||
|     vori.b    $vr0,$vr12,0           # 0 : sb1t |     vori.b    $vr0,$vr12,0           # 0 : sb1t | ||||||
|     vshuf.b   $vr4,$vr2,$vr4,$vr2    # 4 = sb1u |     vshuf.b   $vr4,$vr18,$vr4,$vr2    # 4 = sb1u | ||||||
|     vshuf.b   $vr0,$vr3,$vr0,$vr3    # 0 = sb1t |     vshuf.b   $vr0,$vr18,$vr0,$vr3    # 0 = sb1t | ||||||
|     vxor.v    $vr4,$vr4,$vr5         # 4 = sb1u + k |     vxor.v    $vr4,$vr4,$vr5         # 4 = sb1u + k | ||||||
|     vori.b    $vr5,$vr15,0           # 4 : sb2u |     vori.b    $vr5,$vr15,0           # 4 : sb2u | ||||||
|     vxor.v    $vr0,$vr0,$vr4         # 0 = A |     vxor.v    $vr0,$vr0,$vr4         # 0 = A | ||||||
|     add.d     $t0,$a7,$a6            # Lk_mc_forward[] |     add.d     $t0,$a7,$a6            # Lk_mc_forward[] | ||||||
|     vld       $vr1,$t0,-0x40 |     vld       $vr1,$t0,-0x40 | ||||||
|     vshuf.b   $vr5,$vr2,$vr5,$vr2    # 4 = sb2u |     vshuf.b   $vr5,$vr18,$vr5,$vr2    # 4 = sb2u | ||||||
|     vld       $vr4,$t0,0             # Lk_mc_backward[] |     vld       $vr4,$t0,0             # Lk_mc_backward[] | ||||||
|     vori.b    $vr2,$vr14,0           # 2 : sb2t |     vori.b    $vr2,$vr14,0           # 2 : sb2t | ||||||
|     vshuf.b   $vr2,$vr3,$vr2,$vr3    # 2 = sb2t |     vshuf.b   $vr2,$vr18,$vr2,$vr3    # 2 = sb2t | ||||||
|     vori.b    $vr3,$vr0,0            # 3 = A |     vori.b    $vr3,$vr0,0            # 3 = A | ||||||
|     vxor.v    $vr2,$vr5,$vr2         # 2 = 2A |     vxor.v    $vr2,$vr5,$vr2         # 2 = 2A | ||||||
|     vshuf.b   $vr0,$vr1,$vr0,$vr1    # 0 = B |     vshuf.b   $vr0,$vr18,$vr0,$vr1    # 0 = B | ||||||
|     addi.d    $a5,$a5,16             # next key |     addi.d    $a5,$a5,16             # next key | ||||||
|     vxor.v    $vr0,$vr0,$vr2         # 0 = 2A+B |     vxor.v    $vr0,$vr0,$vr2         # 0 = 2A+B | ||||||
|     vshuf.b   $vr3,$vr4,$vr3,$vr4    # 3 = D |     vshuf.b   $vr3,$vr18,$vr3,$vr4    # 3 = D | ||||||
|     addi.d    $a7,$a7,16             # next mc |     addi.d    $a7,$a7,16             # next mc | ||||||
|     vxor.v    $vr3,$vr3,$vr0         # 3 = 2A+B+D |     vxor.v    $vr3,$vr3,$vr0         # 3 = 2A+B+D | ||||||
|     vshuf.b   $vr0,$vr1,$vr0,$vr1    # 0 = 2B+C |     vshuf.b   $vr0,$vr18,$vr0,$vr1    # 0 = 2B+C | ||||||
|     andi      $a7,$a7,0x30           # ... mod 4 |     andi      $a7,$a7,0x30           # ... mod 4 | ||||||
|     addi.d    $t5,$t5,-1             # nr-- |     addi.d    $t5,$t5,-1             # nr-- | ||||||
|     vxor.v    $vr0,$vr0,$vr3         # 0 = 2A+3B+C+D |     vxor.v    $vr0,$vr0,$vr3         # 0 = 2A+3B+C+D | ||||||
|  | @ -112,33 +112,33 @@ _vpaes_encrypt_core: | ||||||
|     vandn.v   $vr1,$vr1,$vr0        # 1 = i<<4 |     vandn.v   $vr1,$vr1,$vr0        # 1 = i<<4 | ||||||
|     vsrli.w   $vr1,$vr1,4           # 1 = i |     vsrli.w   $vr1,$vr1,4           # 1 = i | ||||||
|     vand.v    $vr0,$vr0,$vr9        # 0 = k |     vand.v    $vr0,$vr0,$vr9        # 0 = k | ||||||
|     vshuf.b   $vr5,$vr0,$vr5,$vr0   # 2 = a/k |     vshuf.b   $vr5,$vr18,$vr5,$vr0   # 2 = a/k | ||||||
|     vori.b    $vr3,$vr10,0          # 3 : 1/i |     vori.b    $vr3,$vr10,0          # 3 : 1/i | ||||||
|     vxor.v    $vr0,$vr0,$vr1        # 0 = j |     vxor.v    $vr0,$vr0,$vr1        # 0 = j | ||||||
|     vshuf.b   $vr3,$vr1,$vr3,$vr1   # 3 = 1/i |     vshuf.b   $vr3,$vr18,$vr3,$vr1   # 3 = 1/i | ||||||
|     vori.b    $vr4,$vr10,0          # 4 : 1/j |     vori.b    $vr4,$vr10,0          # 4 : 1/j | ||||||
|     vxor.v    $vr3,$vr3,$vr5        # 3 = iak = 1/i + a/k |     vxor.v    $vr3,$vr3,$vr5        # 3 = iak = 1/i + a/k | ||||||
|     vshuf.b   $vr4,$vr0,$vr4,$vr0   # 4 = 1/j |     vshuf.b   $vr4,$vr18,$vr4,$vr0   # 4 = 1/j | ||||||
|     vori.b    $vr2,$vr10,0          # 2 : 1/iak |     vori.b    $vr2,$vr10,0          # 2 : 1/iak | ||||||
|     vxor.v    $vr4,$vr4,$vr5        # 4 = jak = 1/j + a/k |     vxor.v    $vr4,$vr4,$vr5        # 4 = jak = 1/j + a/k | ||||||
|     vshuf.b   $vr2,$vr3,$vr2,$vr3   # 2 = 1/iak |     vshuf.b   $vr2,$vr18,$vr2,$vr3   # 2 = 1/iak | ||||||
|     vori.b    $vr3,$vr10,0          # 3 : 1/jak |     vori.b    $vr3,$vr10,0          # 3 : 1/jak | ||||||
|     vxor.v    $vr2,$vr2,$vr0        # 2 = io |     vxor.v    $vr2,$vr2,$vr0        # 2 = io | ||||||
|     vshuf.b   $vr3,$vr4,$vr3,$vr4   # 3 = 1/jak |     vshuf.b   $vr3,$vr18,$vr3,$vr4   # 3 = 1/jak | ||||||
|     vld       $vr5,$a5,	0 |     vld       $vr5,$a5,0 | ||||||
|     vxor.v    $vr3,$vr3,$vr1        # 3 = jo |     vxor.v    $vr3,$vr3,$vr1        # 3 = jo | ||||||
|     bnez      $t5,.Lenc_loop |     bnez      $t5,.Lenc_loop | ||||||
| 
 | 
 | ||||||
|     # middle of last round |     # middle of last round | ||||||
|     vld       $vr4,$a6,	-0x60		# 3 : sbou	Lk_sbo |     vld       $vr4,$a6,	-0x60		# 3 : sbou	Lk_sbo | ||||||
|     vld       $vr0,$a6,	-0x50		# 0 : sbot	Lk_sbo+16 |     vld       $vr0,$a6,	-0x50		# 0 : sbot	Lk_sbo+16 | ||||||
|     vshuf.b   $vr4,$vr2,$vr4,$vr2	# 4 = sbou |     vshuf.b   $vr4,$vr18,$vr4,$vr2	# 4 = sbou | ||||||
|     vxor.v    $vr4,$vr4,$vr5		# 4 = sb1u + k |     vxor.v    $vr4,$vr4,$vr5		# 4 = sb1u + k | ||||||
|     vshuf.b   $vr0,$vr3,$vr0,$vr3	# 0 = sb1t |     vshuf.b   $vr0,$vr18,$vr0,$vr3	# 0 = sb1t | ||||||
|     add.d     $t0,$a7,$a6		# Lk_sr[] |     add.d     $t0,$a7,$a6		# Lk_sr[] | ||||||
|     vld       $vr1,$t0,	0x40 |     vld       $vr1,$t0,0x40 | ||||||
|     vxor.v    $vr0,$vr0,$vr4		# 0 = A |     vxor.v    $vr0,$vr0,$vr4		# 0 = A | ||||||
|     vshuf.b   $vr0,$vr1,$vr0,$vr1 |     vshuf.b   $vr0,$vr18,$vr0,$vr1 | ||||||
|     jr        $ra |     jr        $ra | ||||||
| .cfi_endproc | .cfi_endproc | ||||||
| .size	_vpaes_encrypt_core,.-_vpaes_encrypt_core | .size	_vpaes_encrypt_core,.-_vpaes_encrypt_core | ||||||
|  | @ -163,11 +163,11 @@ _vpaes_decrypt_core: | ||||||
|     vld       $vr5,$a5,0               # round0 key |     vld       $vr5,$a5,0               # round0 key | ||||||
|     slli.d    $a7,$a7,4 |     slli.d    $a7,$a7,4 | ||||||
|     vand.v    $vr0,$vr9,$vr0 |     vand.v    $vr0,$vr9,$vr0 | ||||||
|     vshuf.b   $vr2,$vr0,$vr2,$vr0 |     vshuf.b   $vr2,$vr18,$vr2,$vr0 | ||||||
|     vld       $vr0,$t0,16              # ipthi |     vld       $vr0,$t0,16              # ipthi | ||||||
|     xori      $a7,$a7,0x30 |     xori      $a7,$a7,0x30 | ||||||
|     la.local  $a6,Lk_dsbd |     la.local  $a6,Lk_dsbd | ||||||
|     vshuf.b   $vr0,$vr1,$vr0,$vr1 |     vshuf.b   $vr0,$vr18,$vr0,$vr1 | ||||||
|     andi      $a7,$a7,0x30 |     andi      $a7,$a7,0x30 | ||||||
|     vxor.v    $vr2,$vr2,$vr5 |     vxor.v    $vr2,$vr2,$vr5 | ||||||
|     la.local  $t0,Lk_mc_forward |     la.local  $t0,Lk_mc_forward | ||||||
|  | @ -184,29 +184,29 @@ _vpaes_decrypt_core: | ||||||
| ## | ## | ||||||
|     vld        $vr4,$a6,-0x20		# 4 : sb9u |     vld        $vr4,$a6,-0x20		# 4 : sb9u | ||||||
|     vld        $vr1,$a6,-0x10		# 0 : sb9t |     vld        $vr1,$a6,-0x10		# 0 : sb9t | ||||||
|     vshuf.b    $vr4,$vr2,$vr4,$vr2	# 4 = sb9u |     vshuf.b    $vr4,$vr18,$vr4,$vr2	# 4 = sb9u | ||||||
|     vshuf.b    $vr1,$vr3,$vr1,$vr3	# 0 = sb9t |     vshuf.b    $vr1,$vr18,$vr1,$vr3	# 0 = sb9t | ||||||
|     vxor.v     $vr0,$vr0,$vr4 |     vxor.v     $vr0,$vr0,$vr4 | ||||||
|     vld        $vr4,$a6,0x0		# 4 : sbdu |     vld        $vr4,$a6,0x0		# 4 : sbdu | ||||||
|     vxor.v     $vr0,$vr0,$vr1		# 0 = ch |     vxor.v     $vr0,$vr0,$vr1		# 0 = ch | ||||||
|     vld        $vr1,$a6,0x10		# 0 : sbdt |     vld        $vr1,$a6,0x10		# 0 : sbdt | ||||||
|     vshuf.b    $vr4,$vr2,$vr4,$vr2	# 4 = sbdu |     vshuf.b    $vr4,$vr18,$vr4,$vr2	# 4 = sbdu | ||||||
|     vshuf.b    $vr0,$vr5,$vr0,$vr5	# MC ch |     vshuf.b    $vr0,$vr18,$vr0,$vr5	# MC ch | ||||||
|     vshuf.b    $vr1,$vr3,$vr1,$vr3	# 0 = sbdt |     vshuf.b    $vr1,$vr18,$vr1,$vr3	# 0 = sbdt | ||||||
|     vxor.v     $vr0,$vr0,$vr4		# 4 = ch |     vxor.v     $vr0,$vr0,$vr4		# 4 = ch | ||||||
|     vld        $vr4,$a6,0x20		# 4 : sbbu |     vld        $vr4,$a6,0x20		# 4 : sbbu | ||||||
|     vxor.v     $vr0,$vr0,$vr1		# 0 = ch |     vxor.v     $vr0,$vr0,$vr1		# 0 = ch | ||||||
|     vld        $vr1,$a6,0x30		# 0 : sbbt |     vld        $vr1,$a6,0x30		# 0 : sbbt | ||||||
|     vshuf.b    $vr4,$vr2,$vr4,$vr2	# 4 = sbbu |     vshuf.b    $vr4,$vr18,$vr4,$vr2	# 4 = sbbu | ||||||
|     vshuf.b    $vr0,$vr5,$vr0,$vr5	# MC ch |     vshuf.b    $vr0,$vr18,$vr0,$vr5	# MC ch | ||||||
|     vshuf.b    $vr1,$vr3,$vr1,$vr3	# 0 = sbbt |     vshuf.b    $vr1,$vr18,$vr1,$vr3	# 0 = sbbt | ||||||
|     vxor.v     $vr0,$vr0,$vr4		# 4 = ch |     vxor.v     $vr0,$vr0,$vr4		# 4 = ch | ||||||
|     vld        $vr4,$a6,0x40		# 4 : sbeu |     vld        $vr4,$a6,0x40		# 4 : sbeu | ||||||
|     vxor.v     $vr0,$vr0,$vr1		# 0 = ch |     vxor.v     $vr0,$vr0,$vr1		# 0 = ch | ||||||
|     vld        $vr1,$a6,0x50		# 0 : sbet |     vld        $vr1,$a6,0x50		# 0 : sbet | ||||||
|     vshuf.b    $vr4,$vr2,$vr4,$vr2	# 4 = sbeu |     vshuf.b    $vr4,$vr18,$vr4,$vr2	# 4 = sbeu | ||||||
|     vshuf.b    $vr0,$vr5,$vr0,$vr5	# MC ch |     vshuf.b    $vr0,$vr18,$vr0,$vr5	# MC ch | ||||||
|     vshuf.b    $vr1,$vr3,$vr1,$vr3	# 0 = sbet |     vshuf.b    $vr1,$vr18,$vr1,$vr3	# 0 = sbet | ||||||
|     vxor.v     $vr0,$vr0,$vr4		# 4 = ch |     vxor.v     $vr0,$vr0,$vr4		# 4 = ch | ||||||
|     addi.d     $a5,$a5,	16		# next round key |     addi.d     $a5,$a5,	16		# next round key | ||||||
|     vbsrl.v    $vr16,$vr5,0xc |     vbsrl.v    $vr16,$vr5,0xc | ||||||
|  | @ -222,32 +222,32 @@ _vpaes_decrypt_core: | ||||||
|     vori.b     $vr2,$vr11,0		# 2 : a/k |     vori.b     $vr2,$vr11,0		# 2 : a/k | ||||||
|     vsrli.w    $vr1,$vr1,4		# 1 = i |     vsrli.w    $vr1,$vr1,4		# 1 = i | ||||||
|     vand.v     $vr0,$vr0,$vr9		# 0 = k |     vand.v     $vr0,$vr0,$vr9		# 0 = k | ||||||
|     vshuf.b    $vr2,$vr0,$vr2,$vr0	# 2 = a/k |     vshuf.b    $vr2,$vr18,$vr2,$vr0	# 2 = a/k | ||||||
|     vori.b     $vr3,$vr10,0		# 3 : 1/i |     vori.b     $vr3,$vr10,0		# 3 : 1/i | ||||||
|     vxor.v     $vr0,$vr0,$vr1		# 0 = j |     vxor.v     $vr0,$vr0,$vr1		# 0 = j | ||||||
|     vshuf.b    $vr3,$vr1,$vr3,$vr1	# 3 = 1/i |     vshuf.b    $vr3,$vr18,$vr3,$vr1	# 3 = 1/i | ||||||
|     vori.b     $vr4,$vr10,0		# 4 : 1/j |     vori.b     $vr4,$vr10,0		# 4 : 1/j | ||||||
|     vxor.v     $vr3,$vr3,$vr2		# 3 = iak = 1/i + a/k |     vxor.v     $vr3,$vr3,$vr2		# 3 = iak = 1/i + a/k | ||||||
|     vshuf.b    $vr4,$vr0,$vr4,$vr0	# 4 = 1/j |     vshuf.b    $vr4,$vr18,$vr4,$vr0	# 4 = 1/j | ||||||
|     vxor.v     $vr4,$vr4,$vr2		# 4 = jak = 1/j + a/k |     vxor.v     $vr4,$vr4,$vr2		# 4 = jak = 1/j + a/k | ||||||
|     vori.b     $vr2,$vr10,0		# 2 : 1/iak |     vori.b     $vr2,$vr10,0		# 2 : 1/iak | ||||||
|     vshuf.b    $vr2,$vr3,$vr2,$vr3	# 2 = 1/iak |     vshuf.b    $vr2,$vr18,$vr2,$vr3	# 2 = 1/iak | ||||||
|     vori.b     $vr3,$vr10,0		# 3 : 1/jak |     vori.b     $vr3,$vr10,0		# 3 : 1/jak | ||||||
|     vxor.v     $vr2,$vr2,$vr0		# 2 = io |     vxor.v     $vr2,$vr2,$vr0		# 2 = io | ||||||
|     vshuf.b    $vr3,$vr4,$vr3,$vr4	# 3 = 1/jak |     vshuf.b    $vr3,$vr18,$vr3,$vr4	# 3 = 1/jak | ||||||
|     vld        $vr0,$a5,0 |     vld        $vr0,$a5,0 | ||||||
|     vxor.v     $vr3,$vr3,$vr1		# 3 = jo |     vxor.v     $vr3,$vr3,$vr1		# 3 = jo | ||||||
|     bnez       $t5,.Ldec_loop |     bnez       $t5,.Ldec_loop | ||||||
| 
 | 
 | ||||||
|     # middle of last round |     # middle of last round | ||||||
|     vld        $vr4,$a6,0x60		# 3 : sbou |     vld        $vr4,$a6,0x60		# 3 : sbou | ||||||
|     vshuf.b    $vr4,$vr2,$vr4,$vr2	# 4 = sbou |     vshuf.b    $vr4,$vr18,$vr4,$vr2	# 4 = sbou | ||||||
|     vxor.v     $vr4,$vr4,$vr0		# 4 = sb1u + k |     vxor.v     $vr4,$vr4,$vr0		# 4 = sb1u + k | ||||||
|     vld        $vr0,$a6,0x70		# 0 : sbot |     vld        $vr0,$a6,0x70		# 0 : sbot | ||||||
|     vld        $vr2,$a7,-0x160		# Lk_sr-.Lk_dsbd=-0x160 |     vld        $vr2,$a7,-0x160		# Lk_sr-.Lk_dsbd=-0x160 | ||||||
|     vshuf.b    $vr0,$vr3,$vr0,$vr3	# 0 = sb1t |     vshuf.b    $vr0,$vr18,$vr0,$vr3	# 0 = sb1t | ||||||
|     vxor.v     $vr0,$vr0,$vr4		# 0 = A |     vxor.v     $vr0,$vr0,$vr4		# 0 = A | ||||||
|     vshuf.b    $vr0,$vr2,$vr0,$vr2 |     vshuf.b    $vr0,$vr18,$vr0,$vr2 | ||||||
|     jr         $ra |     jr         $ra | ||||||
| .cfi_endproc | .cfi_endproc | ||||||
| .size	_vpaes_decrypt_core,.-_vpaes_decrypt_core | .size	_vpaes_decrypt_core,.-_vpaes_decrypt_core | ||||||
|  | @ -292,7 +292,7 @@ _vpaes_schedule_core: | ||||||
|     # decrypting, output zeroth round key after shiftrows |     # decrypting, output zeroth round key after shiftrows | ||||||
|     add.d     $t2,$a4,$a6 |     add.d     $t2,$a4,$a6 | ||||||
|     vld       $vr1,$t2,0 |     vld       $vr1,$t2,0 | ||||||
|     vshuf.b   $vr3,$vr1,$vr3,$vr1 |     vshuf.b   $vr3,$vr18,$vr3,$vr1 | ||||||
|     vst       $vr3,$a2,0 |     vst       $vr3,$a2,0 | ||||||
|     xori      $a4,$a4,0x30 |     xori      $a4,$a4,0x30 | ||||||
| 
 | 
 | ||||||
|  | @ -415,7 +415,7 @@ _vpaes_schedule_core: | ||||||
|      # encrypting |      # encrypting | ||||||
|      add.d      $t0,$a4,$a6 |      add.d      $t0,$a4,$a6 | ||||||
|      vld        $vr1,$t0,0 |      vld        $vr1,$t0,0 | ||||||
|      vshuf.b    $vr0,$vr1,$vr0,$vr1             # output permute |      vshuf.b    $vr0,$vr18,$vr0,$vr1             # output permute | ||||||
|      la.local   $a7,Lk_opt                      # prepare to output transform |      la.local   $a7,Lk_opt                      # prepare to output transform | ||||||
|      addi.d     $a2,$a2,32 |      addi.d     $a2,$a2,32 | ||||||
| 
 | 
 | ||||||
|  | @ -530,24 +530,24 @@ _vpaes_schedule_low_round: | ||||||
|     vsrli.w     $vr1,$vr1,0x4			# 1 = i |     vsrli.w     $vr1,$vr1,0x4			# 1 = i | ||||||
|     vand.v      $vr0,$vr0,$vr9			# 0 = k |     vand.v      $vr0,$vr0,$vr9			# 0 = k | ||||||
|     vaddi.du    $vr2,$vr11,0x0			# 2 : a/k |     vaddi.du    $vr2,$vr11,0x0			# 2 : a/k | ||||||
|     vshuf.b     $vr2,$vr0,$vr2,$vr0		# 2 = a/k |     vshuf.b     $vr2,$vr18,$vr2,$vr0		# 2 = a/k | ||||||
|     vxor.v      $vr0,$vr0,$vr1			# 0 = j |     vxor.v      $vr0,$vr0,$vr1			# 0 = j | ||||||
|     vaddi.du    $vr3,$vr10,0x0			# 3 : 1/i |     vaddi.du    $vr3,$vr10,0x0			# 3 : 1/i | ||||||
|     vshuf.b     $vr3,$vr1,$vr3,$vr1		# 3 = 1/i |     vshuf.b     $vr3,$vr18,$vr3,$vr1		# 3 = 1/i | ||||||
|     vxor.v      $vr3,$vr3,$vr2			# 3 = iak = 1/i + a/k |     vxor.v      $vr3,$vr3,$vr2			# 3 = iak = 1/i + a/k | ||||||
|     vaddi.du    $vr4,$vr10,0x0			# 4 : 1/j |     vaddi.du    $vr4,$vr10,0x0			# 4 : 1/j | ||||||
|     vshuf.b     $vr4,$vr0,$vr4,$vr0		# 4 = 1/j |     vshuf.b     $vr4,$vr18,$vr4,$vr0		# 4 = 1/j | ||||||
|     vxor.v      $vr4,$vr4,$vr2			# 4 = jak = 1/j + a/k |     vxor.v      $vr4,$vr4,$vr2			# 4 = jak = 1/j + a/k | ||||||
|     vaddi.du    $vr2,$vr10,0x0			# 2 : 1/iak |     vaddi.du    $vr2,$vr10,0x0			# 2 : 1/iak | ||||||
|     vshuf.b     $vr2,$vr3,$vr2,$vr3		# 2 = 1/iak |     vshuf.b     $vr2,$vr18,$vr2,$vr3		# 2 = 1/iak | ||||||
|     vxor.v      $vr2,$vr2,$vr0			# 2 = io |     vxor.v      $vr2,$vr2,$vr0			# 2 = io | ||||||
|     vaddi.du    $vr3,$vr10,0x0			# 3 : 1/jak |     vaddi.du    $vr3,$vr10,0x0			# 3 : 1/jak | ||||||
|     vshuf.b     $vr3,$vr4,$vr3,$vr4		# 3 = 1/jak |     vshuf.b     $vr3,$vr18,$vr3,$vr4		# 3 = 1/jak | ||||||
|     vxor.v      $vr3,$vr3,$vr1			# 3 = jo |     vxor.v      $vr3,$vr3,$vr1			# 3 = jo | ||||||
|     vaddi.du    $vr4,$vr13,0x0			# 4 : sbou |     vaddi.du    $vr4,$vr13,0x0			# 4 : sbou | ||||||
|     vshuf.b     $vr4,$vr2,$vr4,$vr2		# 4 = sbou |     vshuf.b     $vr4,$vr18,$vr4,$vr2		# 4 = sbou | ||||||
|     vaddi.du    $vr0,$vr12,0x0			# 0 : sbot |     vaddi.du    $vr0,$vr12,0x0			# 0 : sbot | ||||||
|     vshuf.b     $vr0,$vr3,$vr0,$vr3		# 0 = sb1t |     vshuf.b     $vr0,$vr18,$vr0,$vr3		# 0 = sb1t | ||||||
|     vxor.v      $vr0,$vr0,$vr4			# 0 = sbox output |     vxor.v      $vr0,$vr0,$vr4			# 0 = sbox output | ||||||
| 
 | 
 | ||||||
|     # add in smeared stuff |     # add in smeared stuff | ||||||
|  | @ -575,9 +575,9 @@ _vpaes_schedule_transform: | ||||||
|     vsrli.w    $vr1,$vr1,4 |     vsrli.w    $vr1,$vr1,4 | ||||||
|     vand.v     $vr0,$vr0,$vr9 |     vand.v     $vr0,$vr0,$vr9 | ||||||
|     vld        $vr2,$a7,0		# lo |     vld        $vr2,$a7,0		# lo | ||||||
|     vshuf.b    $vr2,$vr0,$vr2,$vr0 |     vshuf.b    $vr2,$vr18,$vr2,$vr0 | ||||||
|     vld        $vr0,$a7,16		# hi |     vld        $vr0,$a7,16		# hi | ||||||
|     vshuf.b    $vr0,$vr1,$vr0,$vr1 |     vshuf.b    $vr0,$vr18,$vr0,$vr1 | ||||||
|     vxor.v     $vr0,$vr0,$vr2 |     vxor.v     $vr0,$vr0,$vr2 | ||||||
|     jr         $ra |     jr         $ra | ||||||
| .cfi_endproc | .cfi_endproc | ||||||
|  | @ -620,11 +620,11 @@ _vpaes_schedule_mangle: | ||||||
|     la.local   $t0,Lk_s63 |     la.local   $t0,Lk_s63 | ||||||
|     vld        $vr16,$t0,0 |     vld        $vr16,$t0,0 | ||||||
|     vxor.v     $vr4,$vr4,$vr16 |     vxor.v     $vr4,$vr4,$vr16 | ||||||
|     vshuf.b    $vr4,$vr5,$vr4,$vr5 |     vshuf.b    $vr4,$vr18,$vr4,$vr5 | ||||||
|     vori.b     $vr3,$vr4,0 |     vori.b     $vr3,$vr4,0 | ||||||
|     vshuf.b    $vr4,$vr5,$vr4,$vr5 |     vshuf.b    $vr4,$vr18,$vr4,$vr5 | ||||||
|     vxor.v     $vr3,$vr3,$vr4 |     vxor.v     $vr3,$vr3,$vr4 | ||||||
|     vshuf.b    $vr4,$vr5,$vr4,$vr5 |     vshuf.b    $vr4,$vr18,$vr4,$vr5 | ||||||
|     vxor.v     $vr3,$vr3,$vr4 |     vxor.v     $vr3,$vr3,$vr4 | ||||||
| 
 | 
 | ||||||
|     b          .Lschedule_mangle_both |     b          .Lschedule_mangle_both | ||||||
|  | @ -638,33 +638,33 @@ _vpaes_schedule_mangle: | ||||||
|     vand.v     $vr4,$vr4,$vr9		# 4 = lo |     vand.v     $vr4,$vr4,$vr9		# 4 = lo | ||||||
| 
 | 
 | ||||||
|     vld        $vr2,$a7,0 |     vld        $vr2,$a7,0 | ||||||
|     vshuf.b    $vr2,$vr4,$vr2,$vr4 |     vshuf.b    $vr2,$vr18,$vr2,$vr4 | ||||||
|     vld        $vr3,$a7,0x10 |     vld        $vr3,$a7,0x10 | ||||||
|     vshuf.b    $vr3,$vr1,$vr3,$vr1 |     vshuf.b    $vr3,$vr18,$vr3,$vr1 | ||||||
|     vxor.v     $vr3,$vr3,$vr2 |     vxor.v     $vr3,$vr3,$vr2 | ||||||
|     vshuf.b    $vr3,$vr5,$vr3,$vr5 |     vshuf.b    $vr3,$vr18,$vr3,$vr5 | ||||||
| 
 | 
 | ||||||
|     vld        $vr2,$a7,0x20 |     vld        $vr2,$a7,0x20 | ||||||
|     vshuf.b    $vr2,$vr4,$vr2,$vr4 |     vshuf.b    $vr2,$vr18,$vr2,$vr4 | ||||||
|     vxor.v     $vr2,$vr2,$vr3 |     vxor.v     $vr2,$vr2,$vr3 | ||||||
|     vld        $vr3,$a7,0x30 |     vld        $vr3,$a7,0x30 | ||||||
|     vshuf.b    $vr3,$vr1,$vr3,$vr1 |     vshuf.b    $vr3,$vr18,$vr3,$vr1 | ||||||
|     vxor.v     $vr3,$vr3,$vr2 |     vxor.v     $vr3,$vr3,$vr2 | ||||||
|     vshuf.b    $vr3,$vr5,$vr3,$vr5 |     vshuf.b    $vr3,$vr18,$vr3,$vr5 | ||||||
| 
 | 
 | ||||||
|     vld        $vr2,$a7,0x40 |     vld        $vr2,$a7,0x40 | ||||||
|     vshuf.b    $vr2,$vr4,$vr2,$vr4 |     vshuf.b    $vr2,$vr18,$vr2,$vr4 | ||||||
|     vxor.v     $vr2,$vr2,$vr3 |     vxor.v     $vr2,$vr2,$vr3 | ||||||
|     vld        $vr3,$a7,0x50 |     vld        $vr3,$a7,0x50 | ||||||
|     vshuf.b    $vr3,$vr1,$vr3,$vr1 |     vshuf.b    $vr3,$vr18,$vr3,$vr1 | ||||||
|     vxor.v     $vr3,$vr3,$vr2 |     vxor.v     $vr3,$vr3,$vr2 | ||||||
|     vshuf.b    $vr3,$vr5,$vr3,$vr5 |     vshuf.b    $vr3,$vr18,$vr3,$vr5 | ||||||
| 
 | 
 | ||||||
|     vld        $vr2,$a7,0x60 |     vld        $vr2,$a7,0x60 | ||||||
|     vshuf.b    $vr2,$vr4,$vr2,$vr4 |     vshuf.b    $vr2,$vr18,$vr2,$vr4 | ||||||
|     vxor.v     $vr2,$vr2,$vr3 |     vxor.v     $vr2,$vr2,$vr3 | ||||||
|     vld        $vr3,$a7,0x70 |     vld        $vr3,$a7,0x70 | ||||||
|     vshuf.b    $vr3,$vr1,$vr3,$vr1 |     vshuf.b    $vr3,$vr18,$vr3,$vr1 | ||||||
|     vxor.v     $vr3,$vr3,$vr2 |     vxor.v     $vr3,$vr3,$vr2 | ||||||
| 
 | 
 | ||||||
|     addi.d     $a2,$a2,-16 |     addi.d     $a2,$a2,-16 | ||||||
|  | @ -672,7 +672,7 @@ _vpaes_schedule_mangle: | ||||||
| .Lschedule_mangle_both: | .Lschedule_mangle_both: | ||||||
|     add.d      $t2,$a4,$a6 |     add.d      $t2,$a4,$a6 | ||||||
|     vld        $vr1,$t2,0 |     vld        $vr1,$t2,0 | ||||||
|     vshuf.b    $vr3,$vr1,$vr3,$vr1 |     vshuf.b    $vr3,$vr18,$vr3,$vr1 | ||||||
|     addi.d     $a4,$a4,-16 |     addi.d     $a4,$a4,-16 | ||||||
|     andi       $a4,$a4,0x30 |     andi       $a4,$a4,0x30 | ||||||
|     vst        $vr3,$a2,0 |     vst        $vr3,$a2,0 | ||||||
|  | @ -885,6 +885,7 @@ _vpaes_preheat: | ||||||
|     vld       $vr12,$a6,0x40		# Lk_sb1+16 |     vld       $vr12,$a6,0x40		# Lk_sb1+16 | ||||||
|     vld       $vr15,$a6,0x50		# Lk_sb2 |     vld       $vr15,$a6,0x50		# Lk_sb2 | ||||||
|     vld       $vr14,$a6,0x60		# Lk_sb2+16 |     vld       $vr14,$a6,0x60		# Lk_sb2+16 | ||||||
|  |     vldi      $vr18,0                   # $vr18 in this program is equal to 0 | ||||||
|     jirl      $zero,$ra,0 |     jirl      $zero,$ra,0 | ||||||
| .cfi_endproc | .cfi_endproc | ||||||
| .size	_vpaes_preheat,.-_vpaes_preheat | .size	_vpaes_preheat,.-_vpaes_preheat | ||||||
|  | @ -899,8 +900,8 @@ $code.=<<___; | ||||||
| .section .rodata | .section .rodata | ||||||
| .align	6 | .align	6 | ||||||
| Lk_inv:	# inv, inva | Lk_inv:	# inv, inva | ||||||
|     .quad	0x0E05060F0D080180, 0x040703090A0B0C02 |     .quad	0x0E05060F0D080110, 0x040703090A0B0C02 | ||||||
|     .quad	0x01040A060F0B0780, 0x030D0E0C02050809 |     .quad	0x01040A060F0B0710, 0x030D0E0C02050809 | ||||||
| 
 | 
 | ||||||
| Lk_s0F:	# s0F | Lk_s0F:	# s0F | ||||||
|     .quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F |     .quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue