mirror of https://github.com/openssl/openssl.git
				
				
				
			Fixed incorrect usage of the vshuf.b instruction
The latest revision of the LoongArch64 vector instruction manual clearly states that the undefined upper three bits of each byte in the control register of the vshuf.b instruction must not be used; otherwise, the result is unpredictable. The use of the vshuf.b instruction in the existing vpaes-loongarch64.pl code therefore needs to be corrected to avoid erroneous computation results on future LoongArch64 processors. Reviewed-by: Paul Dale <pauli@openssl.org> Reviewed-by: Tomas Mraz <tomas@openssl.org> (Merged from https://github.com/openssl/openssl/pull/21530)
This commit is contained in:
		
							parent
							
								
									160f48941d
								
							
						
					
					
						commit
						780ce3849f
					
				|  | @ -67,9 +67,9 @@ _vpaes_encrypt_core: | |||
|     vld	      $vr5,$a5,0    # round0 key | ||||
|     vsrli.w   $vr1,$vr1,4 | ||||
|     vand.v    $vr0,$vr0,$vr9 | ||||
|     vshuf.b   $vr2,$vr0,$vr2,$vr0 | ||||
|     vshuf.b   $vr2,$vr18,$vr2,$vr0 | ||||
|     vld       $vr0,$t0,16   # ipthi | ||||
|     vshuf.b   $vr0,$vr1,$vr0,$vr1 | ||||
|     vshuf.b   $vr0,$vr18,$vr0,$vr1 | ||||
|     vxor.v    $vr2,$vr2,$vr5 | ||||
|     addi.d    $a5,$a5,16 | ||||
|     vxor.v    $vr0,$vr0,$vr2 | ||||
|  | @ -81,26 +81,26 @@ _vpaes_encrypt_core: | |||
|     # middle of middle round | ||||
|     vori.b    $vr4,$vr13,0           # 4 : sb1u | ||||
|     vori.b    $vr0,$vr12,0           # 0 : sb1t | ||||
|     vshuf.b   $vr4,$vr2,$vr4,$vr2    # 4 = sb1u | ||||
|     vshuf.b   $vr0,$vr3,$vr0,$vr3    # 0 = sb1t | ||||
|     vshuf.b   $vr4,$vr18,$vr4,$vr2    # 4 = sb1u | ||||
|     vshuf.b   $vr0,$vr18,$vr0,$vr3    # 0 = sb1t | ||||
|     vxor.v    $vr4,$vr4,$vr5         # 4 = sb1u + k | ||||
|     vori.b    $vr5,$vr15,0           # 4 : sb2u | ||||
|     vxor.v    $vr0,$vr0,$vr4         # 0 = A | ||||
|     add.d     $t0,$a7,$a6            # Lk_mc_forward[] | ||||
|     vld       $vr1,$t0,-0x40 | ||||
|     vshuf.b   $vr5,$vr2,$vr5,$vr2    # 4 = sb2u | ||||
|     vshuf.b   $vr5,$vr18,$vr5,$vr2    # 4 = sb2u | ||||
|     vld       $vr4,$t0,0             # Lk_mc_backward[] | ||||
|     vori.b    $vr2,$vr14,0           # 2 : sb2t | ||||
|     vshuf.b   $vr2,$vr3,$vr2,$vr3    # 2 = sb2t | ||||
|     vshuf.b   $vr2,$vr18,$vr2,$vr3    # 2 = sb2t | ||||
|     vori.b    $vr3,$vr0,0            # 3 = A | ||||
|     vxor.v    $vr2,$vr5,$vr2         # 2 = 2A | ||||
|     vshuf.b   $vr0,$vr1,$vr0,$vr1    # 0 = B | ||||
|     vshuf.b   $vr0,$vr18,$vr0,$vr1    # 0 = B | ||||
|     addi.d    $a5,$a5,16             # next key | ||||
|     vxor.v    $vr0,$vr0,$vr2         # 0 = 2A+B | ||||
|     vshuf.b   $vr3,$vr4,$vr3,$vr4    # 3 = D | ||||
|     vshuf.b   $vr3,$vr18,$vr3,$vr4    # 3 = D | ||||
|     addi.d    $a7,$a7,16             # next mc | ||||
|     vxor.v    $vr3,$vr3,$vr0         # 3 = 2A+B+D | ||||
|     vshuf.b   $vr0,$vr1,$vr0,$vr1    # 0 = 2B+C | ||||
|     vshuf.b   $vr0,$vr18,$vr0,$vr1    # 0 = 2B+C | ||||
|     andi      $a7,$a7,0x30           # ... mod 4 | ||||
|     addi.d    $t5,$t5,-1             # nr-- | ||||
|     vxor.v    $vr0,$vr0,$vr3         # 0 = 2A+3B+C+D | ||||
|  | @ -112,33 +112,33 @@ _vpaes_encrypt_core: | |||
|     vandn.v   $vr1,$vr1,$vr0        # 1 = i<<4 | ||||
|     vsrli.w   $vr1,$vr1,4           # 1 = i | ||||
|     vand.v    $vr0,$vr0,$vr9        # 0 = k | ||||
|     vshuf.b   $vr5,$vr0,$vr5,$vr0   # 2 = a/k | ||||
|     vshuf.b   $vr5,$vr18,$vr5,$vr0   # 2 = a/k | ||||
|     vori.b    $vr3,$vr10,0          # 3 : 1/i | ||||
|     vxor.v    $vr0,$vr0,$vr1        # 0 = j | ||||
|     vshuf.b   $vr3,$vr1,$vr3,$vr1   # 3 = 1/i | ||||
|     vshuf.b   $vr3,$vr18,$vr3,$vr1   # 3 = 1/i | ||||
|     vori.b    $vr4,$vr10,0          # 4 : 1/j | ||||
|     vxor.v    $vr3,$vr3,$vr5        # 3 = iak = 1/i + a/k | ||||
|     vshuf.b   $vr4,$vr0,$vr4,$vr0   # 4 = 1/j | ||||
|     vshuf.b   $vr4,$vr18,$vr4,$vr0   # 4 = 1/j | ||||
|     vori.b    $vr2,$vr10,0          # 2 : 1/iak | ||||
|     vxor.v    $vr4,$vr4,$vr5        # 4 = jak = 1/j + a/k | ||||
|     vshuf.b   $vr2,$vr3,$vr2,$vr3   # 2 = 1/iak | ||||
|     vshuf.b   $vr2,$vr18,$vr2,$vr3   # 2 = 1/iak | ||||
|     vori.b    $vr3,$vr10,0          # 3 : 1/jak | ||||
|     vxor.v    $vr2,$vr2,$vr0        # 2 = io | ||||
|     vshuf.b   $vr3,$vr4,$vr3,$vr4   # 3 = 1/jak | ||||
|     vld       $vr5,$a5,	0 | ||||
|     vshuf.b   $vr3,$vr18,$vr3,$vr4   # 3 = 1/jak | ||||
|     vld       $vr5,$a5,0 | ||||
|     vxor.v    $vr3,$vr3,$vr1        # 3 = jo | ||||
|     bnez      $t5,.Lenc_loop | ||||
| 
 | ||||
|     # middle of last round | ||||
|     vld       $vr4,$a6,	-0x60		# 3 : sbou	Lk_sbo | ||||
|     vld       $vr0,$a6,	-0x50		# 0 : sbot	Lk_sbo+16 | ||||
|     vshuf.b   $vr4,$vr2,$vr4,$vr2	# 4 = sbou | ||||
|     vshuf.b   $vr4,$vr18,$vr4,$vr2	# 4 = sbou | ||||
|     vxor.v    $vr4,$vr4,$vr5		# 4 = sb1u + k | ||||
|     vshuf.b   $vr0,$vr3,$vr0,$vr3	# 0 = sb1t | ||||
|     vshuf.b   $vr0,$vr18,$vr0,$vr3	# 0 = sb1t | ||||
|     add.d     $t0,$a7,$a6		# Lk_sr[] | ||||
|     vld       $vr1,$t0,	0x40 | ||||
|     vld       $vr1,$t0,0x40 | ||||
|     vxor.v    $vr0,$vr0,$vr4		# 0 = A | ||||
|     vshuf.b   $vr0,$vr1,$vr0,$vr1 | ||||
|     vshuf.b   $vr0,$vr18,$vr0,$vr1 | ||||
|     jr        $ra | ||||
| .cfi_endproc | ||||
| .size	_vpaes_encrypt_core,.-_vpaes_encrypt_core | ||||
|  | @ -163,11 +163,11 @@ _vpaes_decrypt_core: | |||
|     vld       $vr5,$a5,0               # round0 key | ||||
|     slli.d    $a7,$a7,4 | ||||
|     vand.v    $vr0,$vr9,$vr0 | ||||
|     vshuf.b   $vr2,$vr0,$vr2,$vr0 | ||||
|     vshuf.b   $vr2,$vr18,$vr2,$vr0 | ||||
|     vld       $vr0,$t0,16              # ipthi | ||||
|     xori      $a7,$a7,0x30 | ||||
|     la.local  $a6,Lk_dsbd | ||||
|     vshuf.b   $vr0,$vr1,$vr0,$vr1 | ||||
|     vshuf.b   $vr0,$vr18,$vr0,$vr1 | ||||
|     andi      $a7,$a7,0x30 | ||||
|     vxor.v    $vr2,$vr2,$vr5 | ||||
|     la.local  $t0,Lk_mc_forward | ||||
|  | @ -184,29 +184,29 @@ _vpaes_decrypt_core: | |||
| ## | ||||
|     vld        $vr4,$a6,-0x20		# 4 : sb9u | ||||
|     vld        $vr1,$a6,-0x10		# 0 : sb9t | ||||
|     vshuf.b    $vr4,$vr2,$vr4,$vr2	# 4 = sb9u | ||||
|     vshuf.b    $vr1,$vr3,$vr1,$vr3	# 0 = sb9t | ||||
|     vshuf.b    $vr4,$vr18,$vr4,$vr2	# 4 = sb9u | ||||
|     vshuf.b    $vr1,$vr18,$vr1,$vr3	# 0 = sb9t | ||||
|     vxor.v     $vr0,$vr0,$vr4 | ||||
|     vld        $vr4,$a6,0x0		# 4 : sbdu | ||||
|     vxor.v     $vr0,$vr0,$vr1		# 0 = ch | ||||
|     vld        $vr1,$a6,0x10		# 0 : sbdt | ||||
|     vshuf.b    $vr4,$vr2,$vr4,$vr2	# 4 = sbdu | ||||
|     vshuf.b    $vr0,$vr5,$vr0,$vr5	# MC ch | ||||
|     vshuf.b    $vr1,$vr3,$vr1,$vr3	# 0 = sbdt | ||||
|     vshuf.b    $vr4,$vr18,$vr4,$vr2	# 4 = sbdu | ||||
|     vshuf.b    $vr0,$vr18,$vr0,$vr5	# MC ch | ||||
|     vshuf.b    $vr1,$vr18,$vr1,$vr3	# 0 = sbdt | ||||
|     vxor.v     $vr0,$vr0,$vr4		# 4 = ch | ||||
|     vld        $vr4,$a6,0x20		# 4 : sbbu | ||||
|     vxor.v     $vr0,$vr0,$vr1		# 0 = ch | ||||
|     vld        $vr1,$a6,0x30		# 0 : sbbt | ||||
|     vshuf.b    $vr4,$vr2,$vr4,$vr2	# 4 = sbbu | ||||
|     vshuf.b    $vr0,$vr5,$vr0,$vr5	# MC ch | ||||
|     vshuf.b    $vr1,$vr3,$vr1,$vr3	# 0 = sbbt | ||||
|     vshuf.b    $vr4,$vr18,$vr4,$vr2	# 4 = sbbu | ||||
|     vshuf.b    $vr0,$vr18,$vr0,$vr5	# MC ch | ||||
|     vshuf.b    $vr1,$vr18,$vr1,$vr3	# 0 = sbbt | ||||
|     vxor.v     $vr0,$vr0,$vr4		# 4 = ch | ||||
|     vld        $vr4,$a6,0x40		# 4 : sbeu | ||||
|     vxor.v     $vr0,$vr0,$vr1		# 0 = ch | ||||
|     vld        $vr1,$a6,0x50		# 0 : sbet | ||||
|     vshuf.b    $vr4,$vr2,$vr4,$vr2	# 4 = sbeu | ||||
|     vshuf.b    $vr0,$vr5,$vr0,$vr5	# MC ch | ||||
|     vshuf.b    $vr1,$vr3,$vr1,$vr3	# 0 = sbet | ||||
|     vshuf.b    $vr4,$vr18,$vr4,$vr2	# 4 = sbeu | ||||
|     vshuf.b    $vr0,$vr18,$vr0,$vr5	# MC ch | ||||
|     vshuf.b    $vr1,$vr18,$vr1,$vr3	# 0 = sbet | ||||
|     vxor.v     $vr0,$vr0,$vr4		# 4 = ch | ||||
|     addi.d     $a5,$a5,	16		# next round key | ||||
|     vbsrl.v    $vr16,$vr5,0xc | ||||
|  | @ -222,32 +222,32 @@ _vpaes_decrypt_core: | |||
|     vori.b     $vr2,$vr11,0		# 2 : a/k | ||||
|     vsrli.w    $vr1,$vr1,4		# 1 = i | ||||
|     vand.v     $vr0,$vr0,$vr9		# 0 = k | ||||
|     vshuf.b    $vr2,$vr0,$vr2,$vr0	# 2 = a/k | ||||
|     vshuf.b    $vr2,$vr18,$vr2,$vr0	# 2 = a/k | ||||
|     vori.b     $vr3,$vr10,0		# 3 : 1/i | ||||
|     vxor.v     $vr0,$vr0,$vr1		# 0 = j | ||||
|     vshuf.b    $vr3,$vr1,$vr3,$vr1	# 3 = 1/i | ||||
|     vshuf.b    $vr3,$vr18,$vr3,$vr1	# 3 = 1/i | ||||
|     vori.b     $vr4,$vr10,0		# 4 : 1/j | ||||
|     vxor.v     $vr3,$vr3,$vr2		# 3 = iak = 1/i + a/k | ||||
|     vshuf.b    $vr4,$vr0,$vr4,$vr0	# 4 = 1/j | ||||
|     vshuf.b    $vr4,$vr18,$vr4,$vr0	# 4 = 1/j | ||||
|     vxor.v     $vr4,$vr4,$vr2		# 4 = jak = 1/j + a/k | ||||
|     vori.b     $vr2,$vr10,0		# 2 : 1/iak | ||||
|     vshuf.b    $vr2,$vr3,$vr2,$vr3	# 2 = 1/iak | ||||
|     vshuf.b    $vr2,$vr18,$vr2,$vr3	# 2 = 1/iak | ||||
|     vori.b     $vr3,$vr10,0		# 3 : 1/jak | ||||
|     vxor.v     $vr2,$vr2,$vr0		# 2 = io | ||||
|     vshuf.b    $vr3,$vr4,$vr3,$vr4	# 3 = 1/jak | ||||
|     vshuf.b    $vr3,$vr18,$vr3,$vr4	# 3 = 1/jak | ||||
|     vld        $vr0,$a5,0 | ||||
|     vxor.v     $vr3,$vr3,$vr1		# 3 = jo | ||||
|     bnez       $t5,.Ldec_loop | ||||
| 
 | ||||
|     # middle of last round | ||||
|     vld        $vr4,$a6,0x60		# 3 : sbou | ||||
|     vshuf.b    $vr4,$vr2,$vr4,$vr2	# 4 = sbou | ||||
|     vshuf.b    $vr4,$vr18,$vr4,$vr2	# 4 = sbou | ||||
|     vxor.v     $vr4,$vr4,$vr0		# 4 = sb1u + k | ||||
|     vld        $vr0,$a6,0x70		# 0 : sbot | ||||
|     vld        $vr2,$a7,-0x160		# Lk_sr-.Lk_dsbd=-0x160 | ||||
|     vshuf.b    $vr0,$vr3,$vr0,$vr3	# 0 = sb1t | ||||
|     vshuf.b    $vr0,$vr18,$vr0,$vr3	# 0 = sb1t | ||||
|     vxor.v     $vr0,$vr0,$vr4		# 0 = A | ||||
|     vshuf.b    $vr0,$vr2,$vr0,$vr2 | ||||
|     vshuf.b    $vr0,$vr18,$vr0,$vr2 | ||||
|     jr         $ra | ||||
| .cfi_endproc | ||||
| .size	_vpaes_decrypt_core,.-_vpaes_decrypt_core | ||||
|  | @ -292,7 +292,7 @@ _vpaes_schedule_core: | |||
|     # decrypting, output zeroth round key after shiftrows | ||||
|     add.d     $t2,$a4,$a6 | ||||
|     vld       $vr1,$t2,0 | ||||
|     vshuf.b   $vr3,$vr1,$vr3,$vr1 | ||||
|     vshuf.b   $vr3,$vr18,$vr3,$vr1 | ||||
|     vst       $vr3,$a2,0 | ||||
|     xori      $a4,$a4,0x30 | ||||
| 
 | ||||
|  | @ -415,7 +415,7 @@ _vpaes_schedule_core: | |||
|      # encrypting | ||||
|      add.d      $t0,$a4,$a6 | ||||
|      vld        $vr1,$t0,0 | ||||
|      vshuf.b    $vr0,$vr1,$vr0,$vr1             # output permute | ||||
|      vshuf.b    $vr0,$vr18,$vr0,$vr1             # output permute | ||||
|      la.local   $a7,Lk_opt                      # prepare to output transform | ||||
|      addi.d     $a2,$a2,32 | ||||
| 
 | ||||
|  | @ -530,24 +530,24 @@ _vpaes_schedule_low_round: | |||
|     vsrli.w     $vr1,$vr1,0x4			# 1 = i | ||||
|     vand.v      $vr0,$vr0,$vr9			# 0 = k | ||||
|     vaddi.du    $vr2,$vr11,0x0			# 2 : a/k | ||||
|     vshuf.b     $vr2,$vr0,$vr2,$vr0		# 2 = a/k | ||||
|     vshuf.b     $vr2,$vr18,$vr2,$vr0		# 2 = a/k | ||||
|     vxor.v      $vr0,$vr0,$vr1			# 0 = j | ||||
|     vaddi.du    $vr3,$vr10,0x0			# 3 : 1/i | ||||
|     vshuf.b     $vr3,$vr1,$vr3,$vr1		# 3 = 1/i | ||||
|     vshuf.b     $vr3,$vr18,$vr3,$vr1		# 3 = 1/i | ||||
|     vxor.v      $vr3,$vr3,$vr2			# 3 = iak = 1/i + a/k | ||||
|     vaddi.du    $vr4,$vr10,0x0			# 4 : 1/j | ||||
|     vshuf.b     $vr4,$vr0,$vr4,$vr0		# 4 = 1/j | ||||
|     vshuf.b     $vr4,$vr18,$vr4,$vr0		# 4 = 1/j | ||||
|     vxor.v      $vr4,$vr4,$vr2			# 4 = jak = 1/j + a/k | ||||
|     vaddi.du    $vr2,$vr10,0x0			# 2 : 1/iak | ||||
|     vshuf.b     $vr2,$vr3,$vr2,$vr3		# 2 = 1/iak | ||||
|     vshuf.b     $vr2,$vr18,$vr2,$vr3		# 2 = 1/iak | ||||
|     vxor.v      $vr2,$vr2,$vr0			# 2 = io | ||||
|     vaddi.du    $vr3,$vr10,0x0			# 3 : 1/jak | ||||
|     vshuf.b     $vr3,$vr4,$vr3,$vr4		# 3 = 1/jak | ||||
|     vshuf.b     $vr3,$vr18,$vr3,$vr4		# 3 = 1/jak | ||||
|     vxor.v      $vr3,$vr3,$vr1			# 3 = jo | ||||
|     vaddi.du    $vr4,$vr13,0x0			# 4 : sbou | ||||
|     vshuf.b     $vr4,$vr2,$vr4,$vr2		# 4 = sbou | ||||
|     vshuf.b     $vr4,$vr18,$vr4,$vr2		# 4 = sbou | ||||
|     vaddi.du    $vr0,$vr12,0x0			# 0 : sbot | ||||
|     vshuf.b     $vr0,$vr3,$vr0,$vr3		# 0 = sb1t | ||||
|     vshuf.b     $vr0,$vr18,$vr0,$vr3		# 0 = sb1t | ||||
|     vxor.v      $vr0,$vr0,$vr4			# 0 = sbox output | ||||
| 
 | ||||
|     # add in smeared stuff | ||||
|  | @ -575,9 +575,9 @@ _vpaes_schedule_transform: | |||
|     vsrli.w    $vr1,$vr1,4 | ||||
|     vand.v     $vr0,$vr0,$vr9 | ||||
|     vld        $vr2,$a7,0		# lo | ||||
|     vshuf.b    $vr2,$vr0,$vr2,$vr0 | ||||
|     vshuf.b    $vr2,$vr18,$vr2,$vr0 | ||||
|     vld        $vr0,$a7,16		# hi | ||||
|     vshuf.b    $vr0,$vr1,$vr0,$vr1 | ||||
|     vshuf.b    $vr0,$vr18,$vr0,$vr1 | ||||
|     vxor.v     $vr0,$vr0,$vr2 | ||||
|     jr         $ra | ||||
| .cfi_endproc | ||||
|  | @ -620,11 +620,11 @@ _vpaes_schedule_mangle: | |||
|     la.local   $t0,Lk_s63 | ||||
|     vld        $vr16,$t0,0 | ||||
|     vxor.v     $vr4,$vr4,$vr16 | ||||
|     vshuf.b    $vr4,$vr5,$vr4,$vr5 | ||||
|     vshuf.b    $vr4,$vr18,$vr4,$vr5 | ||||
|     vori.b     $vr3,$vr4,0 | ||||
|     vshuf.b    $vr4,$vr5,$vr4,$vr5 | ||||
|     vshuf.b    $vr4,$vr18,$vr4,$vr5 | ||||
|     vxor.v     $vr3,$vr3,$vr4 | ||||
|     vshuf.b    $vr4,$vr5,$vr4,$vr5 | ||||
|     vshuf.b    $vr4,$vr18,$vr4,$vr5 | ||||
|     vxor.v     $vr3,$vr3,$vr4 | ||||
| 
 | ||||
|     b          .Lschedule_mangle_both | ||||
|  | @ -638,33 +638,33 @@ _vpaes_schedule_mangle: | |||
|     vand.v     $vr4,$vr4,$vr9		# 4 = lo | ||||
| 
 | ||||
|     vld        $vr2,$a7,0 | ||||
|     vshuf.b    $vr2,$vr4,$vr2,$vr4 | ||||
|     vshuf.b    $vr2,$vr18,$vr2,$vr4 | ||||
|     vld        $vr3,$a7,0x10 | ||||
|     vshuf.b    $vr3,$vr1,$vr3,$vr1 | ||||
|     vshuf.b    $vr3,$vr18,$vr3,$vr1 | ||||
|     vxor.v     $vr3,$vr3,$vr2 | ||||
|     vshuf.b    $vr3,$vr5,$vr3,$vr5 | ||||
|     vshuf.b    $vr3,$vr18,$vr3,$vr5 | ||||
| 
 | ||||
|     vld        $vr2,$a7,0x20 | ||||
|     vshuf.b    $vr2,$vr4,$vr2,$vr4 | ||||
|     vshuf.b    $vr2,$vr18,$vr2,$vr4 | ||||
|     vxor.v     $vr2,$vr2,$vr3 | ||||
|     vld        $vr3,$a7,0x30 | ||||
|     vshuf.b    $vr3,$vr1,$vr3,$vr1 | ||||
|     vshuf.b    $vr3,$vr18,$vr3,$vr1 | ||||
|     vxor.v     $vr3,$vr3,$vr2 | ||||
|     vshuf.b    $vr3,$vr5,$vr3,$vr5 | ||||
|     vshuf.b    $vr3,$vr18,$vr3,$vr5 | ||||
| 
 | ||||
|     vld        $vr2,$a7,0x40 | ||||
|     vshuf.b    $vr2,$vr4,$vr2,$vr4 | ||||
|     vshuf.b    $vr2,$vr18,$vr2,$vr4 | ||||
|     vxor.v     $vr2,$vr2,$vr3 | ||||
|     vld        $vr3,$a7,0x50 | ||||
|     vshuf.b    $vr3,$vr1,$vr3,$vr1 | ||||
|     vshuf.b    $vr3,$vr18,$vr3,$vr1 | ||||
|     vxor.v     $vr3,$vr3,$vr2 | ||||
|     vshuf.b    $vr3,$vr5,$vr3,$vr5 | ||||
|     vshuf.b    $vr3,$vr18,$vr3,$vr5 | ||||
| 
 | ||||
|     vld        $vr2,$a7,0x60 | ||||
|     vshuf.b    $vr2,$vr4,$vr2,$vr4 | ||||
|     vshuf.b    $vr2,$vr18,$vr2,$vr4 | ||||
|     vxor.v     $vr2,$vr2,$vr3 | ||||
|     vld        $vr3,$a7,0x70 | ||||
|     vshuf.b    $vr3,$vr1,$vr3,$vr1 | ||||
|     vshuf.b    $vr3,$vr18,$vr3,$vr1 | ||||
|     vxor.v     $vr3,$vr3,$vr2 | ||||
| 
 | ||||
|     addi.d     $a2,$a2,-16 | ||||
|  | @ -672,7 +672,7 @@ _vpaes_schedule_mangle: | |||
| .Lschedule_mangle_both: | ||||
|     add.d      $t2,$a4,$a6 | ||||
|     vld        $vr1,$t2,0 | ||||
|     vshuf.b    $vr3,$vr1,$vr3,$vr1 | ||||
|     vshuf.b    $vr3,$vr18,$vr3,$vr1 | ||||
|     addi.d     $a4,$a4,-16 | ||||
|     andi       $a4,$a4,0x30 | ||||
|     vst        $vr3,$a2,0 | ||||
|  | @ -885,6 +885,7 @@ _vpaes_preheat: | |||
|     vld       $vr12,$a6,0x40		# Lk_sb1+16 | ||||
|     vld       $vr15,$a6,0x50		# Lk_sb2 | ||||
|     vld       $vr14,$a6,0x60		# Lk_sb2+16 | ||||
|     vldi      $vr18,0                   # $vr18 in this program is equal to 0 | ||||
|     jirl      $zero,$ra,0 | ||||
| .cfi_endproc | ||||
| .size	_vpaes_preheat,.-_vpaes_preheat | ||||
|  | @ -899,8 +900,8 @@ $code.=<<___; | |||
| .section .rodata | ||||
| .align	6 | ||||
| Lk_inv:	# inv, inva | ||||
|     .quad	0x0E05060F0D080180, 0x040703090A0B0C02 | ||||
|     .quad	0x01040A060F0B0780, 0x030D0E0C02050809 | ||||
|     .quad	0x0E05060F0D080110, 0x040703090A0B0C02 | ||||
|     .quad	0x01040A060F0B0710, 0x030D0E0C02050809 | ||||
| 
 | ||||
| Lk_s0F:	# s0F | ||||
|     .quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue