#! /usr/bin/env perl
# Copyright 2023-2025 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
$code=<<___;
#include "arm_arch.h"
/* These are offsets into the CIPH_DIGEST struct */
#define CIPHER_KEY 0
#define CIPHER_KEY_ROUNDS 8
#define CIPHER_IV 16
#define HMAC_IKEYPAD 24
#define HMAC_OKEYPAD 32
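/*
 * These offsets assume the argument block is laid out roughly as
 * follows (a sketch; the authoritative definition belongs to the
 * caller):
 *
 *   byte  0: pointer to the AES round keys
 *   byte  8: 64-bit round count (10, 12 or 14)
 *   byte 16: pointer to the CBC IV
 *   byte 24: pointer to the partially hashed inner key pad
 *   byte 32: pointer to the partially hashed outer key pad
 */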
.text
.arch armv8-a+crypto
___
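# Emit the nine leading AES rounds (aese+aesmc against rk0..rk8 held in
# v8..v16) for block $i; this prefix is common to AES-128/192/256.
# The key-length dependent tail is emitted by aes_block_last_rounds().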
sub aes_block_9_rounds {
my $i = shift;
$code.=<<___;
/* aes block $i */
aese v$i.16b, v8.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v9.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v10.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v11.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v12.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v13.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v14.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v15.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v16.16b
aesmc v$i.16b, v$i.16b
___
}
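# Emit the key-length dependent tail of the round sequence.  x9 holds
# the round count (10, 12 or 14); $compare controls whether the cmp is
# (re)issued here, $label keeps the per-call branch targets unique, and
# $load_rk10 loads rk10 into v18 for the AES-128 path when it is not
# already resident.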
sub aes_block_last_rounds {
my $compare = shift;
my $label = shift;
my $i = shift;
my $load_rk10 = shift;
if($compare == 1) {
$code.=<<___;
cmp x9, #12 /* tell 128,192,256 apart */
___
}
$code.=<<___;
b.lt .Laes128_${label}_$i
.Laes192_${label}_$i:
ldp q18,q19,[x7],32 /* rk[10],rk[11] */
aese v$i.16b,v17.16b
aesmc v$i.16b,v$i.16b
aese v$i.16b,v18.16b
aesmc v$i.16b,v$i.16b
b.gt .Laes256_${label}_$i
ld1 {v18.16b},[x7] /* rk[12] */
aese v$i.16b,v19.16b
eor v$i.16b,v$i.16b,v18.16b
sub x7, x7, #32 /* rewind x7 */
b 1f
.Laes256_${label}_$i:
aese v$i.16b,v19.16b
aesmc v$i.16b,v$i.16b
ldp q18,q19,[x7],32 /* rk[12],rk[13] */
aese v$i.16b,v18.16b
aesmc v$i.16b,v$i.16b
ld1 {v18.16b},[x7] /* rk[14] */
aese v$i.16b,v19.16b
eor v$i.16b,v$i.16b,v18.16b
sub x7, x7, #64 /* rewind x7 */
b 1f
.Laes128_${label}_$i:
___
if ($load_rk10 == 1) {
$code.=<<___;
ld1 {v18.16b},[x7]
___
}
$code.=<<___;
aese v$i.16b,v17.16b
eor v$i.16b,v$i.16b,v18.16b /* res */
1:
___
}
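# Decryption counterparts of the two helpers above: the same round-key
# walk with aesd/aesimc in place of aese/aesmc.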
sub aes_block_dec_9_rounds {
my $i = shift;
$code.=<<___;
/* aes block $i */
aesd v$i.16b, v8.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v9.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v10.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v11.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v12.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v13.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v14.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v15.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v16.16b
aesimc v$i.16b, v$i.16b
___
}
sub aes_block_dec_last_rounds {
my $compare = shift;
my $label = shift;
my $i = shift;
my $load_rk10 = shift;
if($compare == 1) {
$code.=<<___;
cmp x9, #12 /* tell 128,192,256 apart */
___
}
$code.=<<___;
b.lt .Laes128_${label}_$i
.Laes192_${label}_$i:
ldp q18,q19,[x7],32 /* rk[10],rk[11] */
aesd v$i.16b,v17.16b
aesimc v$i.16b,v$i.16b
aesd v$i.16b,v18.16b
aesimc v$i.16b,v$i.16b
b.gt .Laes256_${label}_$i
ld1 {v18.16b},[x7] /* rk[12] */
aesd v$i.16b,v19.16b
eor v$i.16b,v$i.16b,v18.16b
sub x7, x7, #32 /* rewind x7 */
b 1f
.Laes256_${label}_$i:
aesd v$i.16b,v19.16b
aesimc v$i.16b,v$i.16b
ldp q18,q19,[x7],32 /* rk[12],rk[13] */
aesd v$i.16b,v18.16b
aesimc v$i.16b,v$i.16b
ld1 {v18.16b},[x7] /* rk[14] */
aesd v$i.16b,v19.16b
eor v$i.16b,v$i.16b,v18.16b
sub x7, x7, #64 /* rewind x7 */
b 1f
.Laes128_${label}_$i:
___
if ($load_rk10 == 1) {
$code.=<<___;
ld1 {v18.16b},[x7]
___
}
$code.=<<___;
aesd v$i.16b,v17.16b
eor v$i.16b,v$i.16b,v18.16b /* res */
1:
___
}
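# Emit one SHA-512 compression over the 128-byte message block held in
# v0..v7, using the SHA-512 extension (sha512su0/su1 for the message
# schedule, sha512h/h2 for the rounds).  x10 walks the .LK512 constant
# table and is rewound once all 80 constants have been consumed.
# $need_revert selects whether to byte-reverse the message words
# (rev64) first.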
sub sha512_block {
my @H = map("v$_",(24..28));
my @QH = map("q$_",(24..28));
my ($FG, $DE) = map("v$_",(29..30));
my ($QFG, $QDE) = map("q$_",(29..30));
my $M9_10 = "v31";
my @MSG = map("v$_", (0..7));
my ($W0, $W1) = ("v8", "v9");
my ($AB, $CD, $EF, $GH) = map("v$_",(20..23));
my $need_revert = shift;
if($need_revert == 1) {
$code.=<<___;
rev64 @MSG[0].16b, @MSG[0].16b
rev64 @MSG[1].16b, @MSG[1].16b
rev64 @MSG[2].16b, @MSG[2].16b
rev64 @MSG[3].16b, @MSG[3].16b
rev64 @MSG[4].16b, @MSG[4].16b
rev64 @MSG[5].16b, @MSG[5].16b
rev64 @MSG[6].16b, @MSG[6].16b
rev64 @MSG[7].16b, @MSG[7].16b
___
}
$code.=<<___;
/* load const k */
ld1 {$W0.2d}, [x10], #16
/* backup ABCDEFGH */
mov $AB.16b, @H[0].16b
mov $CD.16b, @H[1].16b
mov $EF.16b, @H[2].16b
mov $GH.16b, @H[3].16b
___
for($i = 0; $i < 32; $i++) {
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16)*/
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
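# Renaming the Perl variables rotates the working registers instead of
# emitting mov instructions: @H[0..3] start out as the packed AB, CD,
# EF, GH pairs with @H[4] as scratch, and the permutation tracks the
# rotation spelled out in the comment above.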
}
for(;$i<40;$i++) {
$code.=<<___ if ($i<39);
ld1 {$W1.2d},[x10],#16
___
$code.=<<___ if ($i==39);
sub x10, x10, #80*8 // rewind
___
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
}
$code.=<<___;
add @H[0].2d, @H[0].2d, $AB.2d
add @H[1].2d, @H[1].2d, $CD.2d
add @H[2].2d, @H[2].2d, $EF.2d
add @H[3].2d, @H[3].2d, $GH.2d
___
}
{
my @H = map("v$_",(24..28));
my @QH = map("q$_",(24..28));
my ($FG, $DE) = map("v$_",(29..30));
my ($QFG, $QDE) = map("q$_",(29..30));
my $M9_10 = "v31";
my @MSG = map("v$_", (0..7));
my ($W0, $W1) = ("v14", "v15");
my ($AB, $CD, $EF, $GH) = map("v$_",(20..23));
$code.=<<___;
/*
* asm_aescbc_sha512_hmac(
* csrc, x0 (cipher src address)
* cdst, x1 (cipher dst address)
* clen, x2 (cipher length)
* dsrc, x3 (digest src address)
* ddst, x4 (digest dst address)
* dlen, x5 (digest length)
* arg x6 :
* arg->cipher.key (round keys)
* arg->cipher.key_rounds (key rounds)
* arg->cipher.iv (initialization vector)
* arg->digest.hmac.i_key_pad (partially hashed i_key_pad)
* arg->digest.hmac.o_key_pad (partially hashed o_key_pad)
* )
*/
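/*
 * A C prototype consistent with the register usage below might look
 * like this (a sketch; the caller owns the authoritative declaration,
 * and the CIPH_DIGEST name is taken from the offsets comment above):
 *
 * uint64_t asm_aescbc_sha512_hmac(const uint8_t *csrc, uint8_t *cdst,
 *                                 uint64_t clen, const uint8_t *dsrc,
 *                                 uint8_t *ddst, uint64_t dlen,
 *                                 CIPH_DIGEST *arg);
 */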
.global asm_aescbc_sha512_hmac
.type asm_aescbc_sha512_hmac,%function
.align 6
.LK512:
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator
.align 4
asm_aescbc_sha512_hmac:
AARCH64_VALID_CALL_TARGET
/* save callee-saved registers */
stp d8, d9, [sp,#-64]!
stp d10, d11, [sp,#16]
stp d12, d13, [sp,#32]
stp d14, d15, [sp,#48]
/* load ABCDEFGH */
ldr x7, [x6, #HMAC_IKEYPAD]
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x7]
ldr x7, [x6, #CIPHER_KEY]
ldr x8, [x6, #CIPHER_IV]
ldr x9, [x6, #CIPHER_KEY_ROUNDS]
mov x12, x7 /* backup x7 */
adr x10, .LK512
lsr x11, x2, #4 /* aes_block = len/16 */
cbz x11, .Lret /* return if aes_block = 0 */
cmp x11, #16
b.lt .Lenc_short_case
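/*
 * Long-message path: encrypt the first eight CBC blocks on their own
 * so v0..v7 hold a full 128-byte SHA-512 message block of ciphertext
 * before the stitched main loop begins.
 */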
ld1 {v0.16b}, [x0], #16 /* load plaintext */
ld1 {v1.16b}, [x8] /* load iv */
eor v0.16b, v0.16b, v1.16b /* iv xor plaintext */
ldp q8, q9, [x7], #32 /* rk0, rk1 */
/* block 0 */
aese v0.16b, v8.16b
aesmc v0.16b, v0.16b
ldp q10, q11, [x7], #32 /* rk2, rk3 */
aese v0.16b, v9.16b
aesmc v0.16b, v0.16b
aese v0.16b, v10.16b
aesmc v0.16b, v0.16b
ldp q12, q13, [x7], #32 /* rk4, rk5 */
aese v0.16b, v11.16b
aesmc v0.16b, v0.16b
aese v0.16b, v12.16b
aesmc v0.16b, v0.16b
ldp q14, q15, [x7], #32 /* rk6, rk7 */
aese v0.16b, v13.16b
aesmc v0.16b, v0.16b
aese v0.16b, v14.16b
aesmc v0.16b, v0.16b
ldp q16, q17, [x7], #32 /* rk8, rk9 */
aese v0.16b, v15.16b
aesmc v0.16b, v0.16b
aese v0.16b, v16.16b
aesmc v0.16b, v0.16b
ld1 {v18.16b}, [x7] /* rk10 */
___
&aes_block_last_rounds(1, "enc_prelog", 0, 0);
$code.=<<___;
str q0, [x1], #16 /* store cipher result */
ld1 {v1.16b}, [x0], #16 /* load next block */
eor v1.16b, v1.16b, v0.16b /* output xor block */
___
# process aes blocks from 1 to 7
for($i = 1; $i < 8; $i = $i + 1) {
&aes_block_9_rounds($i);
&aes_block_last_rounds(0, "enc_prelog", $i, 0);
if($i != 7) {
$next = $i + 1;
$code.=<<___;
/* load next block */
ld1 {v$next.16b}, [x0], #16
/* output xor block */
eor v$next.16b, v$next.16b, v$i.16b
___
}
$code.=<<___;
str q$i, [x1], #16 /* store cipher result */
___
}
$code.=<<___;
sub x11, x11, #8
.Lenc_main_loop:
mov x7, x12
mov x14, x1
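/*
 * Main-loop invariants: v0..v7 carry the previous eight ciphertext
 * blocks (this iteration's SHA-512 message), with v7 doubling as the
 * CBC chaining value; x12 holds the round-key base and x14 remembers
 * where this iteration's ciphertext starts so it can be reloaded as
 * the next message.
 */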
/* aes block 0 */
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */
ld1 {v12.16b}, [x0], #16
eor v12.16b, v12.16b, v7.16b
/* reverse message */
rev64 @MSG[0].16b, @MSG[0].16b
rev64 @MSG[1].16b, @MSG[1].16b
rev64 @MSG[2].16b, @MSG[2].16b
rev64 @MSG[3].16b, @MSG[3].16b
rev64 @MSG[4].16b, @MSG[4].16b
rev64 @MSG[5].16b, @MSG[5].16b
rev64 @MSG[6].16b, @MSG[6].16b
rev64 @MSG[7].16b, @MSG[7].16b
ld1 {$W0.2d}, [x10], #16 /* load const k*/
/* backup ABCDEFGH */
mov $AB.16b, @H[0].16b
mov $CD.16b, @H[1].16b
mov $EF.16b, @H[2].16b
mov $GH.16b, @H[3].16b
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk4, rk5 */
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
ldp q10, q11, [x7], #32 /* rk6, rk7 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
ldp q8, q9, [x7], #32 /* rk8, rk9 */
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
cmp x9, #12
b.lt .Lenc_main_loop_aes128_0
.Lenc_main_loop_aes192_0:
ldp q10, q11, [x7], #32 /* rk10, rk11 */
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
b.gt .Lenc_main_loop_aes256_0
ld1 {v8.16b},[x7] /* rk12 */
aese v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
b 1f
.Lenc_main_loop_aes256_0:
ldp q8, q9, [x7], #32 /* rk12, rk13 */
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ld1 {v10.16b},[x7] /* rk14 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
b 1f
.Lenc_main_loop_aes128_0:
ld1 {v10.16b},[x7] /* rk10 */
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
1:
st1 {v12.16b}, [x1], #16
/* aes block 1 */
mov x7, x12
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */
ld1 {v13.16b}, [x0], #16
eor v12.16b, v12.16b, v13.16b
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
ldp q8, q9, [x7], #32 /* rk4, rk5 */
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ldp q10, q11, [x7], #32 /* rk6, rk7 */
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
ldp q8, q9, [x7], #32 /* rk8, rk9 */
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
cmp x9, #12
b.lt .Lenc_main_loop_aes128_1
.Lenc_main_loop_aes192_1:
ldp q10, q11, [x7], #32 /* rk10, rk11 */
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
b.gt .Lenc_main_loop_aes256_1
ld1 {v8.16b},[x7] /* rk12 */
aese v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
b 1f
.Lenc_main_loop_aes256_1:
ldp q8, q9, [x7], #32 /* rk12, rk13 */
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ld1 {v10.16b},[x7] /* rk14 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
b 1f
.Lenc_main_loop_aes128_1:
ld1 {v10.16b},[x7] /* rk10 */
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
1:
st1 {v12.16b}, [x1], #16
/* aes block 2 */
mov x7, x12
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */
ld1 {v13.16b}, [x0], #16
eor v12.16b, v12.16b, v13.16b
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
ldp q8, q9, [x7], #32 /* rk4, rk5 */
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
ldp q10, q11, [x7], #32 /* rk6, rk7 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk8, rk9 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
cmp x9, #12
b.lt .Lenc_main_loop_aes128_2
.Lenc_main_loop_aes192_2:
ldp q10, q11, [x7], #32 /* rk10, rk11 */
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
b.gt .Lenc_main_loop_aes256_2
ld1 {v8.16b},[x7] /* rk12 */
aese v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
b 1f
.Lenc_main_loop_aes256_2:
ldp q8, q9, [x7], #32 /* rk12, rk13 */
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ld1 {v10.16b},[x7] /* rk14 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
b 1f
.Lenc_main_loop_aes128_2:
ld1 {v10.16b},[x7] /* rk10 */
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
1:
st1 {v12.16b}, [x1], #16
/* aes block 3 */
mov x7, x12
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */
ld1 {v13.16b}, [x0], #16
eor v12.16b, v12.16b, v13.16b
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
ldp q8, q9, [x7], #32 /* rk4, rk5 */
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ldp q10, q11, [x7], #32 /* rk6, rk7 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk8, rk9 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
cmp x9, #12
b.lt .Lenc_main_loop_aes128_3
.Lenc_main_loop_aes192_3:
ldp q10, q11, [x7], #32 /* rk10, rk11 */
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
b.gt .Lenc_main_loop_aes256_3
ld1 {v8.16b},[x7] /* rk12 */
aese v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
b 1f
.Lenc_main_loop_aes256_3:
ldp q8, q9, [x7], #32 /* rk12, rk13 */
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ld1 {v10.16b},[x7] /* rk14 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
b 1f
.Lenc_main_loop_aes128_3:
ld1 {v10.16b},[x7] /* rk10 */
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
1:
st1 {v12.16b}, [x1], #16
/* aes block 4 */
mov x7, x12
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */
ld1 {v13.16b}, [x0], #16
eor v12.16b, v12.16b, v13.16b
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk4, rk5 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ldp q10, q11, [x7], #32 /* rk6, rk7 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk8, rk9 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
cmp x9, #12
b.lt .Lenc_main_loop_aes128_4
.Lenc_main_loop_aes192_4:
ldp q10, q11, [x7], #32 /* rk10, rk11 */
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
b.gt .Lenc_main_loop_aes256_4
ld1 {v8.16b},[x7] /* rk12 */
aese v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
b 1f
.Lenc_main_loop_aes256_4:
ldp q8, q9, [x7], #32 /* rk12, rk13 */
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ld1 {v10.16b},[x7] /* rk14 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
b 1f
.Lenc_main_loop_aes128_4:
ld1 {v10.16b},[x7] /* rk10 */
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
1:
st1 {v12.16b}, [x1], #16
/* aes block 5 */
mov x7, x12
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */
ld1 {v13.16b}, [x0], #16
eor v12.16b, v12.16b, v13.16b
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk4, rk5 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ldp q10, q11, [x7], #32 /* rk6, rk7 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk8, rk9 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
cmp x9, #12
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
b.lt .Lenc_main_loop_aes128_5
.Lenc_main_loop_aes192_5:
ldp q10, q11, [x7], #32 /* rk10, rk11 */
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
b.gt .Lenc_main_loop_aes256_5
ld1 {v8.16b},[x7] /* rk12 */
aese v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
b 1f
.Lenc_main_loop_aes256_5:
ldp q8, q9, [x7], #32 /* rk12, rk13 */
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ld1 {v10.16b},[x7] /* rk14 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
b 1f
.Lenc_main_loop_aes128_5:
ld1 {v10.16b},[x7] /* rk10 */
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
1:
st1 {v12.16b}, [x1], #16
/* aes block 6 */
mov x7, x12
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */
ld1 {v13.16b}, [x0], #16
eor v12.16b, v12.16b, v13.16b
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk4, rk5 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
ld1 {$W1.2d},[x10],#16
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ldp q10, q11, [x7], #32 /* rk6, rk7 */
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
ld1 {$W1.2d},[x10],#16
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk8, rk9 */
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
ld1 {$W1.2d},[x10],#16
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
cmp x9, #12
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
b.lt .Lenc_main_loop_aes128_6
.Lenc_main_loop_aes192_6:
ldp q10, q11, [x7], #32 /* rk10, rk11 */
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
b.gt .Lenc_main_loop_aes256_6
ld1 {v8.16b},[x7] /* rk12 */
aese v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
b 1f
.Lenc_main_loop_aes256_6:
ldp q8, q9, [x7], #32 /* rk12, rk13 */
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ld1 {v10.16b},[x7] /* rk14 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
b 1f
.Lenc_main_loop_aes128_6:
ld1 {v10.16b},[x7] /* rk10 */
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
1:
st1 {v12.16b}, [x1], #16
/* aes block 7 */
mov x7, x12
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */
ld1 {v13.16b}, [x0], #16
eor v12.16b, v12.16b, v13.16b
ld1 {$W1.2d},[x10],#16
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
ld1 {$W1.2d},[x10],#16
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk4, rk5 */
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
ld1 {$W1.2d},[x10],#16
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ldp q10, q11, [x7], #32 /* rk6, rk7 */
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
ld1 {$W1.2d},[x10],#16
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk8, rk9 */
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
sub x10, x10, #80*8 // rewind
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
cmp x9, #12
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
b.lt .Lenc_main_loop_aes128_7
.Lenc_main_loop_aes192_7:
ldp q10, q11, [x7], #32 /* rk10, rk11 */
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
b.gt .Lenc_main_loop_aes256_7
ld1 {v8.16b},[x7] /* rk12 */
aese v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
b 1f
.Lenc_main_loop_aes256_7:
ldp q8, q9, [x7], #32 /* rk12, rk13 */
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ld1 {v10.16b},[x7] /* rk14 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
b 1f
.Lenc_main_loop_aes128_7:
ld1 {v10.16b},[x7] /* rk10 */
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
1:
add @H[0].2d, @H[0].2d, $AB.2d
add @H[1].2d, @H[1].2d, $CD.2d
add @H[2].2d, @H[2].2d, $EF.2d
add @H[3].2d, @H[3].2d, $GH.2d
st1 {v12.16b}, [x1], #16
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x14], #64
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x14]
sub x11, x11, #8
cmp x11, #8
b.ge .Lenc_main_loop
/* epilogue - process sha block */
___
&sha512_block(1);
$code.=<<___;
mov x7, x12
ld1 {v0.16b}, [x0], #16 /* load plaintext */
ldr q1, [x14, #48] /* load the last aes output block (CBC chain) */
eor v0.16b, v0.16b, v1.16b
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */
ldp q12, q13, [x7], #32 /* rk4, rk5 */
ldp q14, q15, [x7], #32 /* rk6, rk7 */
ldp q16, q17, [x7], #32 /* rk8, rk9 */
ld1 {v18.16b}, [x7] /* rk10 */
mov w12, #0x80 /* sha padding 0b10000000 */
b .Lenc_less_than_8_block
/* aes_block < 16 */
.Lenc_short_case:
ld1 {v0.16b}, [x0], #16 /* load plaintext */
ld1 {v1.16b}, [x8] /* load iv */
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */
ldp q12, q13, [x7], #32 /* rk4, rk5 */
ldp q14, q15, [x7], #32 /* rk6, rk7 */
ldp q16, q17, [x7], #32 /* rk8, rk9 */
ld1 {v18.16b}, [x7] /* rk10 */
mov w12, #0x80 /* sha padding 0b10000000 */
eor v0.16b, v0.16b, v1.16b /* iv xor plaintext */
cmp x11, #8
b.lt .Lenc_less_than_8_block
___
# process 8 aes blocks
for($i = 0; $i < 8; $i = $i + 1) {
&aes_block_9_rounds($i);
# only distinguish 128/192/256 the first time
&aes_block_last_rounds(($i == 0)?1:0, "enc_short", $i, 0);
if($i != 7) {
$next = $i + 1;
$code.=<<___;
/* load next block */
ld1 {v$next.16b}, [x0], #16
/* output xor block */
eor v$next.16b, v$next.16b, v$i.16b
___
}
}
$code.=<<___;
/* store 8 blocks of ciphertext */
stp q0, q1, [x1], #32
stp q2, q3, [x1], #32
stp q4, q5, [x1], #32
stp q6, q7, [x1], #32
sub x11, x11, #8
___
# now we have a whole sha512 block
&sha512_block(1);
$code.=<<___;
ldr x7, [x6, #CIPHER_KEY]
ldp q8, q9, [x7] /* restore clobbered rk0, rk1 */
add x7, x7, #160 /* x7 points to rk10 */
cbz x11, .Lenc_short_no_more_aes_block
ld1 {v0.16b}, [x0], #16 /* load plaintext */
ldr q1, [x1, #-16]
eor v0.16b, v0.16b, v1.16b
.Lenc_less_than_8_block:
cbz x11, .Lenc_short_no_more_aes_block
___
# process remaining aes blocks (<= 7)
for($i = 0; $i < 7; $i = $i + 1) {
&aes_block_9_rounds($i);
&aes_block_last_rounds(($i == 0)?1:0, "enc_short_partial", $i, 0);
$code.=<<___;
str q$i, [x1], #16
sub x11, x11, #1
cbz x11, .Lenc_short_post_Q$i
___
if($i != 6) {
$next = $i + 1;
$code.=<<___;
/* load next block*/
ld1 {v$next.16b}, [x0], #16
/* output xor block */
eor v$next.16b, v$next.16b, v$i.16b
___
}
}
$code.=<<___;
.Lenc_short_no_more_aes_block:
eor v0.16b, v0.16b, v0.16b
eor v1.16b, v1.16b, v1.16b
eor v2.16b, v2.16b, v2.16b
eor v3.16b, v3.16b, v3.16b
eor v4.16b, v4.16b, v4.16b
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v0.b[0], w12
b .Lenc_short_post_sha
.Lenc_short_post_Q0:
eor v1.16b, v1.16b, v1.16b
eor v2.16b, v2.16b, v2.16b
eor v3.16b, v3.16b, v3.16b
eor v4.16b, v4.16b, v4.16b
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v1.b[0], w12
b .Lenc_short_post_sha
.Lenc_short_post_Q1:
eor v2.16b, v2.16b, v2.16b
eor v3.16b, v3.16b, v3.16b
eor v4.16b, v4.16b, v4.16b
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v2.b[0], w12
b .Lenc_short_post_sha
.Lenc_short_post_Q2:
eor v3.16b, v3.16b, v3.16b
eor v4.16b, v4.16b, v4.16b
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v3.b[0], w12
b .Lenc_short_post_sha
.Lenc_short_post_Q3:
eor v4.16b, v4.16b, v4.16b
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v4.b[0], w12
b .Lenc_short_post_sha
.Lenc_short_post_Q4:
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v5.b[0], w12
b .Lenc_short_post_sha
.Lenc_short_post_Q5:
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v6.b[0], w12
b .Lenc_short_post_sha
.Lenc_short_post_Q6:
eor v7.16b, v7.16b, v7.16b
mov v7.b[0], w12
	/* we now have one padded sha512 block; process it,
	   then use another block to hold the sha length */
___
&sha512_block(1);
$code.=<<___;
eor v0.16b, v0.16b, v0.16b
eor v1.16b, v1.16b, v1.16b
eor v2.16b, v2.16b, v2.16b
eor v3.16b, v3.16b, v3.16b
eor v4.16b, v4.16b, v4.16b
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
.Lenc_short_post_sha:
	/* we now have the last padded sha512 block */
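	/* total length in bits = 1024 (i_key_pad block) + 8 * clen,
	   stored big-endian in the last 16 bytes of the block */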
eor x13, x13, x13 /* length_lo */
eor x14, x14, x14 /* length_hi */
adds x13, x13, x2, lsl #3 /* add len in bits */
lsr x15, x2, #61
adc x14, x14, x15
adds x13, x13, #1024 /* add i_key_pad 1024 bits */
adc x14, x14, xzr
mov v7.d[0], x14
mov v7.d[1], x13
rev64 v7.16b, v7.16b
___
&sha512_block(1);
$code.=<<___;
/* Final HMAC - opad part */
mov v0.16b, v24.16b
mov v1.16b, v25.16b
mov v2.16b, v26.16b
mov v3.16b, v27.16b
eor v4.16b, v4.16b, v4.16b
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v4.b[7], w12 /* padding 1 */
mov x13, #1024+512 /* length in bits */
mov v7.d[1], x13
/* load ABCDEFGH for opad */
ldr x7, [x6, #HMAC_OKEYPAD]
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x7]
___
&sha512_block(0);
$code.=<<___;
.Lret:
mov x0, xzr /* return 0 */
rev64 v24.16b, v24.16b
rev64 v25.16b, v25.16b
rev64 v26.16b, v26.16b
rev64 v27.16b, v27.16b
/* store hash result */
st1 {v24.2d,v25.2d,v26.2d,v27.2d},[x4]
	/* restore callee save registers */
ldp d10, d11, [sp,#16]
ldp d12, d13, [sp,#32]
ldp d14, d15, [sp,#48]
ldp d8, d9, [sp], #64
ret
.size asm_aescbc_sha512_hmac, .-asm_aescbc_sha512_hmac
___
}
{
my @H = map("v$_",(24..28));
my @QH = map("q$_",(24..28));
my ($FG, $DE) = map("v$_",(29..30));
my ($QFG, $QDE) = map("q$_",(29..30));
my $M9_10 = "v31";
my @MSG = map("v$_", (0..7));
my ($W0, $W1) = ("v14", "v15");
my ($AB, $CD, $EF, $GH) = map("v$_",(20..23));
$code.=<<___;
/*
* asm_sha512_hmac_aescbc_dec(
* csrc, x0 (cipher src address)
* cdst, x1 (cipher dst address)
 * clen, x2 (cipher length)
* dsrc, x3 (digest src address)
* ddst, x4 (digest dst address)
* dlen, x5 (digest length)
* arg x6 :
* arg->cipher.key (round keys)
* arg->cipher.key_rounds (key rounds)
* arg->cipher.iv (initialization vector)
* arg->digest.hmac.i_key_pad (partially hashed i_key_pad)
* arg->digest.hmac.o_key_pad (partially hashed o_key_pad)
* )
*/
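/*
 * A sketch of the argument layout implied by the loads below (field
 * names are illustrative, not the library's actual struct):
 *
 *     struct ciph_digest {
 *         const uint8_t *key;        // +CIPHER_KEY, round keys
 *         uint64_t key_rounds;       // +CIPHER_KEY_ROUNDS
 *         const uint8_t *iv;         // +CIPHER_IV
 *         const uint64_t *i_key_pad; // +HMAC_IKEYPAD, partial ipad hash
 *         const uint64_t *o_key_pad; // +HMAC_OKEYPAD, partial opad hash
 *     };
 */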
.global asm_sha512_hmac_aescbc_dec
.type asm_sha512_hmac_aescbc_dec,%function
.align 4
asm_sha512_hmac_aescbc_dec:
AARCH64_VALID_CALL_TARGET
	/* save callee save registers */
stp d8, d9, [sp,#-64]!
stp d10, d11, [sp,#16]
stp d12, d13, [sp,#32]
stp d14, d15, [sp,#48]
/* load ABCDEFGH */
ldr x7, [x6, #HMAC_IKEYPAD]
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x7]
ldr x7, [x6, #CIPHER_KEY]
ldr x8, [x6, #CIPHER_IV]
ldr x9, [x6, #CIPHER_KEY_ROUNDS]
mov x12, x7 /* backup x7 */
adr x10, .LK512
lsr x11, x2, #4 /* aes_block = len/16 */
cbz x11, .Ldec_ret /* return if aes_block = 0 */
ld1 {v20.16b}, [x8] /* load iv */
cmp x11, #8
b.lt .Ldec_short_case
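	/* main loop: each iteration CBC-decrypts 8 blocks and hashes the
	   same 8 ciphertext blocks as one full sha512 message block */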
.Ldec_main_loop:
ldp q12, q13, [x0], #32
ldp q14, q15, [x0], #32
ldp q16, q17, [x0], #32
ldp q18, q19, [x0], #32
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */
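	/* keep ciphertext copies: needed for the CBC XOR and as sha512 input */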
mov v0.16b, v12.16b
mov v1.16b, v13.16b
mov v2.16b, v14.16b
mov v3.16b, v15.16b
mov v4.16b, v16.16b
mov v5.16b, v17.16b
mov v6.16b, v18.16b
mov v7.16b, v19.16b
/* 1 round */
aesd v12.16b, v8.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v8.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v8.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v8.16b
aesimc v15.16b, v15.16b
aesd v16.16b, v8.16b
aesimc v16.16b, v16.16b
aesd v17.16b, v8.16b
aesimc v17.16b, v17.16b
aesd v18.16b, v8.16b
aesimc v18.16b, v18.16b
aesd v19.16b, v8.16b
aesimc v19.16b, v19.16b
/* 2 round */
aesd v12.16b, v9.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v9.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v9.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v9.16b
aesimc v15.16b, v15.16b
aesd v16.16b, v9.16b
aesimc v16.16b, v16.16b
aesd v17.16b, v9.16b
aesimc v17.16b, v17.16b
aesd v18.16b, v9.16b
aesimc v18.16b, v18.16b
aesd v19.16b, v9.16b
aesimc v19.16b, v19.16b
ldp q8, q9, [x7], #32 /* rk4, rk5 */
/* 3 round */
aesd v12.16b, v10.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v10.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v10.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v10.16b
aesimc v15.16b, v15.16b
aesd v16.16b, v10.16b
aesimc v16.16b, v16.16b
aesd v17.16b, v10.16b
aesimc v17.16b, v17.16b
aesd v18.16b, v10.16b
aesimc v18.16b, v18.16b
aesd v19.16b, v10.16b
aesimc v19.16b, v19.16b
/* 4 round */
aesd v12.16b, v11.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v11.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v11.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v11.16b
aesimc v15.16b, v15.16b
aesd v16.16b, v11.16b
aesimc v16.16b, v16.16b
aesd v17.16b, v11.16b
aesimc v17.16b, v17.16b
aesd v18.16b, v11.16b
aesimc v18.16b, v18.16b
aesd v19.16b, v11.16b
aesimc v19.16b, v19.16b
ldp q10, q11, [x7], #32 /* rk6, rk7 */
/* 5 round */
aesd v12.16b, v8.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v8.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v8.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v8.16b
aesimc v15.16b, v15.16b
aesd v16.16b, v8.16b
aesimc v16.16b, v16.16b
aesd v17.16b, v8.16b
aesimc v17.16b, v17.16b
aesd v18.16b, v8.16b
aesimc v18.16b, v18.16b
aesd v19.16b, v8.16b
aesimc v19.16b, v19.16b
/* 6 round */
aesd v12.16b, v9.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v9.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v9.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v9.16b
aesimc v15.16b, v15.16b
aesd v16.16b, v9.16b
aesimc v16.16b, v16.16b
aesd v17.16b, v9.16b
aesimc v17.16b, v17.16b
aesd v18.16b, v9.16b
aesimc v18.16b, v18.16b
aesd v19.16b, v9.16b
aesimc v19.16b, v19.16b
ldp q8, q9, [x7], #32 /* rk8, rk9 */
/* 7 round */
aesd v12.16b, v10.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v10.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v10.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v10.16b
aesimc v15.16b, v15.16b
aesd v16.16b, v10.16b
aesimc v16.16b, v16.16b
aesd v17.16b, v10.16b
aesimc v17.16b, v17.16b
aesd v18.16b, v10.16b
aesimc v18.16b, v18.16b
aesd v19.16b, v10.16b
aesimc v19.16b, v19.16b
/* 8 round */
aesd v12.16b, v11.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v11.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v11.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v11.16b
aesimc v15.16b, v15.16b
aesd v16.16b, v11.16b
aesimc v16.16b, v16.16b
aesd v17.16b, v11.16b
aesimc v17.16b, v17.16b
aesd v18.16b, v11.16b
aesimc v18.16b, v18.16b
aesd v19.16b, v11.16b
aesimc v19.16b, v19.16b
/* 9 round */
aesd v12.16b, v8.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v8.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v8.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v8.16b
aesimc v15.16b, v15.16b
aesd v16.16b, v8.16b
aesimc v16.16b, v16.16b
aesd v17.16b, v8.16b
aesimc v17.16b, v17.16b
aesd v18.16b, v8.16b
aesimc v18.16b, v18.16b
aesd v19.16b, v8.16b
aesimc v19.16b, v19.16b
cmp x9, #12 /* tell 128,192,256 apart */
b.lt .Laes128_dec_main
.Laes192_dec_main:
ldp q10,q11,[x7],32 /* rk10,rk11 */
/* 10 round */
aesd v12.16b, v9.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v9.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v9.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v9.16b
aesimc v15.16b, v15.16b
aesd v16.16b, v9.16b
aesimc v16.16b, v16.16b
aesd v17.16b, v9.16b
aesimc v17.16b, v17.16b
aesd v18.16b, v9.16b
aesimc v18.16b, v18.16b
aesd v19.16b, v9.16b
aesimc v19.16b, v19.16b
/* 11 round */
aesd v12.16b, v10.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v10.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v10.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v10.16b
aesimc v15.16b, v15.16b
aesd v16.16b, v10.16b
aesimc v16.16b, v16.16b
aesd v17.16b, v10.16b
aesimc v17.16b, v17.16b
aesd v18.16b, v10.16b
aesimc v18.16b, v18.16b
aesd v19.16b, v10.16b
aesimc v19.16b, v19.16b
b.gt .Laes256_dec_main
ld1 {v8.16b},[x7] /* rk12 */
	/* 12 round */
aesd v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
aesd v13.16b, v11.16b
eor v13.16b, v13.16b, v8.16b
aesd v14.16b, v11.16b
eor v14.16b, v14.16b, v8.16b
aesd v15.16b, v11.16b
eor v15.16b, v15.16b, v8.16b
aesd v16.16b, v11.16b
eor v16.16b, v16.16b, v8.16b
aesd v17.16b, v11.16b
eor v17.16b, v17.16b, v8.16b
aesd v18.16b, v11.16b
eor v18.16b, v18.16b, v8.16b
aesd v19.16b, v11.16b
eor v19.16b, v19.16b, v8.16b
sub x7, x7, #192 /* rewind x7 */
b 1f
.Laes256_dec_main:
ldp q8,q9,[x7],32 /* rk12,rk13 */
/* 12 round */
aesd v12.16b, v11.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v11.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v11.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v11.16b
aesimc v15.16b, v15.16b
aesd v16.16b, v11.16b
aesimc v16.16b, v16.16b
aesd v17.16b, v11.16b
aesimc v17.16b, v17.16b
aesd v18.16b, v11.16b
aesimc v18.16b, v18.16b
aesd v19.16b, v11.16b
aesimc v19.16b, v19.16b
/* 13 round */
aesd v12.16b, v8.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v8.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v8.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v8.16b
aesimc v15.16b, v15.16b
aesd v16.16b, v8.16b
aesimc v16.16b, v16.16b
aesd v17.16b, v8.16b
aesimc v17.16b, v17.16b
aesd v18.16b, v8.16b
aesimc v18.16b, v18.16b
aesd v19.16b, v8.16b
aesimc v19.16b, v19.16b
ld1 {v10.16b},[x7] /* rk14 */
/* 14 round */
aesd v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
aesd v13.16b, v9.16b
eor v13.16b, v13.16b, v10.16b
aesd v14.16b, v9.16b
eor v14.16b, v14.16b, v10.16b
aesd v15.16b, v9.16b
eor v15.16b, v15.16b, v10.16b
aesd v16.16b, v9.16b
eor v16.16b, v16.16b, v10.16b
aesd v17.16b, v9.16b
eor v17.16b, v17.16b, v10.16b
aesd v18.16b, v9.16b
eor v18.16b, v18.16b, v10.16b
aesd v19.16b, v9.16b
eor v19.16b, v19.16b, v10.16b
sub x7, x7, #224
b 1f
.Laes128_dec_main:
ld1 {v10.16b},[x7] /* rk10 */
aesd v12.16b,v9.16b
eor v12.16b, v12.16b, v10.16b
aesd v13.16b,v9.16b
eor v13.16b, v13.16b, v10.16b
aesd v14.16b,v9.16b
eor v14.16b, v14.16b, v10.16b
aesd v15.16b,v9.16b
eor v15.16b, v15.16b, v10.16b
aesd v16.16b,v9.16b
eor v16.16b, v16.16b, v10.16b
aesd v17.16b,v9.16b
eor v17.16b, v17.16b, v10.16b
aesd v18.16b,v9.16b
eor v18.16b, v18.16b, v10.16b
aesd v19.16b,v9.16b
eor v19.16b, v19.16b, v10.16b
sub x7, x7, #160
1:
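	/* CBC: plaintext[i] = AES^-1(cipher[i]) ^ cipher[i-1];
	   v20 is the iv or the previous batch's last ciphertext block */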
eor v12.16b, v12.16b, v20.16b
eor v13.16b, v13.16b, v0.16b
eor v14.16b, v14.16b, v1.16b
eor v15.16b, v15.16b, v2.16b
eor v16.16b, v16.16b, v3.16b
eor v17.16b, v17.16b, v4.16b
eor v18.16b, v18.16b, v5.16b
eor v19.16b, v19.16b, v6.16b
stp q12,q13, [x1], #32
ldr q12, [x0, #-16] /* load last cipher */
stp q14,q15, [x1], #32
stp q16,q17, [x1], #32
stp q18,q19, [x1], #32
___
&sha512_block(1);
$code.=<<___;
	mov v20.16b, v12.16b /* save last cipher block as next IV */
sub x11, x11, #8
cmp x11, #8
b.ge .Ldec_main_loop
/* aes_block < 8 */
.Ldec_short_case:
mov w12, #0x80 /* sha padding 0b10000000 */
cbnz x11, 1f
eor v0.16b, v0.16b, v0.16b
eor v1.16b, v1.16b, v1.16b
eor v2.16b, v2.16b, v2.16b
eor v3.16b, v3.16b, v3.16b
eor v4.16b, v4.16b, v4.16b
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v0.b[0], w12
b .Ldec_short_post_sha
1:
cmp x11, #4
b.lt .Ldec_less_than_4_block
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */
ldp q12, q13, [x0], #32
ldp q14, q15, [x0], #32
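	/* keep ciphertext copies in v0-v3 for the sha512 message */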
mov v0.16b, v12.16b
mov v1.16b, v13.16b
mov v2.16b, v14.16b
mov v3.16b, v15.16b
/* 1 round */
aesd v12.16b, v8.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v8.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v8.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v8.16b
aesimc v15.16b, v15.16b
/* 2 round */
aesd v12.16b, v9.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v9.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v9.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v9.16b
aesimc v15.16b, v15.16b
ldp q8, q9, [x7], #32 /* rk4, rk5 */
/* 3 round */
aesd v12.16b, v10.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v10.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v10.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v10.16b
aesimc v15.16b, v15.16b
/* 4 round */
aesd v12.16b, v11.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v11.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v11.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v11.16b
aesimc v15.16b, v15.16b
ldp q10, q11, [x7], #32 /* rk6, rk7 */
/* 5 round */
aesd v12.16b, v8.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v8.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v8.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v8.16b
aesimc v15.16b, v15.16b
/* 6 round */
aesd v12.16b, v9.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v9.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v9.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v9.16b
aesimc v15.16b, v15.16b
ldp q8, q9, [x7], #32 /* rk8, rk9 */
/* 7 round */
aesd v12.16b, v10.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v10.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v10.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v10.16b
aesimc v15.16b, v15.16b
/* 8 round */
aesd v12.16b, v11.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v11.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v11.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v11.16b
aesimc v15.16b, v15.16b
/* 9 round */
aesd v12.16b, v8.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v8.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v8.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v8.16b
aesimc v15.16b, v15.16b
cmp x9, #12 /* tell 128,192,256 apart */
b.lt .Laes128_dec_short
.Laes192_dec_short:
ldp q10,q11,[x7],32 /* rk10,rk11 */
/* 10 round */
aesd v12.16b, v9.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v9.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v9.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v9.16b
aesimc v15.16b, v15.16b
/* 11 round */
aesd v12.16b, v10.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v10.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v10.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v10.16b
aesimc v15.16b, v15.16b
b.gt .Laes256_dec_short
ld1 {v8.16b},[x7] /* rk12 */
	/* 12 round */
aesd v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
aesd v13.16b, v11.16b
eor v13.16b, v13.16b, v8.16b
aesd v14.16b, v11.16b
eor v14.16b, v14.16b, v8.16b
aesd v15.16b, v11.16b
eor v15.16b, v15.16b, v8.16b
sub x7, x7, #192 /* rewind x7 */
b 1f
.Laes256_dec_short:
ldp q8,q9,[x7],32 /* rk12,rk13 */
/* 12 round */
aesd v12.16b, v11.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v11.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v11.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v11.16b
aesimc v15.16b, v15.16b
/* 13 round */
aesd v12.16b, v8.16b
aesimc v12.16b, v12.16b
aesd v13.16b, v8.16b
aesimc v13.16b, v13.16b
aesd v14.16b, v8.16b
aesimc v14.16b, v14.16b
aesd v15.16b, v8.16b
aesimc v15.16b, v15.16b
ld1 {v10.16b},[x7] /* rk14 */
/* 14 round */
aesd v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
aesd v13.16b, v9.16b
eor v13.16b, v13.16b, v10.16b
aesd v14.16b, v9.16b
eor v14.16b, v14.16b, v10.16b
aesd v15.16b, v9.16b
eor v15.16b, v15.16b, v10.16b
sub x7, x7, #224
b 1f
.Laes128_dec_short:
ld1 {v10.16b},[x7] /* rk10 */
aesd v12.16b,v9.16b
eor v12.16b, v12.16b, v10.16b
aesd v13.16b,v9.16b
eor v13.16b, v13.16b, v10.16b
aesd v14.16b,v9.16b
eor v14.16b, v14.16b, v10.16b
aesd v15.16b,v9.16b
eor v15.16b, v15.16b, v10.16b
sub x7, x7, #160
1:
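	/* CBC XOR with the iv / preceding ciphertext blocks */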
eor v12.16b, v12.16b, v20.16b
eor v13.16b, v13.16b, v0.16b
eor v14.16b, v14.16b, v1.16b
eor v15.16b, v15.16b, v2.16b
ldr q20, [x0, #-16]
sub x11, x11, #4
stp q12,q13, [x1], #32
stp q14,q15, [x1], #32
cbz x11, .Ldec_short_post_Q3
___
for($i = 0; $i < 3; $i = $i + 1) {
$block = $i + 4;
$code.=<<___;
ld1 {v16.16b}, [x0], #16
	mov v$block.16b, v16.16b /* keep ciphertext for the sha512 message */
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */
aesd v16.16b, v8.16b
aesimc v16.16b, v16.16b
aesd v16.16b, v9.16b
aesimc v16.16b, v16.16b
ldp q8, q9, [x7], #32 /* rk4, rk5 */
aesd v16.16b, v10.16b
aesimc v16.16b, v16.16b
aesd v16.16b, v11.16b
aesimc v16.16b, v16.16b
ldp q10, q11, [x7], #32 /* rk6, rk7 */
aesd v16.16b, v8.16b
aesimc v16.16b, v16.16b
aesd v16.16b, v9.16b
aesimc v16.16b, v16.16b
ldp q8, q9, [x7], #32 /* rk8, rk9 */
aesd v16.16b, v10.16b
aesimc v16.16b, v16.16b
aesd v16.16b, v11.16b
aesimc v16.16b, v16.16b
aesd v16.16b, v8.16b
aesimc v16.16b, v16.16b
cmp x9, #12 /* tell 128,192,256 apart */
b.lt .Laes128_dec_short_$block
.Laes192_dec_short_$block:
ldp q10,q11,[x7],32 /* rk10,rk11 */
aesd v16.16b, v9.16b
aesimc v16.16b, v16.16b
aesd v16.16b, v10.16b
aesimc v16.16b, v16.16b
b.gt .Laes256_dec_short_$block
ld1 {v8.16b},[x7] /* rk12 */
aesd v16.16b, v11.16b
eor v16.16b, v16.16b, v8.16b
sub x7, x7, #192 /* rewind x7 */
b 1f
.Laes256_dec_short_$block:
ldp q8,q9,[x7],32 /* rk12,rk13 */
aesd v16.16b, v11.16b
aesimc v16.16b, v16.16b
aesd v16.16b, v8.16b
aesimc v16.16b, v16.16b
ld1 {v10.16b},[x7] /* rk14 */
aesd v16.16b, v9.16b
eor v16.16b, v16.16b, v10.16b
sub x7, x7, #224
b 1f
.Laes128_dec_short_$block:
ld1 {v10.16b},[x7] /* rk10 */
aesd v16.16b,v9.16b
eor v16.16b, v16.16b, v10.16b
sub x7, x7, #160
1:
	sub x11, x11, #1
eor v16.16b, v16.16b, v20.16b
ldr q20, [x0, #-16]
st1 {v16.16b}, [x1], #16
cbz x11, .Ldec_short_post_Q$block
___
}
$code.=<<___;
.Ldec_short_post_Q3:
eor v4.16b, v4.16b, v4.16b
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v4.b[0], w12
b .Ldec_short_post_sha
.Ldec_short_post_Q4:
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v5.b[0], w12
b .Ldec_short_post_sha
.Ldec_short_post_Q5:
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v6.b[0], w12
b .Ldec_short_post_sha
.Ldec_short_post_Q6:
eor v7.16b, v7.16b, v7.16b
mov v7.b[0], w12
	/* we now have one padded sha512 block; process it,
	   then use another block to hold the sha length */
___
&sha512_block(1);
$code.=<<___;
eor v0.16b, v0.16b, v0.16b
eor v1.16b, v1.16b, v1.16b
eor v2.16b, v2.16b, v2.16b
eor v3.16b, v3.16b, v3.16b
eor v4.16b, v4.16b, v4.16b
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
b .Ldec_short_post_sha
.Ldec_less_than_4_block:
___
for($i = 0; $i < 3; $i = $i + 1) {
$code.=<<___;
ld1 {v16.16b}, [x0], #16
	mov v$i.16b, v16.16b /* keep ciphertext for the sha512 message */
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */
aesd v16.16b, v8.16b
aesimc v16.16b, v16.16b
aesd v16.16b, v9.16b
aesimc v16.16b, v16.16b
ldp q8, q9, [x7], #32 /* rk4, rk5 */
aesd v16.16b, v10.16b
aesimc v16.16b, v16.16b
aesd v16.16b, v11.16b
aesimc v16.16b, v16.16b
ldp q10, q11, [x7], #32 /* rk6, rk7 */
aesd v16.16b, v8.16b
aesimc v16.16b, v16.16b
aesd v16.16b, v9.16b
aesimc v16.16b, v16.16b
ldp q8, q9, [x7], #32 /* rk8, rk9 */
aesd v16.16b, v10.16b
aesimc v16.16b, v16.16b
aesd v16.16b, v11.16b
aesimc v16.16b, v16.16b
aesd v16.16b, v8.16b
aesimc v16.16b, v16.16b
cmp x9, #12 /* tell 128,192,256 apart */
b.lt .Laes128_dec_short_less_than_4_$i
.Laes192_dec_short_less_than_4_$i:
ldp q10,q11,[x7],32 /* rk10,rk11 */
aesd v16.16b, v9.16b
aesimc v16.16b, v16.16b
aesd v16.16b, v10.16b
aesimc v16.16b, v16.16b
b.gt .Laes256_dec_short_less_than_4_$i
ld1 {v8.16b},[x7] /* rk12 */
aesd v16.16b, v11.16b
eor v16.16b, v16.16b, v8.16b
sub x7, x7, #192 /* rewind x7 */
b 1f
.Laes256_dec_short_less_than_4_$i:
ldp q8,q9,[x7],32 /* rk12,rk13 */
aesd v16.16b, v11.16b
aesimc v16.16b, v16.16b
aesd v16.16b, v8.16b
aesimc v16.16b, v16.16b
ld1 {v10.16b},[x7] /* rk14 */
aesd v16.16b, v9.16b
eor v16.16b, v16.16b, v10.16b
sub x7, x7, #224
b 1f
.Laes128_dec_short_less_than_4_$i:
ld1 {v10.16b},[x7] /* rk10 */
aesd v16.16b,v9.16b
eor v16.16b, v16.16b, v10.16b
sub x7, x7, #160
1:
	sub x11, x11, #1
eor v16.16b, v16.16b, v20.16b
ldr q20, [x0, #-16]
st1 {v16.16b}, [x1], #16
cbz x11, .Ldec_short_post_Q$i
___
}
$code.=<<___;
.Ldec_short_post_Q0:
eor v1.16b, v1.16b, v1.16b
eor v2.16b, v2.16b, v2.16b
eor v3.16b, v3.16b, v3.16b
eor v4.16b, v4.16b, v4.16b
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v1.b[0], w12
b .Ldec_short_post_sha
.Ldec_short_post_Q1:
eor v2.16b, v2.16b, v2.16b
eor v3.16b, v3.16b, v3.16b
eor v4.16b, v4.16b, v4.16b
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v2.b[0], w12
b .Ldec_short_post_sha
.Ldec_short_post_Q2:
eor v3.16b, v3.16b, v3.16b
eor v4.16b, v4.16b, v4.16b
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v3.b[0], w12
b .Ldec_short_post_sha
.Ldec_short_post_sha:
	/* we now have the last padded sha512 block */
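	/* same length computation as the encrypt path:
	   1024 i_key_pad bits + 8 * clen data bits, stored big-endian */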
eor x13, x13, x13 /* length_lo */
eor x14, x14, x14 /* length_hi */
adds x13, x13, x2, lsl #3 /* add len in bits */
lsr x15, x2, #61
adc x14, x14, x15
adds x13, x13, #1024 /* add i_key_pad 1024 bits */
adc x14, x14, xzr
mov v7.d[0], x14
mov v7.d[1], x13
rev64 v7.16b, v7.16b
___
&sha512_block(1);
$code.=<<___;
/* Final HMAC - opad part */
mov v0.16b, v24.16b
mov v1.16b, v25.16b
mov v2.16b, v26.16b
mov v3.16b, v27.16b
eor v4.16b, v4.16b, v4.16b
eor v5.16b, v5.16b, v5.16b
eor v6.16b, v6.16b, v6.16b
eor v7.16b, v7.16b, v7.16b
mov v4.b[7], w12 /* padding 1 */
mov x13, #1024+512 /* length in bits */
mov v7.d[1], x13
/* load ABCDEFGH for opad */
ldr x7, [x6, #HMAC_OKEYPAD]
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x7]
___
&sha512_block(0);
$code.=<<___;
.Ldec_ret:
mov x0, xzr /* return 0 */
rev64 v24.16b, v24.16b
rev64 v25.16b, v25.16b
rev64 v26.16b, v26.16b
rev64 v27.16b, v27.16b
/* store hash result */
st1 {v24.2d,v25.2d,v26.2d,v27.2d},[x4]
	/* restore callee save registers */
ldp d10, d11, [sp,#16]
ldp d12, d13, [sp,#32]
ldp d14, d15, [sp,#48]
ldp d8, d9, [sp], #64
ret
.size asm_sha512_hmac_aescbc_dec, .-asm_sha512_hmac_aescbc_dec
___
}
#########################################
{ my %opcode = (
"sha512h" => 0xce608000, "sha512h2" => 0xce608400,
"sha512su0" => 0xcec08000, "sha512su1" => 0xce608800 );
sub unsha512 {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}
}
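# Replay this script's leading comment block (the license header) into the
# generated file, turning '#' comments into '//'.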
open SELF,$0;
while(<SELF>) {
next if (/^#!/);
last if (!s/^#/\/\// and !/^$/);
print;
}
close SELF;
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/ge;
s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge;
print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";