#! /usr/bin/env perl

# Copyright 2023-2025 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;

$code=<<___;
#include "arm_arch.h"

/* These are offsets into the CIPH_DIGEST struct */
#define CIPHER_KEY 0
#define CIPHER_KEY_ROUNDS 8
#define CIPHER_IV 16
#define HMAC_IKEYPAD 24
#define HMAC_OKEYPAD 32

.text
.arch armv8-a+crypto
___

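# aes_block_9_rounds($i): emit the nine common AES encryption rounds
# (aese + aesmc) for the block held in v$i, using round keys rk0..rk8
# preloaded in v8..v16.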
sub aes_block_9_rounds() {
my $i = shift;
$code.=<<___;
/* aes block $i */
aese v$i.16b, v8.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v9.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v10.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v11.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v12.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v13.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v14.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v15.16b
aesmc v$i.16b, v$i.16b
aese v$i.16b, v16.16b
aesmc v$i.16b, v$i.16b
___
}

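# aes_block_last_rounds($compare, $label, $i, $load_rk10): emit the
# key-size dependent tail rounds for block v$i. $compare emits the
# "cmp x9, #12" that tells 128/192/256 apart, $label builds unique
# local labels, and $load_rk10 reloads rk[10] into v18 for the
# AES-128 path.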
sub aes_block_last_rounds () {
my $compare = shift;
my $label = shift;
my $i = shift;
my $load_rk10 = shift;

if($compare == 1) {
$code.=<<___;
cmp x9, #12 /* tell 128,192,256 apart */
___
}
$code.=<<___;
b.lt .Laes128_${label}_$i
.Laes192_${label}_$i:
ldp q18,q19,[x7],32 /* rk[10],rk[11] */
aese v$i.16b,v17.16b
aesmc v$i.16b,v$i.16b
aese v$i.16b,v18.16b
aesmc v$i.16b,v$i.16b
b.gt .Laes256_${label}_$i
ld1 {v18.16b},[x7] /* rk[12] */
aese v$i.16b,v19.16b
eor v$i.16b,v$i.16b,v18.16b
sub x7, x7, #32 /* rewind x7 */
b 1f
.Laes256_${label}_$i:
aese v$i.16b,v19.16b
aesmc v$i.16b,v$i.16b
ldp q18,q19,[x7],32 /* rk[12],rk[13] */
aese v$i.16b,v18.16b
aesmc v$i.16b,v$i.16b
ld1 {v18.16b},[x7] /* rk[14] */
aese v$i.16b,v19.16b
eor v$i.16b,v$i.16b,v18.16b
sub x7, x7, #64 /* rewind x7 */
b 1f
.Laes128_${label}_$i:
___
if ($load_rk10 == 1) {
$code.=<<___;
ld1 {v18.16b},[x7]
___
}
$code.=<<___;
aese v$i.16b,v17.16b
eor v$i.16b,v$i.16b,v18.16b /* res */
1:
___
}

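# aes_block_dec_9_rounds($i): decryption counterpart of
# aes_block_9_rounds, using aesd/aesimc with the same key registers.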
sub aes_block_dec_9_rounds() {
my $i = shift;
$code.=<<___;
/* aes block $i */
aesd v$i.16b, v8.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v9.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v10.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v11.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v12.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v13.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v14.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v15.16b
aesimc v$i.16b, v$i.16b
aesd v$i.16b, v16.16b
aesimc v$i.16b, v$i.16b
___
}

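# aes_block_dec_last_rounds(): decryption counterpart of
# aes_block_last_rounds, selecting the 128/192/256 tail with aesd.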
sub aes_block_dec_last_rounds () {
my $compare = shift;
my $label = shift;
my $i = shift;
my $load_rk10 = shift;

if($compare == 1) {
$code.=<<___;
cmp x9, #12 /* tell 128,192,256 apart */
___
}
$code.=<<___;
b.lt .Laes128_${label}_$i
.Laes192_${label}_$i:
ldp q18,q19,[x7],32 /* rk[10],rk[11] */
aesd v$i.16b,v17.16b
aesimc v$i.16b,v$i.16b
aesd v$i.16b,v18.16b
aesimc v$i.16b,v$i.16b
b.gt .Laes256_${label}_$i
ld1 {v18.16b},[x7] /* rk[12] */
aesd v$i.16b,v19.16b
eor v$i.16b,v$i.16b,v18.16b
sub x7, x7, #32 /* rewind x7 */
b 1f
.Laes256_${label}_$i:
aesd v$i.16b,v19.16b
aesimc v$i.16b,v$i.16b
ldp q18,q19,[x7],32 /* rk[12],rk[13] */
aesd v$i.16b,v18.16b
aesimc v$i.16b,v$i.16b
ld1 {v18.16b},[x7] /* rk[14] */
aesd v$i.16b,v19.16b
eor v$i.16b,v$i.16b,v18.16b
sub x7, x7, #64 /* rewind x7 */
b 1f
.Laes128_${label}_$i:
___
if ($load_rk10 == 1) {
$code.=<<___;
ld1 {v18.16b},[x7]
___
}
$code.=<<___;
aesd v$i.16b,v17.16b
eor v$i.16b,v$i.16b,v18.16b /* res */
1:
___
}

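# sha512_block($need_revert): emit one full SHA-512 compression of the
# 128-byte message in v0..v7 into the hash state in v24..v27 (v28 is
# used as scratch for the round rotation), reading round constants from
# x10; $need_revert byte-swaps the message first.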
sub sha512_block() {
my @H = map("v$_",(24..28));
my @QH = map("q$_",(24..28));
my ($FG, $DE) = map("v$_",(29..30));
my ($QFG, $QDE) = map("q$_",(29..30));
my $M9_10 = "v31";
my @MSG = map("v$_", (0..7));
my ($W0, $W1) = ("v8", "v9");
my ($AB, $CD, $EF, $GH) = map("v$_",(20..23));
my $need_revert = shift;

if($need_revert == 1) {
$code.=<<___;
rev64 @MSG[0].16b, @MSG[0].16b
rev64 @MSG[1].16b, @MSG[1].16b
rev64 @MSG[2].16b, @MSG[2].16b
rev64 @MSG[3].16b, @MSG[3].16b
rev64 @MSG[4].16b, @MSG[4].16b
rev64 @MSG[5].16b, @MSG[5].16b
rev64 @MSG[6].16b, @MSG[6].16b
rev64 @MSG[7].16b, @MSG[7].16b
___
}
$code.=<<___;
/* load const k */
ld1 {$W0.2d}, [x10], #16

/* backup ABCDEFGH */
mov $AB.16b, @H[0].16b
mov $CD.16b, @H[1].16b
mov $EF.16b, @H[2].16b
mov $GH.16b, @H[3].16b
___
for($i = 0; $i < 32; $i++) {
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8

/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d

/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
}
for(;$i<40;$i++) {
$code.=<<___ if ($i<39);
ld1 {$W1.2d},[x10],#16
___
$code.=<<___ if ($i==39);
sub x10, x10, #80*8 // rewind
___
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8

/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
}
$code.=<<___;
add @H[0].2d, @H[0].2d, $AB.2d
add @H[1].2d, @H[1].2d, $CD.2d
add @H[2].2d, @H[2].2d, $EF.2d
add @H[3].2d, @H[3].2d, $GH.2d
___
}

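# Encrypt path. Register conventions for the code below: the SHA-512
# state lives in v24..v28, the round constants Kt in v14/v15, the
# 128-byte message block in v0..v7, and, in the main loop, v12/v13
# carry the AES block being CBC-encrypted.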
{
my @H = map("v$_",(24..28));
my @QH = map("q$_",(24..28));
my ($FG, $DE) = map("v$_",(29..30));
my ($QFG, $QDE) = map("q$_",(29..30));
my $M9_10 = "v31";
my @MSG = map("v$_", (0..7));
my ($W0, $W1) = ("v14", "v15");
my ($AB, $CD, $EF, $GH) = map("v$_",(20..23));

$code.=<<___;
/*
* asm_aescbc_sha512_hmac(
* csrc, x0 (cipher src address)
* cdst, x1 (cipher dst address)
* clen, x2 (cipher length)
* dsrc, x3 (digest src address)
* ddst, x4 (digest dst address)
* dlen, x5 (digest length)
* arg x6 :
* arg->cipher.key (round keys)
* arg->cipher.key_rounds (key rounds)
* arg->cipher.iv (initialization vector)
* arg->digest.hmac.i_key_pad (partially hashed i_key_pad)
* arg->digest.hmac.o_key_pad (partially hashed o_key_pad)
* )
*/

.global asm_aescbc_sha512_hmac
.type asm_aescbc_sha512_hmac,%function

.align 6
.LK512:
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538,0x59f111f1b605d019
.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242,0x12835b0145706fbe
.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235,0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad 0x983e5152ee66dfab,0xa831c66d2db43210
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad 0x06ca6351e003826f,0x142929670a0e6e70
.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6,0x92722c851482353b
.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
.quad 0xd192e819d6ef5218,0xd69906245565a910
.quad 0xf40e35855771202a,0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad 0x90befffa23631e28,0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad 0xca273eceea26619c,0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae,0x1b710b35131c471b
.quad 0x28db77f523047d84,0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad 0 // terminator

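/*
 * The first eight blocks are CBC-encrypted up front ("prelog"); every
 * main-loop iteration then runs the SHA-512 compression of the previous
 * 128 bytes (held in v0-v7) interleaved with the AES rounds of the next
 * eight blocks.
 */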
.align 4
asm_aescbc_sha512_hmac:
AARCH64_VALID_CALL_TARGET
/* save callee save register */
stp d8, d9, [sp,#-64]!
stp d10, d11, [sp,#16]
stp d12, d13, [sp,#32]
stp d14, d15, [sp,#48]

/* load ABCDEFGH */
ldr x7, [x6, #HMAC_IKEYPAD]
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x7]

ldr x7, [x6, #CIPHER_KEY]
ldr x8, [x6, #CIPHER_IV]
ldr x9, [x6, #CIPHER_KEY_ROUNDS]
mov x12, x7 /* backup x7 */

adr x10, .LK512

lsr x11, x2, #4 /* aes_block = len/16 */
cbz x11, .Lret /* return if aes_block = 0 */

cmp x11, #16
b.lt .Lenc_short_case

ld1 {v0.16b}, [x0], #16 /* load plaintext */
ld1 {v1.16b}, [x8] /* load iv */

eor v0.16b, v0.16b, v1.16b /* iv xor plaintext */

ldp q8, q9, [x7], #32 /* rk0, rk1 */
/* block 0 */
aese v0.16b, v8.16b
aesmc v0.16b, v0.16b
ldp q10, q11, [x7], #32 /* rk2, rk3 */
aese v0.16b, v9.16b
aesmc v0.16b, v0.16b
aese v0.16b, v10.16b
aesmc v0.16b, v0.16b
ldp q12, q13, [x7], #32 /* rk4, rk5 */
aese v0.16b, v11.16b
aesmc v0.16b, v0.16b
aese v0.16b, v12.16b
aesmc v0.16b, v0.16b
ldp q14, q15, [x7], #32 /* rk6, rk7 */
aese v0.16b, v13.16b
aesmc v0.16b, v0.16b
aese v0.16b, v14.16b
aesmc v0.16b, v0.16b
ldp q16, q17, [x7], #32 /* rk8, rk9 */
aese v0.16b, v15.16b
aesmc v0.16b, v0.16b
aese v0.16b, v16.16b
aesmc v0.16b, v0.16b
ld1 {v18.16b}, [x7] /* rk10 */
___
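# finish block 0 with the key-size dependent tail rounds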
&aes_block_last_rounds(1, "enc_prelog", 0, 0);
$code.=<<___;
str q0, [x1], #16 /* store cipher result */
ld1 {v1.16b}, [x0], #16 /* load next block */
eor v1.16b, v1.16b, v0.16b /* output xor block */
___
# process aes blocks from 1 to 7
for($i = 1; $i < 8; $i = $i + 1) {
&aes_block_9_rounds($i);
&aes_block_last_rounds(0, "enc_prelog", $i, 0);
if($i != 7) {
$next = $i + 1;
$code.=<<___;
/* load next block */
ld1 {v$next.16b}, [x0], #16
/* output xor block */
eor v$next.16b, v$next.16b, v$i.16b
___
}
$code.=<<___;
str q$i, [x1], #16 /* store cipher result */
___
}
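# Main loop: each iteration CBC-encrypts eight blocks while hashing the
# 128 bytes of ciphertext produced by the previous iteration (v0..v7),
# interleaving aese/aesmc with the sha512h round pairs.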
$code.=<<___;
sub x11, x11, #8

.Lenc_main_loop:
mov x7, x12
mov x14, x1
/* aes block 0 */
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */

ld1 {v12.16b}, [x0], #16
eor v12.16b, v12.16b, v7.16b

/* reverse message */
rev64 @MSG[0].16b, @MSG[0].16b
rev64 @MSG[1].16b, @MSG[1].16b
rev64 @MSG[2].16b, @MSG[2].16b
rev64 @MSG[3].16b, @MSG[3].16b
rev64 @MSG[4].16b, @MSG[4].16b
rev64 @MSG[5].16b, @MSG[5].16b
rev64 @MSG[6].16b, @MSG[6].16b
rev64 @MSG[7].16b, @MSG[7].16b
ld1 {$W0.2d}, [x10], #16 /* load const k*/

/* backup ABCDEFGH */
mov $AB.16b, @H[0].16b
mov $CD.16b, @H[1].16b
mov $EF.16b, @H[2].16b
mov $GH.16b, @H[3].16b

add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk4, rk5 */
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
ldp q10, q11, [x7], #32 /* rk6, rk7 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
ldp q8, q9, [x7], #32 /* rk8, rk9 */
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
cmp x9, #12
b.lt .Lenc_main_loop_aes128_0
.Lenc_main_loop_aes192_0:
ldp q10, q11, [x7], #32 /* rk10, rk11 */
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
b.gt .Lenc_main_loop_aes256_0
ld1 {v8.16b},[x7] /* rk12 */
aese v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
b 1f
.Lenc_main_loop_aes256_0:
ldp q8, q9, [x7], #32 /* rk12, rk13 */
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ld1 {v10.16b},[x7] /* rk14 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
b 1f
.Lenc_main_loop_aes128_0:
ld1 {v10.16b},[x7] /* rk10 */
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
1:
st1 {v12.16b}, [x1], #16
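/* blocks 1-7 below repeat the same AES/SHA-512 interleaving; only the
   positions of the round-key loads differ, and the message-schedule
   updates (sha512su0/su1) drop out in the final rounds */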
/* aes block 1 */
mov x7, x12
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */

ld1 {v13.16b}, [x0], #16
eor v12.16b, v12.16b, v13.16b

add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8

aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
ldp q8, q9, [x7], #32 /* rk4, rk5 */
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ldp q10, q11, [x7], #32 /* rk6, rk7 */
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
ldp q8, q9, [x7], #32 /* rk8, rk9 */
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
cmp x9, #12
b.lt .Lenc_main_loop_aes128_1
.Lenc_main_loop_aes192_1:
ldp q10, q11, [x7], #32 /* rk10, rk11 */
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
b.gt .Lenc_main_loop_aes256_1
ld1 {v8.16b},[x7] /* rk12 */
aese v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
b 1f
.Lenc_main_loop_aes256_1:
ldp q8, q9, [x7], #32 /* rk12, rk13 */
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ld1 {v10.16b},[x7] /* rk14 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
b 1f
.Lenc_main_loop_aes128_1:
ld1 {v10.16b},[x7] /* rk10 */
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
1:
st1 {v12.16b}, [x1], #16
/* aes block 2 */
mov x7, x12
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */

ld1 {v13.16b}, [x0], #16
eor v12.16b, v12.16b, v13.16b

add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
ldp q8, q9, [x7], #32 /* rk4, rk5 */
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
ldp q10, q11, [x7], #32 /* rk6, rk7 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk8, rk9 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
cmp x9, #12
b.lt .Lenc_main_loop_aes128_2
.Lenc_main_loop_aes192_2:
ldp q10, q11, [x7], #32 /* rk10, rk11 */
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
b.gt .Lenc_main_loop_aes256_2
ld1 {v8.16b},[x7] /* rk12 */
aese v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
b 1f
.Lenc_main_loop_aes256_2:
ldp q8, q9, [x7], #32 /* rk12, rk13 */
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ld1 {v10.16b},[x7] /* rk14 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
b 1f
.Lenc_main_loop_aes128_2:
ld1 {v10.16b},[x7] /* rk10 */
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
1:
st1 {v12.16b}, [x1], #16
/* aes block 3 */
mov x7, x12
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */

ld1 {v13.16b}, [x0], #16
eor v12.16b, v12.16b, v13.16b

add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
ldp q8, q9, [x7], #32 /* rk4, rk5 */
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ldp q10, q11, [x7], #32 /* rk6, rk7 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk8, rk9 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
cmp x9, #12
b.lt .Lenc_main_loop_aes128_3
.Lenc_main_loop_aes192_3:
ldp q10, q11, [x7], #32 /* rk10, rk11 */
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
b.gt .Lenc_main_loop_aes256_3
ld1 {v8.16b},[x7] /* rk12 */
aese v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
b 1f
.Lenc_main_loop_aes256_3:
ldp q8, q9, [x7], #32 /* rk12, rk13 */
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ld1 {v10.16b},[x7] /* rk14 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
b 1f
.Lenc_main_loop_aes128_3:
ld1 {v10.16b},[x7] /* rk10 */
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
1:
st1 {v12.16b}, [x1], #16
/* aes block 4 */
mov x7, x12
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */

ld1 {v13.16b}, [x0], #16
eor v12.16b, v12.16b, v13.16b

add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk4, rk5 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ldp q10, q11, [x7], #32 /* rk6, rk7 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk8, rk9 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
cmp x9, #12
b.lt .Lenc_main_loop_aes128_4
.Lenc_main_loop_aes192_4:
ldp q10, q11, [x7], #32 /* rk10, rk11 */
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
b.gt .Lenc_main_loop_aes256_4
ld1 {v8.16b},[x7] /* rk12 */
aese v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
b 1f
.Lenc_main_loop_aes256_4:
ldp q8, q9, [x7], #32 /* rk12, rk13 */
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ld1 {v10.16b},[x7] /* rk14 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
b 1f
.Lenc_main_loop_aes128_4:
ld1 {v10.16b},[x7] /* rk10 */
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
1:
st1 {v12.16b}, [x1], #16
/* aes block 5 */
mov x7, x12
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */

ld1 {v13.16b}, [x0], #16
eor v12.16b, v12.16b, v13.16b

add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk4, rk5 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ldp q10, q11, [x7], #32 /* rk6, rk7 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk8, rk9 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
cmp x9, #12
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
b.lt .Lenc_main_loop_aes128_5
.Lenc_main_loop_aes192_5:
ldp q10, q11, [x7], #32 /* rk10, rk11 */
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
b.gt .Lenc_main_loop_aes256_5
ld1 {v8.16b},[x7] /* rk12 */
aese v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
b 1f
.Lenc_main_loop_aes256_5:
ldp q8, q9, [x7], #32 /* rk12, rk13 */
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ld1 {v10.16b},[x7] /* rk14 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
b 1f
.Lenc_main_loop_aes128_5:
ld1 {v10.16b},[x7] /* rk10 */
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
1:
st1 {v12.16b}, [x1], #16
/* aes block 6 */
mov x7, x12
ldp q8, q9, [x7], #32 /* rk0, rk1 */
ldp q10, q11, [x7], #32 /* rk2, rk3 */

ld1 {v13.16b}, [x0], #16
eor v12.16b, v12.16b, v13.16b

add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ld1 {$W1.2d}, [x10], #16 /* load const k*/
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
ext $M9_10.16b, @MSG[4].16b, @MSG[5].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk4, rk5 */
/* Wt_PART1 = SSIG0(W(t-15)) + W(t-16) */
sha512su0 @MSG[0].2d, @MSG[1].2d
/* Wt = SSIG1(W(t-2)) + W(t-7) + Wt_PART1 */
sha512su1 @MSG[0].2d, @MSG[7].2d, $M9_10.2d
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
ld1 {$W1.2d},[x10],#16
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ldp q10, q11, [x7], #32 /* rk6, rk7 */
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
ld1 {$W1.2d},[x10],#16
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
ldp q8, q9, [x7], #32 /* rk8, rk9 */
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
ld1 {$W1.2d},[x10],#16
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
ext $W0.16b, $W0.16b, $W0.16b, #8
ext $FG.16b, @H[2].16b, @H[3].16b, #8
ext $DE.16b, @H[1].16b, @H[2].16b, #8
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
/* T1 = h + Kt + Wt*/
add @H[3].2d, @H[3].2d, $W0.2d
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
sha512h @QH[3], $QFG, $DE.2d
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
cmp x9, #12
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
sha512h2 @QH[3], @QH[1], @H[0].2d
___
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
$code.=<<___;
b.lt .Lenc_main_loop_aes128_6
.Lenc_main_loop_aes192_6:
ldp q10, q11, [x7], #32 /* rk10, rk11 */
aese v12.16b, v9.16b
aesmc v12.16b, v12.16b
aese v12.16b, v10.16b
aesmc v12.16b, v12.16b
b.gt .Lenc_main_loop_aes256_6
ld1 {v8.16b},[x7] /* rk12 */
aese v12.16b, v11.16b
eor v12.16b, v12.16b, v8.16b
b 1f
.Lenc_main_loop_aes256_6:
ldp q8, q9, [x7], #32 /* rk12, rk13 */
aese v12.16b, v11.16b
aesmc v12.16b, v12.16b
ld1 {v10.16b},[x7] /* rk14 */
aese v12.16b, v8.16b
aesmc v12.16b, v12.16b
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
b 1f
.Lenc_main_loop_aes128_6:
ld1 {v10.16b},[x7] /* rk10 */
aese v12.16b, v9.16b
eor v12.16b, v12.16b, v10.16b
1:
st1 {v12.16b}, [x1], #16
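/* the message schedule is exhausted from here on: the remaining SHA-512
   rounds only consume precomputed Kt values, and x10 is rewound to
   .LK512 before the last quad below */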
/* aes block 7 */
|
|
mov x7, x12
|
|
ldp q8, q9, [x7], #32 /* rk0, rk1 */
|
|
ldp q10, q11, [x7], #32 /* rk2, rk3 */
|
|
|
|
ld1 {v13.16b}, [x0], #16
|
|
eor v12.16b, v12.16b, v13.16b
|
|
|
|
ld1 {$W1.2d},[x10],#16
|
|
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
|
|
ext $W0.16b, $W0.16b, $W0.16b, #8
|
|
ext $FG.16b, @H[2].16b, @H[3].16b, #8
|
|
ext $DE.16b, @H[1].16b, @H[2].16b, #8
|
|
/* T1 = h + Kt + Wt*/
|
|
add @H[3].2d, @H[3].2d, $W0.2d
|
|
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
|
|
sha512h @QH[3], $QFG, $DE.2d
|
|
aese v12.16b, v8.16b
|
|
aesmc v12.16b, v12.16b
|
|
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
|
|
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
|
|
sha512h2 @QH[3], @QH[1], @H[0].2d
|
|
___
|
|
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
|
|
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
|
|
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
|
|
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
|
|
$code.=<<___;
|
|
ld1 {$W1.2d},[x10],#16
|
|
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
|
|
ext $W0.16b, $W0.16b, $W0.16b, #8
|
|
ext $FG.16b, @H[2].16b, @H[3].16b, #8
|
|
ext $DE.16b, @H[1].16b, @H[2].16b, #8
|
|
aese v12.16b, v9.16b
|
|
aesmc v12.16b, v12.16b
|
|
ldp q8, q9, [x7], #32 /* rk4, rk5 */
|
|
/* T1 = h + Kt + Wt */
|
|
add @H[3].2d, @H[3].2d, $W0.2d
|
|
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
|
|
sha512h @QH[3], $QFG, $DE.2d
|
|
aese v12.16b, v10.16b
|
|
aesmc v12.16b, v12.16b
|
|
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
|
|
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
|
|
sha512h2 @QH[3], @QH[1], @H[0].2d
|
|
___
|
|
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
|
|
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
|
|
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
|
|
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
|
|
$code.=<<___;
|
|
ld1 {$W1.2d},[x10],#16
|
|
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
|
|
ext $W0.16b, $W0.16b, $W0.16b, #8
|
|
ext $FG.16b, @H[2].16b, @H[3].16b, #8
|
|
ext $DE.16b, @H[1].16b, @H[2].16b, #8
|
|
aese v12.16b, v11.16b
|
|
aesmc v12.16b, v12.16b
|
|
ldp q10, q11, [x7], #32 /* rk6, rk7 */
|
|
/* T1 = h + Kt + Wt */
|
|
add @H[3].2d, @H[3].2d, $W0.2d
|
|
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
|
|
sha512h @QH[3], $QFG, $DE.2d
|
|
aese v12.16b, v8.16b
|
|
aesmc v12.16b, v12.16b
|
|
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
|
|
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
|
|
sha512h2 @QH[3], @QH[1], @H[0].2d
|
|
___
|
|
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
|
|
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
|
|
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
|
|
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
|
|
$code.=<<___;
|
|
ld1 {$W1.2d},[x10],#16
|
|
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
|
|
ext $W0.16b, $W0.16b, $W0.16b, #8
|
|
ext $FG.16b, @H[2].16b, @H[3].16b, #8
|
|
ext $DE.16b, @H[1].16b, @H[2].16b, #8
|
|
aese v12.16b, v9.16b
|
|
aesmc v12.16b, v12.16b
|
|
ldp q8, q9, [x7], #32 /* rk8, rk9 */
|
|
/* T1 = h + Kt + Wt */
|
|
add @H[3].2d, @H[3].2d, $W0.2d
|
|
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
|
|
sha512h @QH[3], $QFG, $DE.2d
|
|
aese v12.16b, v10.16b
|
|
aesmc v12.16b, v12.16b
|
|
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
|
|
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
|
|
sha512h2 @QH[3], @QH[1], @H[0].2d
|
|
___
|
|
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
|
|
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
|
|
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
|
|
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
|
|
$code.=<<___;
|
|
sub x10, x10, #80*8 /* rewind .LK512 pointer (80 8-byte constants) */
|
|
add $W0.2d, $W0.2d, $MSG[0].2d /* Kt + Wt */
|
|
ext $W0.16b, $W0.16b, $W0.16b, #8
|
|
ext $FG.16b, @H[2].16b, @H[3].16b, #8
|
|
ext $DE.16b, @H[1].16b, @H[2].16b, #8
|
|
aese v12.16b, v11.16b
|
|
aesmc v12.16b, v12.16b
|
|
/* T1 = h + Kt + Wt */
|
|
add @H[3].2d, @H[3].2d, $W0.2d
|
|
/* T1 = T1 + BSIG1(e) + CH(e,f,g) */
|
|
sha512h @QH[3], $QFG, $DE.2d
|
|
aese v12.16b, v8.16b
|
|
aesmc v12.16b, v12.16b
|
|
cmp x9, #12 /* tell 128,192,256 apart */
|
|
add @H[4].2d, @H[1].2d, @H[3].2d /* d + T1 */
|
|
/* T2 = BSIG0(a) + MAJ(a,b,c), T1 + T2 */
|
|
sha512h2 @QH[3], @QH[1], @H[0].2d
|
|
___
|
|
($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
|
|
# h=g, g=f,f=e,e=d+T1,d=c,c=b,b=a,a=T1+T2
|
|
@H = (@H[3],@H[0],@H[4],@H[2],@H[1]);
|
|
@QH = (@QH[3],@QH[0],@QH[4],@QH[2],@QH[1]);
|
|
$code.=<<___;
|
|
b.lt .Lenc_main_loop_aes128_7
|
|
.Lenc_main_loop_aes192_7:
|
|
ldp q10, q11, [x7], #32 /* rk10, rk11 */
|
|
aese v12.16b, v9.16b
|
|
aesmc v12.16b, v12.16b
|
|
aese v12.16b, v10.16b
|
|
aesmc v12.16b, v12.16b
|
|
b.gt .Lenc_main_loop_aes256_7
|
|
ld1 {v8.16b},[x7] /* rk12 */
|
|
aese v12.16b, v11.16b
|
|
eor v12.16b, v12.16b, v8.16b
|
|
b 1f
|
|
.Lenc_main_loop_aes256_7:
|
|
ldp q8, q9, [x7], #32 /* rk12, rk13 */
|
|
aese v12.16b, v11.16b
|
|
aesmc v12.16b, v12.16b
|
|
ld1 {v10.16b},[x7] /* rk14 */
|
|
aese v12.16b, v8.16b
|
|
aesmc v12.16b, v12.16b
|
|
aese v12.16b, v9.16b
|
|
eor v12.16b, v12.16b, v10.16b
|
|
b 1f
|
|
.Lenc_main_loop_aes128_7:
|
|
ld1 {v10.16b},[x7] /* rk10 */
|
|
aese v12.16b, v9.16b
|
|
eor v12.16b, v12.16b, v10.16b
|
|
1:
|
|
add @H[0].2d, @H[0].2d, $AB.2d
|
|
add @H[1].2d, @H[1].2d, $CD.2d
|
|
add @H[2].2d, @H[2].2d, $EF.2d
|
|
add @H[3].2d, @H[3].2d, $GH.2d
|
|
|
|
st1 {v12.16b}, [x1], #16
|
|
|
|
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x14], #64
|
|
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x14]
|
|
|
|
sub x11, x11, #8
|
|
cmp x11, #8
|
|
b.ge .Lenc_main_loop
|
|
|
|
/* epilog - process sha block */
|
|
___
|
|
&sha512_block(1);
|
|
$code.=<<___;
|
|
mov x7, x12
|
|
ld1 {v0.16b}, [x0], #16 /* load plaintext */
|
|
ldr q1, [x14, #48] /* load the last aes output block */
|
|
eor v0.16b, v0.16b, v1.16b
|
|
|
|
ldp q8, q9, [x7], #32 /* rk0, rk1 */
|
|
ldp q10, q11, [x7], #32 /* rk2, rk3 */
|
|
ldp q12, q13, [x7], #32 /* rk4, rk5 */
|
|
ldp q14, q15, [x7], #32 /* rk6, rk7 */
|
|
ldp q16, q17, [x7], #32 /* rk8, rk9 */
|
|
ld1 {v18.16b}, [x7] /* rk10 */
|
|
|
|
mov w12, #0x80 /* sha padding 0b10000000 */
|
|
b .Lenc_less_than_8_block
|
|
|
|
/* aes_block < 16 */
|
|
.Lenc_short_case:
|
|
ld1 {v0.16b}, [x0], #16 /* load plaintext */
|
|
ld1 {v1.16b}, [x8] /* load iv */
|
|
ldp q8, q9, [x7], #32 /* rk0, rk1 */
|
|
ldp q10, q11, [x7], #32 /* rk2, rk3 */
|
|
ldp q12, q13, [x7], #32 /* rk4, rk5 */
|
|
ldp q14, q15, [x7], #32 /* rk6, rk7 */
|
|
ldp q16, q17, [x7], #32 /* rk8, rk9 */
|
|
ld1 {v18.16b}, [x7] /* rk10 */
|
|
mov w12, #0x80 /* sha padding 0b10000000 */
|
|
|
|
eor v0.16b, v0.16b, v1.16b /* iv xor plaintext */
|
|
|
|
cmp x11, #8
|
|
b.lt .Lenc_less_than_8_block
|
|
___
|
|
# process 8 aes blocks
|
|
for($i = 0; $i < 8; $i = $i + 1) {
|
|
&aes_block_9_rounds($i);
|
|
# only tell 128/192/256 apart the first time
|
|
&aes_block_last_rounds(($i == 0)?1:0, "enc_short", $i, 0);
|
|
if($i != 7) {
|
|
$next = $i + 1;
|
|
$code.=<<___;
|
|
/* load next block */
|
|
ld1 {v$next.16b}, [x0], #16
|
|
/* output xor block */
|
|
eor v$next.16b, v$next.16b, v$i.16b
|
|
___
|
|
}
|
|
}
|
|
$code.=<<___;
|
|
/* store 8 blocks of ciphertext */
|
|
stp q0, q1, [x1], #32
|
|
stp q2, q3, [x1], #32
|
|
stp q4, q5, [x1], #32
|
|
stp q6, q7, [x1], #32
|
|
|
|
sub x11, x11, #8
|
|
___
|
|
# now we have a whole sha512 block
|
|
&sha512_block(1);
|
|
$code.=<<___;
|
|
ldr x7, [x6, #CIPHER_KEY]
|
|
ldp q8, q9, [x7] /* restore clobbered rk0, rk1 */
|
|
add x7, x7, #160 /* x7 points to rk10 */
|
|
cbz x11, .Lenc_short_no_more_aes_block
|
|
ld1 {v0.16b}, [x0], #16 /* load plaintext */
|
|
ldr q1, [x1, #-16] /* last ciphertext block written */
|
|
eor v0.16b, v0.16b, v1.16b
|
|
.Lenc_less_than_8_block:
|
|
cbz x11, .Lenc_short_no_more_aes_block
|
|
___
|
|
# process the remaining aes blocks (<= 7)
|
|
for($i = 0; $i < 7; $i = $i + 1) {
|
|
&aes_block_9_rounds($i);
|
|
&aes_block_last_rounds(($i == 0)?1:0, "enc_short_partial", $i, 0);
|
|
$code.=<<___;
|
|
str q$i, [x1], #16
|
|
sub x11, x11, #1
|
|
cbz x11, .Lenc_short_post_Q$i
|
|
___
|
|
if($i != 6) {
|
|
$next = $i + 1;
|
|
$code.=<<___;
|
|
/* load next block */
|
|
ld1 {v$next.16b}, [x0], #16
|
|
/* output xor block */
|
|
eor v$next.16b, v$next.16b, v$i.16b
|
|
___
|
|
}
|
|
}
|
|
$code.=<<___;
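/* Zero the unused SHA-512 message registers and place the 0x80
 * padding byte (w12) right after the last ciphertext block. */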
|
|
.Lenc_short_no_more_aes_block:
|
|
eor v0.16b, v0.16b, v0.16b
|
|
eor v1.16b, v1.16b, v1.16b
|
|
eor v2.16b, v2.16b, v2.16b
|
|
eor v3.16b, v3.16b, v3.16b
|
|
eor v4.16b, v4.16b, v4.16b
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v0.b[0], w12
|
|
b .Lenc_short_post_sha
|
|
.Lenc_short_post_Q0:
|
|
eor v1.16b, v1.16b, v1.16b
|
|
eor v2.16b, v2.16b, v2.16b
|
|
eor v3.16b, v3.16b, v3.16b
|
|
eor v4.16b, v4.16b, v4.16b
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v1.b[0], w12
|
|
b .Lenc_short_post_sha
|
|
.Lenc_short_post_Q1:
|
|
eor v2.16b, v2.16b, v2.16b
|
|
eor v3.16b, v3.16b, v3.16b
|
|
eor v4.16b, v4.16b, v4.16b
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v2.b[0], w12
|
|
b .Lenc_short_post_sha
|
|
.Lenc_short_post_Q2:
|
|
eor v3.16b, v3.16b, v3.16b
|
|
eor v4.16b, v4.16b, v4.16b
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v3.b[0], w12
|
|
b .Lenc_short_post_sha
|
|
.Lenc_short_post_Q3:
|
|
eor v4.16b, v4.16b, v4.16b
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v4.b[0], w12
|
|
b .Lenc_short_post_sha
|
|
.Lenc_short_post_Q4:
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v5.b[0], w12
|
|
b .Lenc_short_post_sha
|
|
.Lenc_short_post_Q5:
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v6.b[0], w12
|
|
b .Lenc_short_post_sha
|
|
.Lenc_short_post_Q6:
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v7.b[0], w12
|
|
/* we have one padded sha512 block now, process it and
|
|
then use another block to hold the sha length */
|
|
___
|
|
&sha512_block(1);
|
|
$code.=<<___;
|
|
eor v0.16b, v0.16b, v0.16b
|
|
eor v1.16b, v1.16b, v1.16b
|
|
eor v2.16b, v2.16b, v2.16b
|
|
eor v3.16b, v3.16b, v3.16b
|
|
eor v4.16b, v4.16b, v4.16b
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
.Lenc_short_post_sha:
|
|
/* we have last padded sha512 block now */
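/* The length field of the inner hash is 1024 bits (the i_key_pad block)
 * plus len*8 bits of data, stored as a 128-bit big-endian value in the
 * last 16 bytes of the padded block (v7). */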
|
|
eor x13, x13, x13 /* length_lo */
|
|
eor x14, x14, x14 /* length_hi */
|
|
|
|
adds x13, x13, x2, lsl #3 /* add len in bits */
|
|
lsr x15, x2, #61 /* high bits of len*8 */
|
|
adc x14, x14, x15
|
|
|
|
adds x13, x13, #1024 /* add i_key_pad 1024 bits */
|
|
adc x14, x14, xzr
|
|
|
|
mov v7.d[0], x14
|
|
mov v7.d[1], x13
|
|
rev64 v7.16b, v7.16b
|
|
___
|
|
&sha512_block(1);
|
|
$code.=<<___;
|
|
/* Final HMAC - opad part */
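/* HMAC = H(o_key_pad || H(i_key_pad || msg)): the inner digest in
 * v24..v27 becomes the single message block of the outer hash, and
 * v24..v27 are then reloaded with the precomputed o_key_pad state. */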
|
|
mov v0.16b, v24.16b
|
|
mov v1.16b, v25.16b
|
|
mov v2.16b, v26.16b
|
|
mov v3.16b, v27.16b
|
|
eor v4.16b, v4.16b, v4.16b
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
|
|
mov v4.b[7], w12 /* sha padding byte 0x80 */
|
|
mov x13, #1024+512 /* length in bits */
|
|
mov v7.d[1], x13
|
|
|
|
/* load ABCDEFGH for opad */
|
|
ldr x7, [x6, #HMAC_OKEYPAD]
|
|
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x7]
|
|
___
|
|
&sha512_block(0);
|
|
$code.=<<___;
|
|
.Lret:
|
|
mov x0, xzr /* return 0 */
|
|
|
|
rev64 v24.16b, v24.16b
|
|
rev64 v25.16b, v25.16b
|
|
rev64 v26.16b, v26.16b
|
|
rev64 v27.16b, v27.16b
|
|
|
|
/* store hash result */
|
|
st1 {v24.2d,v25.2d,v26.2d,v27.2d},[x4]
|
|
|
|
/* restore callee-saved registers */
|
|
ldp d10, d11, [sp,#16]
|
|
ldp d12, d13, [sp,#32]
|
|
ldp d14, d15, [sp,#48]
|
|
ldp d8, d9, [sp], #64
|
|
ret
|
|
.size asm_aescbc_sha512_hmac, .-asm_aescbc_sha512_hmac
|
|
___
|
|
}
|
|
|
|
{
|
|
my @H = map("v$_",(24..28));
|
|
my @QH = map("q$_",(24..28));
|
|
my ($FG, $DE) = map("v$_",(29..30));
|
|
my ($QFG, $QDE) = map("q$_",(29..30));
|
|
my $M9_10 = "v31";
|
|
my @MSG = map("v$_", (0..7));
|
|
my ($W0, $W1) = ("v14", "v15");
|
|
my ($AB, $CD, $EF, $GH) = map("v$_",(20..23));
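# Register aliases for the decrypt path: @H/@QH (v24..v28) hold the SHA-512
# working state, @MSG (v0..v7) the 128-byte message block, $W0/$W1 are
# scratch for Kt+Wt, and $FG/$DE carry the fg/de word pairs fed to sha512h.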
|
|
|
|
$code.=<<___;
|
|
/*
|
|
* asm_sha512_hmac_aescbc_dec(
|
|
* csrc, x0 (cipher src address)
|
|
* cdst, x1 (cipher dst address)
|
|
* clen, x2 (cipher length)
|
|
* dsrc, x3 (digest src address)
|
|
* ddst, x4 (digest dst address)
|
|
* dlen, x5 (digest length)
|
|
* arg x6 :
|
|
* arg->cipher.key (round keys)
|
|
* arg->cipher.key_rounds (key rounds)
|
|
* arg->cipher.iv (initialization vector)
|
|
* arg->digest.hmac.i_key_pad (partially hashed i_key_pad)
|
|
* arg->digest.hmac.o_key_pad (partially hashed o_key_pad)
|
|
* )
|
|
*/
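/* The 64-byte hash result is written to ddst (x4); the function returns 0 in x0. */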
|
|
|
|
.global asm_sha512_hmac_aescbc_dec
|
|
.type asm_sha512_hmac_aescbc_dec,%function
|
|
|
|
.align 4
|
|
asm_sha512_hmac_aescbc_dec:
|
|
AARCH64_VALID_CALL_TARGET
|
|
/* save callee-saved registers */
|
|
stp d8, d9, [sp,#-64]!
|
|
stp d10, d11, [sp,#16]
|
|
stp d12, d13, [sp,#32]
|
|
stp d14, d15, [sp,#48]
|
|
|
|
/* load ABCDEFGH */
|
|
ldr x7, [x6, #HMAC_IKEYPAD]
|
|
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x7]
|
|
|
|
ldr x7, [x6, #CIPHER_KEY]
|
|
ldr x8, [x6, #CIPHER_IV]
|
|
ldr x9, [x6, #CIPHER_KEY_ROUNDS]
|
|
mov x12, x7 /* backup x7 */
|
|
|
|
adr x10, .LK512
|
|
|
|
lsr x11, x2, #4 /* aes_block = len/16 */
|
|
cbz x11, .Ldec_ret /* return if aes_block = 0 */
|
|
|
|
ld1 {v20.16b}, [x8] /* load iv */
|
|
cmp x11, #8
|
|
b.lt .Ldec_short_case
|
|
.Ldec_main_loop:
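/* CBC decryption has no chaining dependency on its output, so all 8
 * ciphertext blocks are decrypted in parallel and xored with the
 * preceding ciphertext (or the IV) afterwards. */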
|
|
ldp q12, q13, [x0], #32
|
|
ldp q14, q15, [x0], #32
|
|
ldp q16, q17, [x0], #32
|
|
ldp q18, q19, [x0], #32
|
|
|
|
ldp q8, q9, [x7], #32 /* rk0, rk1 */
|
|
ldp q10, q11, [x7], #32 /* rk2, rk3 */
|
|
|
|
mov v0.16b, v12.16b
|
|
mov v1.16b, v13.16b
|
|
mov v2.16b, v14.16b
|
|
mov v3.16b, v15.16b
|
|
mov v4.16b, v16.16b
|
|
mov v5.16b, v17.16b
|
|
mov v6.16b, v18.16b
|
|
mov v7.16b, v19.16b
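/* v0..v7 keep the original ciphertext: it is both the previous-block
 * value for the CBC xor after the last AES round and the SHA-512
 * message block hashed below. */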
|
|
|
|
/* 1 round */
|
|
aesd v12.16b, v8.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v8.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v8.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v8.16b
|
|
aesimc v15.16b, v15.16b
|
|
aesd v16.16b, v8.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v17.16b, v8.16b
|
|
aesimc v17.16b, v17.16b
|
|
aesd v18.16b, v8.16b
|
|
aesimc v18.16b, v18.16b
|
|
aesd v19.16b, v8.16b
|
|
aesimc v19.16b, v19.16b
|
|
|
|
/* 2 round */
|
|
aesd v12.16b, v9.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v9.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v9.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v9.16b
|
|
aesimc v15.16b, v15.16b
|
|
aesd v16.16b, v9.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v17.16b, v9.16b
|
|
aesimc v17.16b, v17.16b
|
|
aesd v18.16b, v9.16b
|
|
aesimc v18.16b, v18.16b
|
|
aesd v19.16b, v9.16b
|
|
aesimc v19.16b, v19.16b
|
|
|
|
ldp q8, q9, [x7], #32 /* rk4, rk5 */
|
|
|
|
/* 3 round */
|
|
aesd v12.16b, v10.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v10.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v10.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v10.16b
|
|
aesimc v15.16b, v15.16b
|
|
aesd v16.16b, v10.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v17.16b, v10.16b
|
|
aesimc v17.16b, v17.16b
|
|
aesd v18.16b, v10.16b
|
|
aesimc v18.16b, v18.16b
|
|
aesd v19.16b, v10.16b
|
|
aesimc v19.16b, v19.16b
|
|
|
|
/* 4 round */
|
|
aesd v12.16b, v11.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v11.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v11.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v11.16b
|
|
aesimc v15.16b, v15.16b
|
|
aesd v16.16b, v11.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v17.16b, v11.16b
|
|
aesimc v17.16b, v17.16b
|
|
aesd v18.16b, v11.16b
|
|
aesimc v18.16b, v18.16b
|
|
aesd v19.16b, v11.16b
|
|
aesimc v19.16b, v19.16b
|
|
|
|
ldp q10, q11, [x7], #32 /* rk6, rk7 */
|
|
|
|
/* 5 round */
|
|
aesd v12.16b, v8.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v8.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v8.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v8.16b
|
|
aesimc v15.16b, v15.16b
|
|
aesd v16.16b, v8.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v17.16b, v8.16b
|
|
aesimc v17.16b, v17.16b
|
|
aesd v18.16b, v8.16b
|
|
aesimc v18.16b, v18.16b
|
|
aesd v19.16b, v8.16b
|
|
aesimc v19.16b, v19.16b
|
|
|
|
/* 6 round */
|
|
aesd v12.16b, v9.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v9.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v9.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v9.16b
|
|
aesimc v15.16b, v15.16b
|
|
aesd v16.16b, v9.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v17.16b, v9.16b
|
|
aesimc v17.16b, v17.16b
|
|
aesd v18.16b, v9.16b
|
|
aesimc v18.16b, v18.16b
|
|
aesd v19.16b, v9.16b
|
|
aesimc v19.16b, v19.16b
|
|
|
|
ldp q8, q9, [x7], #32 /* rk8, rk9 */
|
|
|
|
/* 7 round */
|
|
aesd v12.16b, v10.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v10.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v10.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v10.16b
|
|
aesimc v15.16b, v15.16b
|
|
aesd v16.16b, v10.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v17.16b, v10.16b
|
|
aesimc v17.16b, v17.16b
|
|
aesd v18.16b, v10.16b
|
|
aesimc v18.16b, v18.16b
|
|
aesd v19.16b, v10.16b
|
|
aesimc v19.16b, v19.16b
|
|
|
|
/* 8 round */
|
|
aesd v12.16b, v11.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v11.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v11.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v11.16b
|
|
aesimc v15.16b, v15.16b
|
|
aesd v16.16b, v11.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v17.16b, v11.16b
|
|
aesimc v17.16b, v17.16b
|
|
aesd v18.16b, v11.16b
|
|
aesimc v18.16b, v18.16b
|
|
aesd v19.16b, v11.16b
|
|
aesimc v19.16b, v19.16b
|
|
|
|
/* 9 round */
|
|
aesd v12.16b, v8.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v8.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v8.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v8.16b
|
|
aesimc v15.16b, v15.16b
|
|
aesd v16.16b, v8.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v17.16b, v8.16b
|
|
aesimc v17.16b, v17.16b
|
|
aesd v18.16b, v8.16b
|
|
aesimc v18.16b, v18.16b
|
|
aesd v19.16b, v8.16b
|
|
aesimc v19.16b, v19.16b
|
|
|
|
cmp x9, #12 /* tell 128,192,256 apart */
|
|
|
|
b.lt .Laes128_dec_main
|
|
.Laes192_dec_main:
|
|
ldp q10,q11,[x7],32 /* rk10,rk11 */
|
|
/* 10 round */
|
|
aesd v12.16b, v9.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v9.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v9.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v9.16b
|
|
aesimc v15.16b, v15.16b
|
|
aesd v16.16b, v9.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v17.16b, v9.16b
|
|
aesimc v17.16b, v17.16b
|
|
aesd v18.16b, v9.16b
|
|
aesimc v18.16b, v18.16b
|
|
aesd v19.16b, v9.16b
|
|
aesimc v19.16b, v19.16b
|
|
|
|
/* 11 round */
|
|
aesd v12.16b, v10.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v10.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v10.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v10.16b
|
|
aesimc v15.16b, v15.16b
|
|
aesd v16.16b, v10.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v17.16b, v10.16b
|
|
aesimc v17.16b, v17.16b
|
|
aesd v18.16b, v10.16b
|
|
aesimc v18.16b, v18.16b
|
|
aesd v19.16b, v10.16b
|
|
aesimc v19.16b, v19.16b
|
|
b.gt .Laes256_dec_main
|
|
|
|
ld1 {v8.16b},[x7] /* rk12 */
|
|
|
|
/* 12 round */
|
|
aesd v12.16b, v11.16b
|
|
eor v12.16b, v12.16b, v8.16b
|
|
aesd v13.16b, v11.16b
|
|
eor v13.16b, v13.16b, v8.16b
|
|
aesd v14.16b, v11.16b
|
|
eor v14.16b, v14.16b, v8.16b
|
|
aesd v15.16b, v11.16b
|
|
eor v15.16b, v15.16b, v8.16b
|
|
aesd v16.16b, v11.16b
|
|
eor v16.16b, v16.16b, v8.16b
|
|
aesd v17.16b, v11.16b
|
|
eor v17.16b, v17.16b, v8.16b
|
|
aesd v18.16b, v11.16b
|
|
eor v18.16b, v18.16b, v8.16b
|
|
aesd v19.16b, v11.16b
|
|
eor v19.16b, v19.16b, v8.16b
|
|
|
|
sub x7, x7, #192 /* rewind x7 */
|
|
b 1f
|
|
.Laes256_dec_main:
|
|
ldp q8,q9,[x7],32 /* rk12,rk13 */
|
|
/* 12 round */
|
|
aesd v12.16b, v11.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v11.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v11.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v11.16b
|
|
aesimc v15.16b, v15.16b
|
|
aesd v16.16b, v11.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v17.16b, v11.16b
|
|
aesimc v17.16b, v17.16b
|
|
aesd v18.16b, v11.16b
|
|
aesimc v18.16b, v18.16b
|
|
aesd v19.16b, v11.16b
|
|
aesimc v19.16b, v19.16b
|
|
|
|
/* 13 round */
|
|
aesd v12.16b, v8.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v8.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v8.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v8.16b
|
|
aesimc v15.16b, v15.16b
|
|
aesd v16.16b, v8.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v17.16b, v8.16b
|
|
aesimc v17.16b, v17.16b
|
|
aesd v18.16b, v8.16b
|
|
aesimc v18.16b, v18.16b
|
|
aesd v19.16b, v8.16b
|
|
aesimc v19.16b, v19.16b
|
|
|
|
ld1 {v10.16b},[x7] /* rk14 */
|
|
|
|
/* 14 round */
|
|
aesd v12.16b, v9.16b
|
|
eor v12.16b, v12.16b, v10.16b
|
|
aesd v13.16b, v9.16b
|
|
eor v13.16b, v13.16b, v10.16b
|
|
aesd v14.16b, v9.16b
|
|
eor v14.16b, v14.16b, v10.16b
|
|
aesd v15.16b, v9.16b
|
|
eor v15.16b, v15.16b, v10.16b
|
|
aesd v16.16b, v9.16b
|
|
eor v16.16b, v16.16b, v10.16b
|
|
aesd v17.16b, v9.16b
|
|
eor v17.16b, v17.16b, v10.16b
|
|
aesd v18.16b, v9.16b
|
|
eor v18.16b, v18.16b, v10.16b
|
|
aesd v19.16b, v9.16b
|
|
eor v19.16b, v19.16b, v10.16b
|
|
|
|
sub x7, x7, #224
|
|
b 1f
|
|
.Laes128_dec_main:
|
|
ld1 {v10.16b},[x7] /* rk10 */
|
|
aesd v12.16b,v9.16b
|
|
eor v12.16b, v12.16b, v10.16b
|
|
aesd v13.16b,v9.16b
|
|
eor v13.16b, v13.16b, v10.16b
|
|
aesd v14.16b,v9.16b
|
|
eor v14.16b, v14.16b, v10.16b
|
|
aesd v15.16b,v9.16b
|
|
eor v15.16b, v15.16b, v10.16b
|
|
aesd v16.16b,v9.16b
|
|
eor v16.16b, v16.16b, v10.16b
|
|
aesd v17.16b,v9.16b
|
|
eor v17.16b, v17.16b, v10.16b
|
|
aesd v18.16b,v9.16b
|
|
eor v18.16b, v18.16b, v10.16b
|
|
aesd v19.16b,v9.16b
|
|
eor v19.16b, v19.16b, v10.16b
|
|
sub x7, x7, #160
|
|
|
|
1:
|
|
eor v12.16b, v12.16b, v20.16b
|
|
eor v13.16b, v13.16b, v0.16b
|
|
eor v14.16b, v14.16b, v1.16b
|
|
eor v15.16b, v15.16b, v2.16b
|
|
eor v16.16b, v16.16b, v3.16b
|
|
eor v17.16b, v17.16b, v4.16b
|
|
eor v18.16b, v18.16b, v5.16b
|
|
eor v19.16b, v19.16b, v6.16b
|
|
|
|
stp q12,q13, [x1], #32
|
|
ldr q12, [x0, #-16] /* load last cipher */
|
|
stp q14,q15, [x1], #32
|
|
stp q16,q17, [x1], #32
|
|
stp q18,q19, [x1], #32
|
|
___
|
|
&sha512_block(1);
|
|
$code.=<<___;
|
|
mov v20.16b, v12.16b /* carry the last ciphertext block as the next chaining value */
|
|
sub x11, x11, #8
|
|
cmp x11, #8
|
|
b.ge .Ldec_main_loop
|
|
|
|
/* aes_block < 8 */
|
|
.Ldec_short_case:
|
|
mov w12, #0x80 /* sha padding 0b10000000 */
|
|
cbnz x11, 1f
|
|
eor v0.16b, v0.16b, v0.16b
|
|
eor v1.16b, v1.16b, v1.16b
|
|
eor v2.16b, v2.16b, v2.16b
|
|
eor v3.16b, v3.16b, v3.16b
|
|
eor v4.16b, v4.16b, v4.16b
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v0.b[0], w12
|
|
b .Ldec_short_post_sha
|
|
1:
|
|
cmp x11, #4
|
|
b.lt .Ldec_less_than_4_block
|
|
|
|
ldp q8, q9, [x7], #32 /* rk0, rk1 */
|
|
ldp q10, q11, [x7], #32 /* rk2, rk3 */
|
|
|
|
ldp q12, q13, [x0], #32
|
|
ldp q14, q15, [x0], #32
|
|
|
|
mov v0.16b, v12.16b
|
|
mov v1.16b, v13.16b
|
|
mov v2.16b, v14.16b
|
|
mov v3.16b, v15.16b
|
|
|
|
/* 1 round */
|
|
aesd v12.16b, v8.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v8.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v8.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v8.16b
|
|
aesimc v15.16b, v15.16b
|
|
|
|
/* 2 round */
|
|
aesd v12.16b, v9.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v9.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v9.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v9.16b
|
|
aesimc v15.16b, v15.16b
|
|
|
|
ldp q8, q9, [x7], #32 /* rk4, rk5 */
|
|
|
|
/* 3 round */
|
|
aesd v12.16b, v10.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v10.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v10.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v10.16b
|
|
aesimc v15.16b, v15.16b
|
|
|
|
/* 4 round */
|
|
aesd v12.16b, v11.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v11.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v11.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v11.16b
|
|
aesimc v15.16b, v15.16b
|
|
|
|
ldp q10, q11, [x7], #32 /* rk6, rk7 */
|
|
|
|
/* 5 round */
|
|
aesd v12.16b, v8.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v8.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v8.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v8.16b
|
|
aesimc v15.16b, v15.16b
|
|
|
|
/* 6 round */
|
|
aesd v12.16b, v9.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v9.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v9.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v9.16b
|
|
aesimc v15.16b, v15.16b
|
|
|
|
ldp q8, q9, [x7], #32 /* rk8, rk9 */
|
|
|
|
/* 7 round */
|
|
aesd v12.16b, v10.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v10.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v10.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v10.16b
|
|
aesimc v15.16b, v15.16b
|
|
|
|
/* 8 round */
|
|
aesd v12.16b, v11.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v11.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v11.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v11.16b
|
|
aesimc v15.16b, v15.16b
|
|
|
|
/* 9 round */
|
|
aesd v12.16b, v8.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v8.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v8.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v8.16b
|
|
aesimc v15.16b, v15.16b
|
|
|
|
cmp x9, #12 /* tell 128,192,256 apart */
|
|
|
|
b.lt .Laes128_dec_short
|
|
.Laes192_dec_short:
|
|
ldp q10,q11,[x7],32 /* rk10,rk11 */
|
|
|
|
/* 10 round */
|
|
aesd v12.16b, v9.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v9.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v9.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v9.16b
|
|
aesimc v15.16b, v15.16b
|
|
|
|
/* 11 round */
|
|
aesd v12.16b, v10.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v10.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v10.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v10.16b
|
|
aesimc v15.16b, v15.16b
|
|
b.gt .Laes256_dec_short
|
|
|
|
ld1 {v8.16b},[x7] /* rk12 */
|
|
|
|
/* 12 round */
|
|
aesd v12.16b, v11.16b
|
|
eor v12.16b, v12.16b, v8.16b
|
|
aesd v13.16b, v11.16b
|
|
eor v13.16b, v13.16b, v8.16b
|
|
aesd v14.16b, v11.16b
|
|
eor v14.16b, v14.16b, v8.16b
|
|
aesd v15.16b, v11.16b
|
|
eor v15.16b, v15.16b, v8.16b
|
|
|
|
sub x7, x7, #192 /* rewind x7 */
|
|
b 1f
|
|
.Laes256_dec_short:
|
|
ldp q8,q9,[x7],32 /* rk12,rk13 */
|
|
/* 12 round */
|
|
aesd v12.16b, v11.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v11.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v11.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v11.16b
|
|
aesimc v15.16b, v15.16b
|
|
|
|
/* 13 round */
|
|
aesd v12.16b, v8.16b
|
|
aesimc v12.16b, v12.16b
|
|
aesd v13.16b, v8.16b
|
|
aesimc v13.16b, v13.16b
|
|
aesd v14.16b, v8.16b
|
|
aesimc v14.16b, v14.16b
|
|
aesd v15.16b, v8.16b
|
|
aesimc v15.16b, v15.16b
|
|
|
|
ld1 {v10.16b},[x7] /* rk14 */
|
|
|
|
/* 14 round */
|
|
aesd v12.16b, v9.16b
|
|
eor v12.16b, v12.16b, v10.16b
|
|
aesd v13.16b, v9.16b
|
|
eor v13.16b, v13.16b, v10.16b
|
|
aesd v14.16b, v9.16b
|
|
eor v14.16b, v14.16b, v10.16b
|
|
aesd v15.16b, v9.16b
|
|
eor v15.16b, v15.16b, v10.16b
|
|
|
|
sub x7, x7, #224
|
|
b 1f
|
|
.Laes128_dec_short:
|
|
ld1 {v10.16b},[x7] /* rk10 */
|
|
aesd v12.16b,v9.16b
|
|
eor v12.16b, v12.16b, v10.16b
|
|
aesd v13.16b,v9.16b
|
|
eor v13.16b, v13.16b, v10.16b
|
|
aesd v14.16b,v9.16b
|
|
eor v14.16b, v14.16b, v10.16b
|
|
aesd v15.16b,v9.16b
|
|
eor v15.16b, v15.16b, v10.16b
|
|
sub x7, x7, #160
|
|
1:
|
|
eor v12.16b, v12.16b, v20.16b
|
|
eor v13.16b, v13.16b, v0.16b
|
|
eor v14.16b, v14.16b, v1.16b
|
|
eor v15.16b, v15.16b, v2.16b
|
|
ldr q20, [x0, #-16]
|
|
|
|
sub x11, x11, #4
|
|
|
|
stp q12,q13, [x1], #32
|
|
stp q14,q15, [x1], #32
|
|
cbz x11, .Ldec_short_post_Q3
|
|
___
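# Decrypt up to three more blocks one at a time; each one fills the next
# SHA-512 message register (v4..v6).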
|
|
for($i = 0; $i < 3; $i = $i + 1) {
|
|
$block = $i + 4;
|
|
$code.=<<___;
|
|
ld1 {v16.16b}, [x0], #16
|
|
mov v$block.16b, v16.16b
|
|
|
|
ldp q8, q9, [x7], #32 /* rk0, rk1 */
|
|
ldp q10, q11, [x7], #32 /* rk2, rk3 */
|
|
|
|
aesd v16.16b, v8.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v16.16b, v9.16b
|
|
aesimc v16.16b, v16.16b
|
|
ldp q8, q9, [x7], #32 /* rk4, rk5 */
|
|
aesd v16.16b, v10.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v16.16b, v11.16b
|
|
aesimc v16.16b, v16.16b
|
|
ldp q10, q11, [x7], #32 /* rk6, rk7 */
|
|
aesd v16.16b, v8.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v16.16b, v9.16b
|
|
aesimc v16.16b, v16.16b
|
|
ldp q8, q9, [x7], #32 /* rk8, rk9 */
|
|
aesd v16.16b, v10.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v16.16b, v11.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v16.16b, v8.16b
|
|
aesimc v16.16b, v16.16b
|
|
cmp x9, #12 /* tell 128,192,256 apart */
|
|
b.lt .Laes128_dec_short_$block
|
|
.Laes192_dec_short_$block:
|
|
ldp q10,q11,[x7],32 /* rk10,rk11 */
|
|
aesd v16.16b, v9.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v16.16b, v10.16b
|
|
aesimc v16.16b, v16.16b
|
|
b.gt .Laes256_dec_short_$block
|
|
ld1 {v8.16b},[x7] /* rk12 */
|
|
aesd v16.16b, v11.16b
|
|
eor v16.16b, v16.16b, v8.16b
|
|
sub x7, x7, #192 /* rewind x7 */
|
|
b 1f
|
|
.Laes256_dec_short_$block:
|
|
ldp q8,q9,[x7],32 /* rk12,rk13 */
|
|
aesd v16.16b, v11.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v16.16b, v8.16b
|
|
aesimc v16.16b, v16.16b
|
|
ld1 {v10.16b},[x7] /* rk14 */
|
|
aesd v16.16b, v9.16b
|
|
eor v16.16b, v16.16b, v10.16b
|
|
sub x7, x7, #224
|
|
b 1f
|
|
.Laes128_dec_short_$block:
|
|
ld1 {v10.16b},[x7] /* rk10 */
|
|
aesd v16.16b,v9.16b
|
|
eor v16.16b, v16.16b, v10.16b
|
|
sub x7, x7, #160
|
|
1:
|
|
sub x11, x11, #1
|
|
eor v16.16b, v16.16b, v20.16b
|
|
ldr q20, [x0, #-16]
|
|
st1 {v16.16b}, [x1], #16
|
|
cbz x11, .Ldec_short_post_Q$block
|
|
___
|
|
}
|
|
$code.=<<___;
|
|
.Ldec_short_post_Q3:
|
|
eor v4.16b, v4.16b, v4.16b
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v4.b[0], w12
|
|
b .Ldec_short_post_sha
|
|
.Ldec_short_post_Q4:
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v5.b[0], w12
|
|
b .Ldec_short_post_sha
|
|
.Ldec_short_post_Q5:
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v6.b[0], w12
|
|
b .Ldec_short_post_sha
|
|
.Ldec_short_post_Q6:
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v7.b[0], w12
|
|
/* we have one padded sha512 block now, process it and
|
|
then use another block to hold the sha length */
|
|
___
|
|
&sha512_block(1);
|
|
$code.=<<___;
|
|
eor v0.16b, v0.16b, v0.16b
|
|
eor v1.16b, v1.16b, v1.16b
|
|
eor v2.16b, v2.16b, v2.16b
|
|
eor v3.16b, v3.16b, v3.16b
|
|
eor v4.16b, v4.16b, v4.16b
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
b .Ldec_short_post_sha
|
|
|
|
.Ldec_less_than_4_block:
|
|
___
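# Fewer than 4 blocks remain: decrypt them one at a time into v0..v2.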
|
|
for($i = 0; $i < 3; $i = $i + 1) {
|
|
$code.=<<___;
|
|
ld1 {v16.16b}, [x0], #16
|
|
mov v$i.16b, v16.16b
|
|
|
|
ldp q8, q9, [x7], #32 /* rk0, rk1 */
|
|
ldp q10, q11, [x7], #32 /* rk2, rk3 */
|
|
|
|
aesd v16.16b, v8.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v16.16b, v9.16b
|
|
aesimc v16.16b, v16.16b
|
|
ldp q8, q9, [x7], #32 /* rk4, rk5 */
|
|
aesd v16.16b, v10.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v16.16b, v11.16b
|
|
aesimc v16.16b, v16.16b
|
|
ldp q10, q11, [x7], #32 /* rk6, rk7 */
|
|
aesd v16.16b, v8.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v16.16b, v9.16b
|
|
aesimc v16.16b, v16.16b
|
|
ldp q8, q9, [x7], #32 /* rk8, rk9 */
|
|
aesd v16.16b, v10.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v16.16b, v11.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v16.16b, v8.16b
|
|
aesimc v16.16b, v16.16b
|
|
cmp x9, #12 /* tell 128,192,256 apart */
|
|
b.lt .Laes128_dec_short_less_than_4_$i
|
|
.Laes192_dec_short_less_than_4_$i:
|
|
ldp q10,q11,[x7],32 /* rk10,rk11 */
|
|
aesd v16.16b, v9.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v16.16b, v10.16b
|
|
aesimc v16.16b, v16.16b
|
|
b.gt .Laes256_dec_short_less_than_4_$i
|
|
ld1 {v8.16b},[x7] /* rk12 */
|
|
aesd v16.16b, v11.16b
|
|
eor v16.16b, v16.16b, v8.16b
|
|
sub x7, x7, #192 /* rewind x7 */
|
|
b 1f
|
|
.Laes256_dec_short_less_than_4_$i:
|
|
ldp q8,q9,[x7],32 /* rk12,rk13 */
|
|
aesd v16.16b, v11.16b
|
|
aesimc v16.16b, v16.16b
|
|
aesd v16.16b, v8.16b
|
|
aesimc v16.16b, v16.16b
|
|
ld1 {v10.16b},[x7] /* rk14 */
|
|
aesd v16.16b, v9.16b
|
|
eor v16.16b, v16.16b, v10.16b
|
|
sub x7, x7, #224
|
|
b 1f
|
|
.Laes128_dec_short_less_than_4_$i:
|
|
ld1 {v10.16b},[x7] /* rk10 */
|
|
aesd v16.16b,v9.16b
|
|
eor v16.16b, v16.16b, v10.16b
|
|
sub x7, x7, #160
|
|
1:
|
|
sub x11, x11, #1
|
|
eor v16.16b, v16.16b, v20.16b
|
|
ldr q20, [x0, #-16]
|
|
st1 {v16.16b}, [x1], #16
|
|
cbz x11, .Ldec_short_post_Q$i
|
|
___
|
|
}
|
|
$code.=<<___;
|
|
.Ldec_short_post_Q0:
|
|
eor v1.16b, v1.16b, v1.16b
|
|
eor v2.16b, v2.16b, v2.16b
|
|
eor v3.16b, v3.16b, v3.16b
|
|
eor v4.16b, v4.16b, v4.16b
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v1.b[0], w12
|
|
b .Ldec_short_post_sha
|
|
.Ldec_short_post_Q1:
|
|
eor v2.16b, v2.16b, v2.16b
|
|
eor v3.16b, v3.16b, v3.16b
|
|
eor v4.16b, v4.16b, v4.16b
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v2.b[0], w12
|
|
b .Ldec_short_post_sha
|
|
.Ldec_short_post_Q2:
|
|
eor v3.16b, v3.16b, v3.16b
|
|
eor v4.16b, v4.16b, v4.16b
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
mov v3.b[0], w12
|
|
b .Ldec_short_post_sha
|
|
.Ldec_short_post_sha:
|
|
/* we have last padded sha512 block now */
|
|
eor x13, x13, x13 /* length_lo */
|
|
eor x14, x14, x14 /* length_hi */
|
|
|
|
adds x13, x13, x2, lsl #3 /* add len in bits */
|
|
lsr x15, x2, #61 /* high bits of len*8 */
|
|
adc x14, x14, x15
|
|
|
|
adds x13, x13, #1024 /* add i_key_pad 1024 bits */
|
|
adc x14, x14, xzr
|
|
|
|
mov v7.d[0], x14
|
|
mov v7.d[1], x13
|
|
rev64 v7.16b, v7.16b
|
|
___
|
|
&sha512_block(1);
|
|
$code.=<<___;
|
|
/* Final HMAC - opad part */
|
|
mov v0.16b, v24.16b
|
|
mov v1.16b, v25.16b
|
|
mov v2.16b, v26.16b
|
|
mov v3.16b, v27.16b
|
|
eor v4.16b, v4.16b, v4.16b
|
|
eor v5.16b, v5.16b, v5.16b
|
|
eor v6.16b, v6.16b, v6.16b
|
|
eor v7.16b, v7.16b, v7.16b
|
|
|
|
mov v4.b[7], w12 /* sha padding byte 0x80 */
|
|
mov x13, #1024+512 /* length in bits */
|
|
mov v7.d[1], x13
|
|
|
|
/* load ABCDEFGH for opad */
|
|
ldr x7, [x6, #HMAC_OKEYPAD]
|
|
ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [x7]
|
|
___
|
|
&sha512_block(0);
|
|
$code.=<<___;
|
|
.Ldec_ret:
|
|
mov x0, xzr /* return 0 */
|
|
|
|
rev64 v24.16b, v24.16b
|
|
rev64 v25.16b, v25.16b
|
|
rev64 v26.16b, v26.16b
|
|
rev64 v27.16b, v27.16b
|
|
|
|
/* store hash result */
|
|
st1 {v24.2d,v25.2d,v26.2d,v27.2d},[x4]
|
|
|
|
/* restore callee-saved registers */
|
|
ldp d10, d11, [sp,#16]
|
|
ldp d12, d13, [sp,#32]
|
|
ldp d14, d15, [sp,#48]
|
|
ldp d8, d9, [sp], #64
|
|
ret
|
|
.size asm_sha512_hmac_aescbc_dec, .-asm_sha512_hmac_aescbc_dec
|
|
___
|
|
}
|
|
#########################################
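# Fallback encoder for toolchains without the SHA-512 crypto extension:
# sha512h/sha512h2/sha512su0/sha512su1 are emitted as raw .inst words, with
# Rd, Rn (<<5) and Rm (<<16) OR-ed into the base opcode.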
|
|
{ my %opcode = (
|
|
"sha512h" => 0xce608000, "sha512h2" => 0xce608400,
|
|
"sha512su0" => 0xcec08000, "sha512su1" => 0xce608800 );
|
|
|
|
sub unsha512 {
|
|
my ($mnemonic,$arg)=@_;
|
|
|
|
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
|
|
&&
|
|
sprintf ".inst\t0x%08x\t//%s %s",
|
|
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
|
|
$mnemonic,$arg;
|
|
}
|
|
}
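# Re-emit this script's leading comment block (the license header) into the
# generated assembly, rewriting '#' comments as '//'.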
|
|
|
|
open SELF,$0;
|
|
while(<SELF>) {
|
|
next if (/^#!/);
|
|
last if (!s/^#/\/\// and !/^$/);
|
|
print;
|
|
}
|
|
close SELF;
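# Post-process the generated code: evaluate any `...` expressions and route
# SHA-512 mnemonics through unsha512() so they assemble everywhere.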
|
|
|
|
foreach(split("\n",$code)) {
|
|
s/\`([^\`]*)\`/eval($1)/ge;
|
|
s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge;
|
|
print $_,"\n";
|
|
}
|
|
|
|
close STDOUT or die "error closing STDOUT: $!"; |