#! /usr/bin/env perl
# Copyright 2023-2025 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (C) Cavium networks Ltd. 2016.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#========================================================================
# Derived from the following files in
# https://github.com/ARM-software/AArch64cryptolib
# AArch64cryptolib_opt_big/aes_cbc_sha256/aes128cbc_sha256_hmac.S
# AArch64cryptolib_opt_big/aes_cbc_sha256/sha256_hmac_aes128cbc_dec.S
#========================================================================
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
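# Typical invocation (an assumption based on the argument handling above):
#   perl aes-sha256-armv8.pl linux64 aes-sha256-armv8.S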
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
*STDOUT=*OUT;
$code=<<___;
#include "arm_arch.h"
/* These are offsets into the CIPH_DIGEST struct */
#define CIPHER_KEY 0
#define CIPHER_KEY_ROUNDS 8
#define CIPHER_IV 16
#define HMAC_IKEYPAD 24
#define HMAC_OKEYPAD 32
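/*
* A rough C view of the CIPH_DIGEST argument block implied by the offsets
* above. This is a sketch only: the field types are inferred from the
* 64-bit loads in the code below; the authoritative definition lives with
* the caller.
*
*   typedef struct {
*       struct {
*           const uint8_t *key;          // offset  0: expanded AES round keys
*           uint64_t       key_rounds;   // offset  8: 10, 12 or 14
*           const uint8_t *iv;           // offset 16: CBC initialization vector
*       } cipher;
*       struct {
*           struct {
*               const uint8_t *i_key_pad;   // offset 24: partial SHA-256 of i_key_pad
*               const uint8_t *o_key_pad;   // offset 32: partial SHA-256 of o_key_pad
*           } hmac;
*       } digest;
*   } CIPH_DIGEST;
*/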
.text
.arch armv8-a+crypto
___
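# aes192_aes256_handle(compare, label, i, load_rk10)
#
# Emit the final AES encryption rounds for data register v<i>, selected at
# run time by the key length in x16 (10, 12 or 14 rounds):
#   AES-128: last round with rk[9] (v17), then xor with rk[10] (v18)
#   AES-192: continue with rk[10]/rk[11] loaded from [x17], finish with rk[12]
#   AES-256: continue with rk[10]..rk[13] loaded from [x17], finish with rk[14]
# x17 is rewound afterwards so it points at rk[10] again for the next use.
#   compare   - emit the "cmp x16,#12"; when 0, reuse the flags set by an
#               earlier instantiation
#   label, i  - folded into the local labels so the helper can be expanded
#               many times
#   load_rk10 - on the AES-128 path, (re)load v18 (rk[10]) from [x9]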
sub aes192_aes256_handle () {
my $compare = shift;
my $label = shift;
my $i = shift;
my $load_rk10 = shift;
if($compare == 1) {
$code.=<<___;
cmp x16,#12
___
}
$code.=<<___;
b.lt .Laes128_${label}_$i
.Laes192_${label}_$i:
ldp q30,q31,[x17],32 /* rk[10],rk[11] */
aese v$i.16b,v17.16b
aesmc v$i.16b,v$i.16b
aese v$i.16b,v30.16b
aesmc v$i.16b,v$i.16b
b.gt .Laes256_${label}_$i
ld1 {v30.16b},[x17] /* rk[12] */
aese v$i.16b,v31.16b
eor v$i.16b,v$i.16b,v30.16b
sub x17, x17, #32 /* rewind x17 */
b 1f
.Laes256_${label}_$i:
aese v$i.16b,v31.16b
aesmc v$i.16b,v$i.16b
ldp q30,q31,[x17],32 /* rk[12],rk[13] */
aese v$i.16b,v30.16b
aesmc v$i.16b,v$i.16b
ld1 {v30.16b},[x17] /* rk[14] */
aese v$i.16b,v31.16b
eor v$i.16b,v$i.16b,v30.16b
sub x17, x17, #64 /* rewind x17 */
b 1f
.Laes128_${label}_$i:
___
if ($load_rk10 == 1) {
$code.=<<___;
ld1 {v18.16b},[x9]
___
}
$code.=<<___;
aese v$i.16b,v17.16b
eor v$i.16b,v$i.16b,v18.16b /* res 0 */
1:
___
}
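# aes192_aes256_dec_handle(compare, label, i, load_rk10)
#
# Decryption counterpart of aes192_aes256_handle: emits the final aesd/aesimc
# rounds for data register v<i>. On the AES-192/256 paths q19/q23 are saved
# on the stack and used as scratch for the extra round keys, then restored
# before rejoining the common code at the local "1:" label.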
sub aes192_aes256_dec_handle () {
my $compare = shift;
my $label = shift;
my $i = shift;
my $load_rk10 = shift;
if($compare == 1) {
$code.=<<___;
cmp x16,#12
___
}
$code.=<<___;
b.lt .Laes128_${label}_$i
.Laes192_${label}_$i:
stp q19,q23,[sp, #-32]!
ld1 {v19.16b},[x17],16 /* rk[10] */
ld1 {v23.16b},[x17],16 /* rk[11] */
aesd v$i.16b,v17.16b
aesimc v$i.16b,v$i.16b
aesd v$i.16b,v19.16b
aesimc v$i.16b,v$i.16b
b.gt .Laes256_${label}_$i
ld1 {v19.16b},[x17] /* rk[12] */
aesd v$i.16b,v23.16b
eor v$i.16b,v$i.16b,v19.16b
sub x17, x17, #32 /* rewind x17 */
ldp q19,q23,[sp], #32
b 1f
.Laes256_${label}_$i:
aesd v$i.16b,v23.16b
aesimc v$i.16b,v$i.16b
ld1 {v19.16b},[x17],16 /* rk[12] */
ld1 {v23.16b},[x17],16 /* rk[13] */
aesd v$i.16b,v19.16b
aesimc v$i.16b,v$i.16b
ld1 {v19.16b},[x17] /* rk[14] */
aesd v$i.16b,v23.16b
eor v$i.16b,v$i.16b,v19.16b
sub x17, x17, #64 /* rewind x17 */
ldp q19,q23,[sp], #32
b 1f
.Laes128_${label}_$i:
___
if ($load_rk10 == 1) {
$code.=<<___;
ld1 {v18.16b},[x9]
___
}
$code.=<<___;
aesd v$i.16b,v17.16b
eor v$i.16b,v$i.16b,v18.16b /* res 0 */
1:
___
}
$code.=<<___;
/*
* Description:
*
* Combined Enc/Auth Primitive = aes128cbc/sha256_hmac
*
* Operations:
*
* out = encrypt-AES128CBC(in)
* return_hash_ptr = SHA256(o_key_pad | SHA256(i_key_pad | out))
*
* Prototype:
* void asm_aescbc_sha256_hmac(uint8_t *csrc, uint8_t *cdst, uint64_t clen,
* uint8_t *dsrc, uint8_t *ddst, uint64_t dlen,
* CIPH_DIGEST *arg)
*
* Registers used:
*
* asm_aescbc_sha256_hmac(
* csrc, x0 (cipher src address)
* cdst, x1 (cipher dst address)
* clen x2 (cipher length)
* dsrc, x3 (digest src address)
* ddst, x4 (digest dst address)
* dlen, x5 (digest length)
* arg x6 :
* arg->cipher.key (round keys)
* arg->cipher.key_rounds (key rounds)
* arg->cipher.iv (initialization vector)
* arg->digest.hmac.i_key_pad (partially hashed i_key_pad)
* arg->digest.hmac.o_key_pad (partially hashed o_key_pad)
* )
*
* Routine register definitions:
*
* v0 -- v3 -- aes results
* v4 -- v7 -- round consts for sha
* v8 -- v18 -- round keys
* v19 -- v20 -- round keys
* v21 -- ABCD tmp
* v22 -- sha working state ABCD (q22)
* v23 -- sha working state EFGH (q23)
* v24 -- sha state ABCD
* v25 -- sha state EFGH
* v26 -- sha block 0
* v27 -- sha block 1
* v28 -- sha block 2
* v29 -- sha block 3
* v30 -- reserved
* v31 -- reserved
*
* Constraints:
*
* The variable "clen" must be a multiple of 16, otherwise results
* are not defined. For AES partial blocks the user is required
* to pad the input to modulus 16 = 0.
* The variable "dlen" must be a multiple of 8 and greater or equal
* to "clen". This constrain is strictly related to the needs of the IPSec
* ESP packet. Encrypted payload is hashed along with the 8 byte ESP header,
* forming ICV. Speed gain is achieved by doing both things at the same time,
* hence lengths are required to match at least at the cipher level.
*
* Short lengths are not optimized at < 12 AES blocks
*/
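/*
* Illustrative call (hypothetical buffer names; assumes the typical in-place
* IPsec ESP layout of an 8-byte header immediately followed by the payload,
* which is not the only valid way to drive this routine):
*
*   uint8_t pkt[8 + 256];    // 8-byte ESP header + 256-byte payload
*   uint8_t icv[32];         // receives the full HMAC-SHA256 tag
*   CIPH_DIGEST arg;         // filled in by the caller as described above
*
*   asm_aescbc_sha256_hmac(pkt + 8, pkt + 8, 256,   // encrypt payload in place
*                          pkt, icv, 8 + 256,       // hash header + ciphertext
*                          &arg);
*/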
.global asm_aescbc_sha256_hmac
.type asm_aescbc_sha256_hmac,%function
.rodata
.align 4
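/* SHA-256 round constants K[0..63] (FIPS 180-4) */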
.Lrcon:
.word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
.word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
.word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
.word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
.word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
.word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
.word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
.word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
.word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
.word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
.word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
.word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
.word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
.word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
.word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
.word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
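/* SHA-256 initial hash values H0..H7 (FIPS 180-4) */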
.Linit_sha_state:
.word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
.word 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
.text
asm_aescbc_sha256_hmac:
AARCH64_VALID_CALL_TARGET
/* protect registers */
stp d8,d9,[sp,#-64]!
/* fetch args */
ldr x7, [x6, #HMAC_IKEYPAD]
/* init ABCD, EFGH. */
ldp q24,q25,[x7]
/* save pointer to o_key_pad partial hash */
ldr x7, [x6, #HMAC_OKEYPAD]
stp d10,d11,[sp,#16]
/* address of sha init state consts */
adrp x12,.Linit_sha_state
add x12,x12,:lo12:.Linit_sha_state
prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */
lsr x10,x2,4 /* aes_blocks = len/16 */
stp d12,d13,[sp,#32]
stp d14,d15,[sp,#48]
ldr x9, [x6, #CIPHER_KEY]
ldr x16, [x6, #CIPHER_KEY_ROUNDS]
ldr x6, [x6, #CIPHER_IV]
add x17, x9, #160 /* point to the last 5 round keys */
/*
* Init sha state, prefetch, check for small cases.
* Note that the output is prefetched as a load, for the in-place case
*/
prfm PLDL1KEEP,[x0,0] /* pref next aes_ptr_in */
cmp x10,12 /* no main loop if <12 */
b.lt .Lenc_short_cases /* branch if < 12 */
/* proceed */
ld1 {v3.16b},[x6] /* get 1st ivec */
/* read first aes block, bump aes_ptr_in */
ld1 {v0.16b},[x0],16
mov x11,x2 /* len -> x11 needed at end */
lsr x12,x11,6 /* total_blocks */
/*
* now we can do the loop prolog, 1st aes sequence of 4 blocks
*/
ld1 {v8.16b},[x9],16 /* rk[0] */
ld1 {v9.16b},[x9],16 /* rk[1] */
eor v0.16b,v0.16b,v3.16b /* xor w/ ivec (modeop) */
ld1 {v10.16b},[x9],16 /* rk[2] */
/* aes xform 0 */
aese v0.16b,v8.16b
aesmc v0.16b,v0.16b
prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */
ld1 {v11.16b},[x9],16 /* rk[3] */
aese v0.16b,v9.16b
aesmc v0.16b,v0.16b
prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */
/* base address for sha round consts */
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
ld1 {v12.16b},[x9],16 /* rk[4] */
aese v0.16b,v10.16b
aesmc v0.16b,v0.16b
/* read next aes block, update aes_ptr_in */
ld1 {v1.16b},[x0],16
ld1 {v13.16b},[x9],16 /* rk[5] */
aese v0.16b,v11.16b
aesmc v0.16b,v0.16b
ld1 {v14.16b},[x9],16 /* rk[6] */
aese v0.16b,v12.16b
aesmc v0.16b,v0.16b
ld1 {v15.16b},[x9],16 /* rk[7] */
aese v0.16b,v13.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x9],16 /* rk[8] */
aese v0.16b,v14.16b
aesmc v0.16b,v0.16b
ld1 {v17.16b},[x9],16 /* rk[9] */
aese v0.16b,v15.16b
aesmc v0.16b,v0.16b
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
___
&aes192_aes256_handle(1, "enc_prolog", 0, 1);
$code.=<<___;
eor v1.16b,v1.16b,v0.16b /* xor w/ ivec (modeop) */
/* aes xform 1 */
aese v1.16b,v8.16b
aesmc v1.16b,v1.16b
/* read next aes block, update aes_ptr_in */
ld1 {v2.16b},[x0],16
aese v1.16b,v9.16b
aesmc v1.16b,v1.16b
prfm PLDL1KEEP,[x8,0*64] /* rcon */
aese v1.16b,v10.16b
aesmc v1.16b,v1.16b
aese v1.16b,v11.16b
aesmc v1.16b,v1.16b
/* save aes res, bump aes_out_ptr */
st1 {v0.16b},[x1],16
ld1 {v26.16b},[x3],16
aese v1.16b,v12.16b
aesmc v1.16b,v1.16b
prfm PLDL1KEEP,[x8,2*64] /* rcon */
aese v1.16b,v13.16b
aesmc v1.16b,v1.16b
aese v1.16b,v14.16b
aesmc v1.16b,v1.16b
prfm PLDL1KEEP,[x8,4*64] /* rcon */
aese v1.16b,v15.16b
aesmc v1.16b,v1.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
prfm PLDL1KEEP,[x8,6*64] /* rcon */
___
&aes192_aes256_handle(0, "enc_prolog", 1, 0);
$code.=<<___;
prfm PLDL1KEEP,[x8,8*64] /* rcon */
eor v2.16b,v2.16b,v1.16b /* xor w/ ivec (modeop) */
/* aes xform 2 */
aese v2.16b,v8.16b
aesmc v2.16b,v2.16b
/* read next aes block, update aes_ptr_in */
ld1 {v3.16b},[x0],16
aese v2.16b,v9.16b
aesmc v2.16b,v2.16b
aese v2.16b,v10.16b
aesmc v2.16b,v2.16b
prfm PLDL1KEEP,[x8,10*64] /* rcon */
aese v2.16b,v11.16b
aesmc v2.16b,v2.16b
/* save aes res, bump aes_out_ptr */
st1 {v1.16b},[x1],16
ld1 {v27.16b},[x3],16
aese v2.16b,v12.16b
aesmc v2.16b,v2.16b
prfm PLDL1KEEP,[x8,12*64] /* rcon */
aese v2.16b,v13.16b
aesmc v2.16b,v2.16b
aese v2.16b,v14.16b
aesmc v2.16b,v2.16b
prfm PLDL1KEEP,[x8,14*64] /* rcon */
aese v2.16b,v15.16b
aesmc v2.16b,v2.16b
aese v2.16b,v16.16b
aesmc v2.16b,v2.16b
___
&aes192_aes256_handle(0, "enc_prolog", 2, 0);
$code.=<<___;
eor v3.16b,v3.16b,v2.16b /* xor w/ivec (modeop) */
/* aes xform 3 */
aese v3.16b,v8.16b
aesmc v3.16b,v3.16b
aese v3.16b,v9.16b
aesmc v3.16b,v3.16b
aese v3.16b,v10.16b
aesmc v3.16b,v3.16b
aese v3.16b,v11.16b
aesmc v3.16b,v3.16b
/* save aes res, bump aes_out_ptr */
st1 {v2.16b},[x1],16
ld1 {v28.16b},[x3],16
aese v3.16b,v12.16b
aesmc v3.16b,v3.16b
aese v3.16b,v13.16b
aesmc v3.16b,v3.16b
aese v3.16b,v14.16b
aesmc v3.16b,v3.16b
aese v3.16b,v15.16b
aesmc v3.16b,v3.16b
aese v3.16b,v16.16b
aesmc v3.16b,v3.16b
sub x15,x12,1 /* main_blocks = total_blocks - 1 */
and x13,x10,3 /* aes_blocks_left */
___
&aes192_aes256_handle(0, "enc_prolog", 3, 0);
$code.=<<___;
/*
* Note: aes_blocks_left is the number of AES blocks remaining after the
* main (sha) loop is done; it can be 0-3.
*/
/* save aes res, bump aes_out_ptr */
st1 {v3.16b},[x1],16
ld1 {v29.16b},[x3],16
/* get outstanding bytes of the digest */
sub x12,x5,x2
/* subtract loaded bytes */
sub x5,x5,64
/*
* Main combined CBC loop: each iteration encrypts four AES-CBC blocks
* while hashing one 64-byte SHA-256 block of the digest source.
*/
.Lenc_main_loop:
/* base address for sha round consts */
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
/*
* Because mov, rev32 and eor each have a busy cycle, this takes longer
* than it looks. That's OK since there are 6 cycles before we can use
* the load anyway; so this goes as fast as it can without SW
* pipelining (too complicated given the code size).
*/
rev32 v26.16b,v26.16b
/* next aes block, update aes_ptr_in */
ld1 {v0.16b},[x0],16
mov v22.16b,v24.16b /* working ABCD <- ABCD */
prfm PLDL1KEEP,[x9,64] /* pref next lead_ptr */
rev32 v27.16b,v27.16b
/* pref next aes_ptr_out, streaming */
prfm PLDL1KEEP,[x1,64]
mov v23.16b,v25.16b /* working EFGH <- EFGH */
ld1 {v4.16b},[x8],16 /* key0 */
eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
ld1 {v5.16b},[x8],16 /* key1 */
/*
* aes xform 0, sha quad 0
*/
aese v0.16b,v8.16b
aesmc v0.16b,v0.16b
ld1 {v6.16b},[x8],16 /* key2 */
rev32 v28.16b,v28.16b
ld1 {v7.16b},[x8],16 /* key3 */
/* read next aes block, update aes_ptr_in */
ld1 {v1.16b},[x0],16
aese v0.16b,v9.16b
aesmc v0.16b,v0.16b
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v26.4s,v27.4s
aese v0.16b,v10.16b
aesmc v0.16b,v0.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
aese v0.16b,v11.16b
aesmc v0.16b,v0.16b
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
/* no place to get rid of this stall */
rev32 v29.16b,v29.16b
sha256h2 q23, q21, v4.4s
aese v0.16b,v12.16b
aesmc v0.16b,v0.16b
sha256su1 v26.4s,v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
ld1 {v4.16b},[x8],16 /* key4 */
sha256su0 v27.4s,v28.4s
aese v0.16b,v13.16b
aesmc v0.16b,v0.16b
sha256h q22, q23, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256h2 q23, q21, v5.4s
aese v0.16b,v14.16b
aesmc v0.16b,v0.16b
ld1 {v5.16b},[x8],16 /* key5 */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
aese v0.16b,v15.16b
aesmc v0.16b,v0.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
sha256su1 v28.4s,v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256su0 v29.4s,v26.4s
sha256h q22, q23, v7.4s
___
&aes192_aes256_handle(1, "enc_mainloop", 0, 0);
$code.=<<___;
sha256h2 q23, q21, v7.4s
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
/* aes xform 1, sha quad 1 */
sha256su0 v26.4s,v27.4s
eor v1.16b,v1.16b,v0.16b /* mode op 1 xor w/prev value */
ld1 {v7.16b},[x8],16 /* key7 */
mov v21.16b, v22.16b /* copy abcd */
/* save aes res, bump aes_out_ptr */
st1 {v0.16b},[x1],16
aese v1.16b,v8.16b
aesmc v1.16b,v1.16b
sha256h q22, q23, v4.4s
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256h2 q23, q21, v4.4s
sha256su1 v26.4s,v28.4s,v29.4s
aese v1.16b,v9.16b
aesmc v1.16b,v1.16b
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
aese v1.16b,v10.16b
aesmc v1.16b,v1.16b
/* read next aes block, update aes_ptr_in */
ld1 {v2.16b},[x0],16
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v27.4s,v29.4s,v26.4s
ld1 {v4.16b},[x8],16 /* key4 */
aese v1.16b,v11.16b
aesmc v1.16b,v1.16b
ld1 {v5.16b},[x8],16 /* key5 */
mov v21.16b, v22.16b /* copy abcd */
sha256su0 v28.4s,v29.4s
sha256h q22, q23, v6.4s
aese v1.16b,v12.16b
aesmc v1.16b,v1.16b
sha256h2 q23, q21, v6.4s
ld1 {v6.16b},[x8],16 /* key6 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
aese v1.16b,v13.16b
aesmc v1.16b,v1.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
aese v1.16b,v14.16b
aesmc v1.16b,v1.16b
ld1 {v7.16b},[x8],16 /* key7 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
aese v1.16b,v15.16b
aesmc v1.16b,v1.16b
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
___
&aes192_aes256_handle(0, "enc_mainloop", 1, 0);
$code.=<<___;
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
/* mode op 2 */
eor v2.16b,v2.16b,v1.16b /* mode of 2 xor w/prev value */
/* aes xform 2, sha quad 2 */
sha256su0 v26.4s,v27.4s
aese v2.16b,v8.16b
aesmc v2.16b,v2.16b
/* save aes res, bump aes_out_ptr */
st1 {v1.16b},[x1],16
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
aese v2.16b,v9.16b
aesmc v2.16b,v2.16b
sha256su1 v26.4s,v28.4s,v29.4s
ld1 {v4.16b},[x8],16 /* key4 */
sha256su0 v27.4s,v28.4s
aese v2.16b,v10.16b
aesmc v2.16b,v2.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
aese v2.16b,v11.16b
aesmc v2.16b,v2.16b
sha256su1 v27.4s,v29.4s,v26.4s
ld1 {v5.16b},[x8],16 /* key5 */
sha256su0 v28.4s,v29.4s
aese v2.16b,v12.16b
aesmc v2.16b,v2.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
aese v2.16b,v13.16b
aesmc v2.16b,v2.16b
sha256su1 v28.4s,v26.4s,v27.4s
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v29.4s,v26.4s
/* read next aes block, update aes_ptr_in */
ld1 {v3.16b},[x0],16
aese v2.16b,v14.16b
aesmc v2.16b,v2.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
aese v2.16b,v15.16b
aesmc v2.16b,v2.16b
sha256su1 v29.4s,v27.4s,v28.4s
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
ld1 {v6.16b},[x8],16 /* key6 */
ld1 {v7.16b},[x8],16 /* key7 */
aese v2.16b,v16.16b
aesmc v2.16b,v2.16b
___
&aes192_aes256_handle(0, "enc_mainloop", 2, 0);
$code.=<<___;
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
/* mode op 3 */
eor v3.16b,v3.16b,v2.16b /* xor w/prev value */
/* aes xform 3, sha quad 3 (hash only) */
aese v3.16b,v8.16b
aesmc v3.16b,v3.16b
/* save aes res, bump aes_out_ptr */
st1 {v2.16b},[x1],16
aese v3.16b,v9.16b
aesmc v3.16b,v3.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
aese v3.16b,v10.16b
aesmc v3.16b,v3.16b
aese v3.16b,v11.16b
aesmc v3.16b,v3.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
aese v3.16b,v12.16b
aesmc v3.16b,v3.16b
aese v3.16b,v13.16b
aesmc v3.16b,v3.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
aese v3.16b,v14.16b
aesmc v3.16b,v3.16b
sub x15,x15,1 /* dec block count */
aese v3.16b,v15.16b
aesmc v3.16b,v3.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
aese v3.16b,v16.16b
aesmc v3.16b,v3.16b
___
&aes192_aes256_handle(0, "enc_mainloop", 3, 0);
$code.=<<___;
add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
/* save aes res, bump aes_out_ptr */
st1 {v3.16b},[x1],16
ldp q26,q27,[x3],32
ldp q28,q29,[x3],32
sub x5,x5,64
cbnz x15,.Lenc_main_loop /* loop if more to do */
mov w15,0x80 /* that's the 1 of the pad */
/*
* Epilog: process the remaining aes blocks and the second-to-last (b-2)
* sha block. This is done inline (no loop) to overlap with the sha part;
* note there are 0-3 aes blocks left.
*/
rev32 v26.16b,v26.16b /* fix endian w0 */
rev32 v27.16b,v27.16b /* fix endian w1 */
rev32 v28.16b,v28.16b /* fix endian w2 */
rev32 v29.16b,v29.16b /* fix endian w3 */
mov v22.16b,v24.16b /* working ABCD <- ABCD */
mov v23.16b,v25.16b /* working EFGH <- EFGH */
cbz x13, .Lbm2fromQ0 /* skip if none left */
/*
* mode op 0
* read next aes block, update aes_ptr_in
*/
ld1 {v0.16b},[x0],16
/* base address for sha round consts */
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
ld1 {v4.16b},[x8],16 /* key0 */
ld1 {v5.16b},[x8],16 /* key1 */
ld1 {v6.16b},[x8],16 /* key2 */
ld1 {v7.16b},[x8],16 /* key3 */
eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
/* aes xform 0, sha quad 0 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
aese v0.16b,v8.16b
aesmc v0.16b,v0.16b
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256su0 v26.4s,v27.4s
aese v0.16b,v9.16b
aesmc v0.16b,v0.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
aese v0.16b,v10.16b
aesmc v0.16b,v0.16b
sha256su1 v26.4s,v28.4s,v29.4s
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su0 v27.4s,v28.4s
aese v0.16b,v11.16b
aesmc v0.16b,v0.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
aese v0.16b,v12.16b
aesmc v0.16b,v0.16b
sha256su1 v27.4s,v29.4s,v26.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su0 v28.4s,v29.4s
aese v0.16b,v13.16b
aesmc v0.16b,v0.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
aese v0.16b,v14.16b
aesmc v0.16b,v0.16b
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
aese v0.16b,v15.16b
aesmc v0.16b,v0.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
sha256h2 q23, q21, v7.4s
sha256su1 v29.4s,v27.4s,v28.4s
___
&aes192_aes256_handle(1, "enc_epilog", 0, 0);
$code.=<<___;
subs x14,x13,1 /* local copy of aes_blocks_left */
/* save aes res, bump aes_out_ptr */
st1 {v0.16b},[x1],16
/* if aes_blocks_left_count == 0 */
beq .Lbm2fromQ1
/*
* mode op 1
* read next aes block, update aes_ptr_in
*/
ld1 {v1.16b},[x0],16
ld1 {v4.16b},[x8],16 /* key4 */
ld1 {v5.16b},[x8],16 /* key5 */
ld1 {v6.16b},[x8],16 /* key6 */
ld1 {v7.16b},[x8],16 /* key7 */
eor v1.16b,v1.16b,v0.16b /* xor w/prev value */
/* aes xform 1, sha quad 1 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
aese v1.16b,v8.16b
aesmc v1.16b,v1.16b
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su0 v26.4s,v27.4s
aese v1.16b,v9.16b
aesmc v1.16b,v1.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
aese v1.16b,v10.16b
aesmc v1.16b,v1.16b
sha256su1 v26.4s,v28.4s,v29.4s
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su0 v27.4s,v28.4s
aese v1.16b,v11.16b
aesmc v1.16b,v1.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
aese v1.16b,v12.16b
aesmc v1.16b,v1.16b
sha256su1 v27.4s,v29.4s,v26.4s
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su0 v28.4s,v29.4s
aese v1.16b,v13.16b
aesmc v1.16b,v1.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
aese v1.16b,v14.16b
aesmc v1.16b,v1.16b
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
aese v1.16b,v15.16b
aesmc v1.16b,v1.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
sha256su1 v29.4s,v27.4s,v28.4s
___
&aes192_aes256_handle(1, "enc_epilog", 1, 0);
$code.=<<___;
subs x14,x14,1 /* dec counter */
/* save aes res, bump aes_out_ptr */
st1 {v1.16b},[x1],16
/* if aes_blocks_left_count == 0 */
beq .Lbm2fromQ2
/*
* mode op 2
* read next aes block, update aes_ptr_in
*/
ld1 {v2.16b},[x0],16
ld1 {v4.16b},[x8],16 /* key4 */
ld1 {v5.16b},[x8],16 /* key5 */
ld1 {v6.16b},[x8],16 /* key6 */
ld1 {v7.16b},[x8],16 /* key7 */
eor v2.16b,v2.16b,v1.16b /* xor w/prev value */
/* aes xform 2, sha quad 2 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
aese v2.16b,v8.16b
aesmc v2.16b,v2.16b
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256su0 v26.4s,v27.4s
aese v2.16b,v9.16b
aesmc v2.16b,v2.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
aese v2.16b,v10.16b
aesmc v2.16b,v2.16b
sha256su1 v26.4s,v28.4s,v29.4s
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su0 v27.4s,v28.4s
aese v2.16b,v11.16b
aesmc v2.16b,v2.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
aese v2.16b,v12.16b
aesmc v2.16b,v2.16b
sha256su1 v27.4s,v29.4s,v26.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su0 v28.4s,v29.4s
aese v2.16b,v13.16b
aesmc v2.16b,v2.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
aese v2.16b,v14.16b
aesmc v2.16b,v2.16b
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
aese v2.16b,v15.16b
aesmc v2.16b,v2.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
aese v2.16b,v16.16b
aesmc v2.16b,v2.16b
sha256h2 q23, q21, v7.4s
sha256su1 v29.4s,v27.4s,v28.4s
___
&aes192_aes256_handle(1, "enc_epilog", 2, 0);
$code.=<<___;
/* save aes res, bump aes_out_ptr */
st1 {v2.16b},[x1],16
/* join common code at Quad 3 */
b .Lbm2fromQ3
/*
* Now comes the second-to-last (b-2) sha block, before the final padded one.
* Execution joins at the appropriate quad depending on how many aes blocks
* were left; if there were none, the whole block is executed here.
*/
/* quad 0 */
.Lbm2fromQ0:
/* base address for sha round consts */
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
ld1 {v4.16b},[x8],16 /* key0 */
ld1 {v5.16b},[x8],16 /* key1 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key2 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key3 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 1 */
.Lbm2fromQ1:
ld1 {v4.16b},[x8],16 /* key4 */
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 2 */
.Lbm2fromQ2:
ld1 {v4.16b},[x8],16 /* key4 */
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 3 */
.Lbm2fromQ3:
ld1 {v4.16b},[x8],16 /* key4 */
ld1 {v5.16b},[x8],16 /* key5 */
ld1 {v6.16b},[x8],16 /* key6 */
ld1 {v7.16b},[x8],16 /* key7 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256h2 q23, q21, v4.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
eor v26.16b,v26.16b,v26.16b /* zero reg */
sha256h2 q23, q21, v5.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
eor v27.16b,v27.16b,v27.16b /* zero reg */
sha256h2 q23, q21, v6.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
eor v28.16b,v28.16b,v28.16b /* zero reg */
sha256h2 q23, q21, v7.4s
add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
/* Process remaining 0-3 AES blocks here */
eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
cbz x13,.Lpost_long_Q0
/* 1st remaining AES block */
ld1 {v26.16b},[x3],16
sub x5,x5,16
rev32 v26.16b,v26.16b
subs x14,x13,1
b.eq .Lpost_long_Q1
/* 2nd remaining AES block */
ld1 {v27.16b},[x3],16
sub x5,x5,16
rev32 v27.16b,v27.16b
subs x14,x14,1
b.eq .Lpost_long_Q2
/* 3rd remaining AES block */
ld1 {v28.16b},[x3],16
sub x5,x5,16
rev32 v28.16b,v28.16b
/* Allow for filling this sha256 block with the remaining digest src */
b .Lpost_long_Q3
/*
* Process remaining 8B blocks of the digest
*/
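/*
* Tail strategy: the remaining digest source is consumed 8 bytes at a time.
* Each candidate position first receives the 0x80 pad byte (w15) on the
* assumption that the message ends there; if another 8 bytes turn out to be
* available, the pad byte is simply overwritten with real data and the next
* position makes the same assumption.
*/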
.Lpost_long_Q0:
/* blk 0,1 */
/* assume final block */
mov v26.b[3],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_long_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
rev32 x2,x2
/* overwrite previous v26 value (0x80) */
mov v26.d[0],x2
/* assume this was final block */
mov v26.b[11],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_long_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
rev32 x2,x2
mov v26.d[1],x2
.Lpost_long_Q1:
/* blk 2,3 */
/* assume this is final block */
mov v27.b[3],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_long_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
rev32 x2,x2
/* overwrite previous v27 value (0x80) */
mov v27.d[0],x2
/* assume this was final block */
mov v27.b[11],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_long_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
rev32 x2,x2
mov v27.d[1],x2
.Lpost_long_Q2:
/* blk 4,5 */
/* assume this was final block */
mov v28.b[3],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_long_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
rev32 x2,x2
/* overwrite previous v28 value (0x80) */
mov v28.d[0],x2
/* assume this was final block */
mov v28.b[11],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_long_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
rev32 x2,x2
mov v28.d[1],x2
.Lpost_long_Q3:
/* blk 6,7 */
/* assume this was final block */
mov v29.b[3],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_long_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
rev32 x2,x2
/* overwrite previous v29 value (0x80) */
mov v29.d[0],x2
/* assume this was final block */
mov v29.b[11],w15
/*
* Outstanding 8B blocks left.
* Since there has to be another sha block with padding,
* we need to calculate hash without padding here.
*/
cbz x5,1f
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
rev32 x2,x2
/*
* Don't decrease x5 here.
* It is used later to tell whether the "1" padding still has to be added
* at the end.
*/
mov v29.d[1],x2
/*
* That is enough blocks; we allow up to 64 bytes in total.
* Now we have the sha256 to do for these 4 16B blocks.
*/
1:
/* base address for sha round consts */
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
ld1 {v4.16b},[x8],16 /* key0 */
ld1 {v5.16b},[x8],16 /* key1 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v26.4s,v27.4s
mov v22.16b,v24.16b /* working ABCD <- ABCD */
mov v23.16b,v25.16b /* working EFGH <- EFGH */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key2 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key3 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
mov v21.16b, v22.16b /* copy abcd */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
/* this was final block */
cbz x5,.Lpost_long_loop
subs x5,x5,8
/* loop if hash is not finished */
b.ne .Lpost_long_Q0
/* set "1" of the padding if this was a final block */
mov v26.b[3],w15
.Lpost_long_loop:
/* Add outstanding bytes of digest source */
add x11,x11,x12
/* Add one SHA-256 block since hash is calculated including i_key_pad */
add x11,x11, #64
lsr x12,x11,32 /* len_hi */
and x13,x11,0xffffffff /* len_lo */
lsl x12,x12,3 /* len_hi in bits */
lsl x13,x13,3 /* len_lo in bits */
mov v29.s[3],w13 /* len_lo */
mov v29.s[2],w12 /* len_hi */
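/*
* For example (illustration only): with 128 bytes of digest input,
* x11 = 128 + 64 = 192 bytes hashed in total (including the i_key_pad
* block), i.e. 1536 bits, so len_lo = 1536 and len_hi = 0.
*/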
/*
* do last sha of pad block
*/
/* base address for sha round consts */
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
/* quad 0 */
ld1 {v4.16b},[x8],16 /* key0 */
ld1 {v5.16b},[x8],16 /* key1 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v26.4s,v27.4s
mov v22.16b,v24.16b /* working ABCD <- ABCD */
mov v23.16b,v25.16b /* working EFGH <- EFGH */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key2 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key3 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 1 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 2 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 3 */
ldp q6,q7,[x8],32 /* key6,key7 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
mov v21.16b, v22.16b /* copy abcd */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256h q22, q23, v4.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256h2 q23, q21, v4.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
add v26.4s,v24.4s,v22.4s /* ABCD += working copy */
add v27.4s,v25.4s,v23.4s /* EFGH += working copy */
/* Calculate final HMAC */
eor v28.16b, v28.16b, v28.16b
eor v29.16b, v29.16b, v29.16b
/* base address for sha round consts */
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
/* load o_key_pad partial hash */
ldp q24,q25,[x7]
/* Set padding 1 to the first reg */
mov w11, #0x80 /* that's the 1 of the pad */
mov v28.b[3], w11
/* size of o_key_pad + inner hash */
mov x11, #64+32
lsl x11, x11, 3
/* move length to the end of the block */
mov v29.s[3], w11
ldp q4,q5,[x8],32 /* key0,key1 */
lsr x11, x11, 32
mov v29.s[2], w11 /* and the higher part */
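/* 64 + 32 = 96 bytes, i.e. 768 bits, so the high word is zero */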
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v26.4s,v27.4s
mov v22.16b,v24.16b /* working ABCD <- ABCD */
mov v23.16b,v25.16b /* working EFGH <- EFGH */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key2 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key3 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
ldp q6,q7,[x8],32 /* key6,key7 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
mov v21.16b, v22.16b /* copy abcd */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256h q22, q23, v4.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256h2 q23, q21, v4.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ldp d10,d11,[sp,#16]
ldp d12,d13,[sp,#32]
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ldp d14,d15,[sp,#48]
ldp d8,d9,[sp],#64
mov x0, xzr
add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
rev32 v24.16b, v24.16b
rev32 v25.16b, v25.16b
stp q24,q25,[x4] /* save them both */
ret
/*
* These are the short cases (less efficient), here used for 1-11 aes blocks.
* x10 = aes_blocks
*/
.Lenc_short_cases:
ld1 {v3.16b},[x6] /* get ivec */
ldp q8,q9,[x9],32 /* rk[0-1] */
eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
ldp q10,q11,[x9],32 /* rk[2-3] */
eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
ldp q12,q13,[x9],32 /* rk[4-5] */
eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
ldp q14,q15,[x9],32 /* rk[6-7] */
eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
ldp q16,q17,[x9],32 /* rk[8-9] */
mov w15,0x80 /* sha padding word */
lsl x11,x10,4 /* len = aes_blocks*16 */
ld1 {v18.16b},[x9] /* rk[10] */
/* get outstanding bytes of the digest */
sub x12,x5,x2
/*
* The idea in the short loop (at least 1 block) is to break out with the
* padding already in place, except for the final length word.
*/
.Lenc_short_loop:
adrp x8,.Lrcon /* rcon */
add x8,x8,:lo12:.Lrcon
/* read next aes block, update aes_ptr_in */
ld1 {v0.16b},[x0],16
eor v0.16b,v0.16b,v3.16b /* xor w/prev value */
/* aes xform 0 */
aese v0.16b,v8.16b
aesmc v0.16b,v0.16b
aese v0.16b,v9.16b
aesmc v0.16b,v0.16b
aese v0.16b,v10.16b
aesmc v0.16b,v0.16b
aese v0.16b,v11.16b
aesmc v0.16b,v0.16b
aese v0.16b,v12.16b
aesmc v0.16b,v0.16b
aese v0.16b,v13.16b
aesmc v0.16b,v0.16b
aese v0.16b,v14.16b
aesmc v0.16b,v0.16b
aese v0.16b,v15.16b
aesmc v0.16b,v0.16b
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
___
&aes192_aes256_handle(1, "enc_short", 0, 1);
$code.=<<___;
/* save aes res, bump aes_out_ptr */
st1 {v0.16b},[x1],16
/* load next 16 bytes for SHA-256 */
ld1 {v26.16b},[x3],16
/* dec number of bytes of the hash input */
sub x5,x5,16
rev32 v26.16b,v26.16b /* load res to sha 0, endian swap */
sub x10,x10,1 /* dec num_blocks */
cbz x10,.Lpost_short_Q1 /* break if no more */
/* read next aes block, update aes_ptr_in */
ld1 {v1.16b},[x0],16
eor v1.16b,v1.16b,v0.16b /* xor w/prev value */
/* aes xform 1 */
aese v1.16b,v8.16b
aesmc v1.16b,v1.16b
aese v1.16b,v9.16b
aesmc v1.16b,v1.16b
aese v1.16b,v10.16b
aesmc v1.16b,v1.16b
aese v1.16b,v11.16b
aesmc v1.16b,v1.16b
aese v1.16b,v12.16b
aesmc v1.16b,v1.16b
aese v1.16b,v13.16b
aesmc v1.16b,v1.16b
aese v1.16b,v14.16b
aesmc v1.16b,v1.16b
aese v1.16b,v15.16b
aesmc v1.16b,v1.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
___
&aes192_aes256_handle(1, "enc_short", 1, 0);
$code.=<<___;
/* save aes res, bump aes_out_ptr */
st1 {v1.16b},[x1],16
/* load next 16 bytes for SHA-256 */
ld1 {v27.16b},[x3],16
/* dec number of bytes of the hash input */
sub x5,x5,16
rev32 v27.16b,v27.16b /* load res to sha 0, endian swap */
sub x10,x10,1 /* dec num_blocks */
cbz x10,.Lpost_short_Q2 /* break if no more */
/* read next aes block, update aes_ptr_in */
ld1 {v2.16b},[x0],16
eor v2.16b,v2.16b,v1.16b /* xor w/prev value */
/* aes xform 2 */
aese v2.16b,v8.16b
aesmc v2.16b,v2.16b
aese v2.16b,v9.16b
aesmc v2.16b,v2.16b
aese v2.16b,v10.16b
aesmc v2.16b,v2.16b
aese v2.16b,v11.16b
aesmc v2.16b,v2.16b
aese v2.16b,v12.16b
aesmc v2.16b,v2.16b
aese v2.16b,v13.16b
aesmc v2.16b,v2.16b
aese v2.16b,v14.16b
aesmc v2.16b,v2.16b
aese v2.16b,v15.16b
aesmc v2.16b,v2.16b
aese v2.16b,v16.16b
aesmc v2.16b,v2.16b
___
&aes192_aes256_handle(1, "enc_short", 2, 0);
$code.=<<___;
/* save aes res, bump aes_out_ptr */
st1 {v2.16b},[x1],16
/* load next 16 bytes for SHA-256 */
ld1 {v28.16b},[x3],16
/* dec number of bytes of the hash input */
sub x5,x5,16
rev32 v28.16b,v28.16b /* load res to sha 0, endian swap */
sub x10,x10,1 /* dec num_blocks */
cbz x10,.Lpost_short_Q3 /* break if no more */
/* read next aes block, update aes_ptr_in */
ld1 {v3.16b},[x0],16
eor v3.16b,v3.16b,v2.16b /* xor w/ prev value */
/* aes xform 3 */
aese v3.16b,v8.16b
aesmc v3.16b,v3.16b
aese v3.16b,v9.16b
aesmc v3.16b,v3.16b
aese v3.16b,v10.16b
aesmc v3.16b,v3.16b
aese v3.16b,v11.16b
aesmc v3.16b,v3.16b
aese v3.16b,v12.16b
aesmc v3.16b,v3.16b
aese v3.16b,v13.16b
aesmc v3.16b,v3.16b
aese v3.16b,v14.16b
aesmc v3.16b,v3.16b
aese v3.16b,v15.16b
aesmc v3.16b,v3.16b
aese v3.16b,v16.16b
aesmc v3.16b,v3.16b
___
&aes192_aes256_handle(1, "enc_short", 3, 0);
$code.=<<___;
/* save aes res, bump aes_out_ptr */
st1 {v3.16b},[x1],16
/* load next 16 bytes for SHA-256 */
ld1 {v29.16b},[x3],16
/* dec number of bytes of the hash input */
sub x5,x5,16
/* load res to sha 0, endian swap */
rev32 v29.16b,v29.16b
/*
* now we have the sha256 to do for these 4 aes blocks
*/
/* quad 0 */
ld1 {v4.16b},[x8],16 /* key0 */
ld1 {v5.16b},[x8],16 /* key1 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v26.4s,v27.4s
mov v22.16b,v24.16b /* working ABCD <- ABCD */
mov v23.16b,v25.16b /* working EFGH <- EFGH */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key2 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key3 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 1 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 2 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 3 */
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
mov v21.16b, v22.16b /* copy abcd */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
sub x10,x10,1 /* dec num_blocks */
eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
cbnz x10,.Lenc_short_loop /* keep looping if more */
.Lpost_short_Q0:
/* assume this was final block */
mov v26.b[3],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_short_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
rev32 x2,x2
/* overwrite previous v26 value (0x80) */
mov v26.d[0],x2
/* assume this was final block */
mov v26.b[11],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_short_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
rev32 x2,x2
mov v26.d[1],x2
.Lpost_short_Q1:
/* zero out vectors */
eor v27.16b,v27.16b,v27.16b
eor v28.16b,v28.16b,v28.16b
eor v29.16b,v29.16b,v29.16b
/* assume this is final block */
mov v27.b[3],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_short_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
rev32 x2,x2
/* overwrite previous v27 value (0x80) */
mov v27.d[0],x2
/* assume this was final block */
mov v27.b[11],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_short_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
rev32 x2,x2
mov v27.d[1],x2
.Lpost_short_Q2:
/* zero out vectors (repeated if came from Q0) */
eor v28.16b,v28.16b,v28.16b
eor v29.16b,v29.16b,v29.16b
/* assume this was final block */
mov v28.b[3],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_short_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
rev32 x2,x2
/* overwrite previous v28 value (0x80) */
mov v28.d[0],x2
/* assume this was final block */
mov v28.b[11],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_short_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
rev32 x2,x2
mov v28.d[1],x2
.Lpost_short_Q3:
/* zero out vector (repeated if came from Q1) */
eor v29.16b,v29.16b,v29.16b
/* assume this was final block */
mov v29.b[3],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_short_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
rev32 x2,x2
/* overwrite previous v29 value (0x80) */
mov v29.d[0],x2
/* assume this was final block */
mov v29.b[11],w15
/* outstanding 8B blocks left */
cbz x5,1f
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
rev32 x2,x2
mov v29.d[1],x2
/*
* That is enough blocks; we allow up to 64 bytes in total.
* Now we have the sha256 to do for these 4 16B blocks.
*/
1:
/* base address for sha round consts */
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
ld1 {v4.16b},[x8],16 /* key0 */
ld1 {v5.16b},[x8],16 /* key1 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v26.4s,v27.4s
mov v22.16b,v24.16b /* working ABCD <- ABCD */
mov v23.16b,v25.16b /* working EFGH <- EFGH */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key2 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key3 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
mov v21.16b, v22.16b /* copy abcd */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
/* this was final block */
cbz x5,.Lpost_short_loop
subs x5,x5,8
/* loop if hash is not finished */
b.ne .Lpost_short_Q0
/* set "1" of the padding if this was a final block */
mov v26.b[3],w15
/*
* there are between 0 and 3 aes blocks in the final sha256 blocks
*/
.Lpost_short_loop:
/* Add outstanding bytes of digest source */
add x11,x11,x12
/* Add one SHA-256 block since hash is calculated including i_key_pad */
add x11,x11, #64
lsr x12,x11,32 /* len_hi */
and x13,x11,0xffffffff /* len_lo */
lsl x12,x12,3 /* len_hi in bits */
lsl x13,x13,3 /* len_lo in bits */
mov v29.s[3],w13 /* len_lo */
mov v29.s[2],w12 /* len_hi */
/* do final block */
/* base address for sha round consts */
adrp x8,.Lrcon /* top of rcon */
add x8,x8,:lo12:.Lrcon
/* quad 0 */
ld1 {v4.16b},[x8],16 /* key0 */
ld1 {v5.16b},[x8],16 /* key1 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v26.4s,v27.4s
mov v22.16b,v24.16b /* working ABCD <- ABCD */
mov v23.16b,v25.16b /* working EFGH <- EFGH */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key2 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key3 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 1 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 2 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 3 */
ldp q6,q7,[x8],32 /* key6,key7 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
mov v21.16b, v22.16b /* copy abcd */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256h q22, q23, v4.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256h2 q23, q21, v4.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
add v26.4s,v24.4s,v22.4s /* ABCD += working copy */
add v27.4s,v25.4s,v23.4s /* EFGH += working copy */
/* Calculate final HMAC */
eor v28.16b, v28.16b, v28.16b
eor v29.16b, v29.16b, v29.16b
/* base address for sha round consts */
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
/* load o_key_pad partial hash */
ldp q24,q25,[x7]
/* Set padding 1 to the first reg */
mov w11, #0x80 /* that's the 1 of the pad */
mov v28.b[3], w11
/* size of o_key_pad + inner hash */
mov x11, #64+32
lsl x11, x11, 3
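/* 64 + 32 = 96 bytes -> 768 bits (0x300); the high word is zero */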
/* move length to the end of the block */
mov v29.s[3], w11
ldp q4,q5,[x8],32 /* key0,key1 */
lsr x11, x11, 32
mov v29.s[2], w11 /* and the higher part */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v26.4s,v27.4s
mov v22.16b,v24.16b /* working ABCD <- ABCD */
mov v23.16b,v25.16b /* working EFGH <- EFGH */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key2 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key3 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
ldp q6,q7,[x8],32 /* key6,key7 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
mov v21.16b, v22.16b /* copy abcd */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256h q22, q23, v4.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256h2 q23, q21, v4.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
ldp d10,d11,[sp,#16]
ldp d12,d13,[sp,#32]
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ldp d14,d15,[sp,#48]
ldp d8,d9,[sp],#64
mov x0, xzr
add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
rev32 v24.16b, v24.16b
rev32 v25.16b, v25.16b
stp q24,q25,[x4] /* save them both */
ret
.size asm_aescbc_sha256_hmac, .-asm_aescbc_sha256_hmac
/*
* Description:
*
* Combined Auth/Dec Primitive = sha256_hmac/aes128cbc
*
* Operations:
*
* out = decrypt-AES128CBC(in)
* return_hash_ptr = SHA256(o_key_pad | SHA256(i_key_pad | in))
*
* Prototype:
*
* void asm_sha256_hmac_aescbc_dec(uint8_t *csrc, uint8_t *cdst, uint64_t clen,
* uint8_t *dsrc, uint8_t *ddst, uint64_t dlen,
* CIPH_DIGEST *arg)
*
* Registers used:
*
* asm_sha256_hmac_aescbc_dec(
* csrc, x0 (cipher src address)
* cdst, x1 (cipher dst address)
* clen x2 (cipher length)
* dsrc, x3 (digest src address)
* ddst, x4 (digest dst address)
* dlen, x5 (digest length)
* arg x6:
* arg->cipher.key (round keys)
* arg->cipher.key_rounds (key rounds)
* arg->cipher.iv (initialization vector)
* arg->digest.hmac.i_key_pad (partially hashed i_key_pad)
* arg->digest.hmac.o_key_pad (partially hashed o_key_pad)
* )
*
* Routine register definitions:
*
* v0 - v3 -- aes results
* v4 - v7 -- round consts for sha
* v8 - v18 -- round keys
* v19 - v20 -- round keys
* v21 -- ABCD tmp
* v22 -- sha working state ABCD (q22)
* v23 -- sha working state EFGH (q23)
* v24 -- sha state ABCD
* v25 -- sha state EFGH
* v26 -- sha block 0
* v27 -- sha block 1
* v28 -- sha block 2
* v29 -- sha block 3
* v30 -- reserved
* v31 -- reserved
*
*
* Constraints:
*
* The variable "clen" must be a multiple of 16, otherwise results are not
* defined For AES partial blocks the user is required to pad the input to
* modulus 16 = 0.
*
* The variable "dlen" must be a multiple of 8 and greater or equal to "clen".
* The maximum difference between "dlen" and "clen" cannot exceed 64 bytes.
* This constrain is strictly related to the needs of the IPSec ESP packet.
* Short lengths are less optimized at < 16 AES blocks, however they are
* somewhat optimized, and more so than the enc/auth versions.
*/
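/*
 * Minimal caller sketch (C, illustrative only). The field and buffer names
 * below are assumptions based on the argument description above, not
 * definitions taken from this file:
 *
 *   CIPH_DIGEST arg;
 *   arg.cipher.key            = round_keys;  (expanded AES key schedule)
 *   arg.cipher.key_rounds     = rounds;      (e.g. 10/12/14 for AES-128/192/256)
 *   arg.cipher.iv             = iv;          (16-byte CBC IV)
 *   arg.digest.hmac.i_key_pad = ipad_hash;   (partially hashed i_key_pad)
 *   arg.digest.hmac.o_key_pad = opad_hash;   (partially hashed o_key_pad)
 *
 *   clen must be a multiple of 16, dlen a multiple of 8,
 *   with clen <= dlen <= clen + 64:
 *   asm_sha256_hmac_aescbc_dec(csrc, cdst, clen, dsrc, ddst, dlen, &arg);
 */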
.global asm_sha256_hmac_aescbc_dec
.type asm_sha256_hmac_aescbc_dec,%function
asm_sha256_hmac_aescbc_dec:
AARCH64_VALID_CALL_TARGET
/* protect registers */
stp d8,d9,[sp, #-80]!
/* fetch args */
ldr x7, [x6, #HMAC_IKEYPAD]
/* init ABCD, EFGH */
ldp q24,q25,[x7]
/* save pointer to o_key_pad partial hash */
ldr x7, [x6, #HMAC_OKEYPAD]
stp d10,d11,[sp,#16]
prfm PLDL1KEEP,[x0,0] /* pref next aes_ptr_in */
stp d12,d13,[sp,#32]
prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */
lsr x10,x2,4 /* aes_blocks = len/16 */
stp d14,d15,[sp,#48]
/* address of sha init state consts */
adrp x12,.Linit_sha_state
add x12,x12,:lo12:.Linit_sha_state
stp x19,x20,[sp,#64]
ldr x9, [x6, #CIPHER_KEY]
ldr x16, [x6, #CIPHER_KEY_ROUNDS]
ldr x6, [x6, #CIPHER_IV]
add x17, x9, #160 /* point to the last 5 rounds keys */
/*
* Init sha state, prefetch, check for small cases.
* Note that the output is prefetched as a load, for the in-place case.
*/
cmp x10,16 /* no main loop if <16 */
blt .Ldec_short_cases /* branch if < 16 */
/* get outstanding bytes of the digest */
sub x20,x5,x2
mov x11,x2 /* len -> x11 needed at end */
ld1 {v30.16b},[x6] /* get 1st ivec */
lsr x12,x11,6 /* total_blocks (sha) */
ldp q26,q27,[x3],32
rev32 v26.16b,v26.16b /* endian swap w0 */
rev32 v27.16b,v27.16b /* endian swap w1 */
ldp q28,q29,[x3],32
rev32 v28.16b,v28.16b /* endian swap w2 */
rev32 v29.16b,v29.16b /* endian swap w3 */
/* subtract loaded bytes */
sub x5,x5,64
/*
* now we can do the loop prolog, 1st sha256 block
*/
prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */
prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */
/* base address for sha round consts */
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
/*
* do the first sha256 block on the plaintext
*/
mov v22.16b,v24.16b /* init working ABCD */
mov v23.16b,v25.16b /* init working EFGH */
/* quad 0 */
ld1 {v4.16b},[x8],16 /* key0 */
ld1 {v5.16b},[x8],16 /* key1 */
ld1 {v6.16b},[x8],16 /* key2 */
ld1 {v7.16b},[x8],16 /* key3 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v8.16b},[x9],16 /* rk[0] */
sha256h2 q23, q21, v4.4s
ld1 {v4.16b},[x8],16 /* key4 */
sha256su1 v26.4s,v28.4s,v29.4s
ld1 {v9.16b},[x9],16 /* rk[1] */
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v10.16b},[x9],16 /* rk[2] */
sha256h2 q23, q21, v5.4s
ld1 {v5.16b},[x8],16 /* key5 */
sha256su1 v27.4s,v29.4s,v26.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256h2 q23, q21, v6.4s
ld1 {v6.16b},[x8],16 /* key6 */
sha256su1 v28.4s,v26.4s,v27.4s
ld1 {v11.16b},[x9],16 /* rk[3] */
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256h2 q23, q21, v7.4s
ld1 {v7.16b},[x8],16 /* key7 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 1 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v12.16b},[x9],16 /* rk[4] */
sha256h2 q23, q21, v4.4s
ld1 {v4.16b},[x8],16 /* key4 */
sha256su1 v26.4s,v28.4s,v29.4s
ld1 {v13.16b},[x9],16 /* rk[5] */
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v14.16b},[x9],16 /* rk[6] */
sha256h2 q23, q21, v5.4s
ld1 {v5.16b},[x8],16 /* key5 */
sha256su1 v27.4s,v29.4s,v26.4s
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256h2 q23, q21, v6.4s
ld1 {v6.16b},[x8],16 /* key6 */
sha256su1 v28.4s,v26.4s,v27.4s
ld1 {v15.16b},[x9],16 /* rk[7] */
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256h2 q23, q21, v7.4s
ld1 {v7.16b},[x8],16 /* key7 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 2 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v16.16b},[x9],16 /* rk[8] */
sha256h2 q23, q21, v4.4s
ld1 {v4.16b},[x8],16 /* key4 */
sha256su1 v26.4s,v28.4s,v29.4s
ld1 {v17.16b},[x9],16 /* rk[9] */
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v18.16b},[x9],16 /* rk[10] */
sha256h2 q23, q21, v5.4s
ld1 {v5.16b},[x8],16 /* key5 */
sha256su1 v27.4s,v29.4s,v26.4s
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256h2 q23, q21, v6.4s
ld1 {v6.16b},[x8],16 /* key6 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256h2 q23, q21, v7.4s
ld1 {v7.16b},[x8],16 /* key7 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 3 */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256h2 q23, q21, v4.4s
ld1 {v26.16b},[x3],16 /* next w0 */
ld1 {v27.16b},[x3],16 /* next w1 */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256h2 q23, q21, v5.4s
ld1 {v28.16b},[x3],16 /* next w2 */
ld1 {v29.16b},[x3],16 /* next w3 */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
/* subtract loaded bytes */
sub x5,x5,64
/*
* aes_blocks_left := number of aes blocks remaining after the main (sha)
* processing is done; can be 0. Note we account for the extra unwind in
* main_blocks.
*/
sub x15,x12,2 /* main_blocks=total_blocks-2 */
add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
and x13,x10,3 /* aes_blocks_left */
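/* e.g. aes_blocks = 19 -> aes_blocks_left = 19 & 3 = 3 (illustrative) */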
ld1 {v0.16b},[x0] /* next aes block, no update */
add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
add x9,x0,128 /* lead_ptr = *in */
/* next aes block, update aes_ptr_in */
ld1 {v31.16b},[x0],16
/* indicate AES blocks to write back */
mov x19,xzr
/*
* main combined loop CBC, can be used by auth/enc version
*/
.Ldec_main_loop:
/*
* Because mov, rev32 and eor each have a busy cycle, this takes longer
* than it looks.
*/
rev32 v26.16b,v26.16b /* fix endian w0 */
mov v22.16b,v24.16b /* working ABCD <- ABCD */
prfm PLDL1KEEP,[x9,64] /* pref next lead_ptr */
rev32 v27.16b,v27.16b /* fix endian w1 */
/* pref next aes_ptr_out, streaming */
prfm PLDL1KEEP,[x1,64]
mov v23.16b,v25.16b /* working EFGH <- EFGH */
/* base address for sha round consts */
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
/*
* aes xform 0, sha quad 0
*/
aesd v0.16b,v8.16b
aesimc v0.16b,v0.16b
ld1 {v4.16b},[x8],16 /* key0 */
rev32 v28.16b,v28.16b /* fix endian w2 */
aesd v0.16b,v9.16b
aesimc v0.16b,v0.16b
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
ld1 {v5.16b},[x8],16 /* key1 */
sha256su0 v26.4s,v27.4s
aesd v0.16b,v10.16b
aesimc v0.16b,v0.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
aesd v0.16b,v11.16b
aesimc v0.16b,v0.16b
ld1 {v6.16b},[x8],16 /* key2 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
ld1 {v7.16b},[x8],16 /* key3 */
rev32 v29.16b,v29.16b /* fix endian w3 */
/* read next aes block, no update */
ld1 {v1.16b},[x0]
sha256h2 q23, q21, v4.4s
aesd v0.16b,v12.16b
aesimc v0.16b,v0.16b
sha256su1 v26.4s,v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
ld1 {v4.16b},[x8],16 /* key4 */
sha256su0 v27.4s,v28.4s
aesd v0.16b,v13.16b
aesimc v0.16b,v0.16b
sha256h q22, q23, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256h2 q23, q21, v5.4s
aesd v0.16b,v14.16b
aesimc v0.16b,v0.16b
ld1 {v5.16b},[x8],16 /* key5 */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
aesd v0.16b,v15.16b
aesimc v0.16b,v0.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
sha256su1 v28.4s,v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256su0 v29.4s,v26.4s
sha256h q22, q23, v7.4s
___
&aes192_aes256_dec_handle(1,"dec_mainloop",0,0);
$code.=<<___;
sha256h2 q23, q21, v7.4s
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
ld1 {v6.16b},[x8],16 /* key6 */
eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
/* get next aes block, with update */
ld1 {v30.16b},[x0],16
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
/* aes xform 1, sha quad 1 */
sha256su0 v26.4s,v27.4s
ld1 {v7.16b},[x8],16 /* key7 */
mov v21.16b, v22.16b /* copy abcd */
/* save aes res, bump aes_out_ptr */
st1 {v0.16b},[x1],16
aesd v1.16b,v8.16b
aesimc v1.16b,v1.16b
sha256h q22, q23, v4.4s
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256h2 q23, q21, v4.4s
sha256su1 v26.4s,v28.4s,v29.4s
aesd v1.16b,v9.16b
aesimc v1.16b,v1.16b
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
aesd v1.16b,v10.16b
aesimc v1.16b,v1.16b
/* read next aes block, no update */
ld1 {v2.16b},[x0]
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v27.4s,v29.4s,v26.4s
ld1 {v4.16b},[x8],16 /* key4 */
aesd v1.16b,v11.16b
aesimc v1.16b,v1.16b
ld1 {v5.16b},[x8],16 /* key5 */
mov v21.16b, v22.16b /* copy abcd */
sha256su0 v28.4s,v29.4s
sha256h q22, q23, v6.4s
aesd v1.16b,v12.16b
aesimc v1.16b,v1.16b
sha256h2 q23, q21, v6.4s
ld1 {v6.16b},[x8],16 /* key6 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
aesd v1.16b,v13.16b
aesimc v1.16b,v1.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
aesd v1.16b,v14.16b
aesimc v1.16b,v1.16b
ld1 {v7.16b},[x8],16 /* key7 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
aesd v1.16b,v15.16b
aesimc v1.16b,v1.16b
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
___
&aes192_aes256_dec_handle(1,"dec_mainloop",1,0);
$code.=<<___;
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */
/* read next aes block, update aes_ptr_in */
ld1 {v31.16b},[x0],16
/* aes xform 2, sha quad 2 */
sha256su0 v26.4s,v27.4s
aesd v2.16b,v8.16b
aesimc v2.16b,v2.16b
/* save aes res, bump aes_out_ptr */
st1 {v1.16b},[x1],16
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
aesd v2.16b,v9.16b
aesimc v2.16b,v2.16b
sha256su1 v26.4s,v28.4s,v29.4s
ld1 {v4.16b},[x8],16 /* key4 */
sha256su0 v27.4s,v28.4s
aesd v2.16b,v10.16b
aesimc v2.16b,v2.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
aesd v2.16b,v11.16b
aesimc v2.16b,v2.16b
sha256su1 v27.4s,v29.4s,v26.4s
ld1 {v5.16b},[x8],16 /* key5 */
sha256su0 v28.4s,v29.4s
aesd v2.16b,v12.16b
aesimc v2.16b,v2.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
aesd v2.16b,v13.16b
aesimc v2.16b,v2.16b
sha256su1 v28.4s,v26.4s,v27.4s
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v29.4s,v26.4s
/* read next aes block, no update */
ld1 {v3.16b},[x0]
aesd v2.16b,v14.16b
aesimc v2.16b,v2.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
aesd v2.16b,v15.16b
aesimc v2.16b,v2.16b
sha256su1 v29.4s,v27.4s,v28.4s
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
ld1 {v6.16b},[x8],16 /* key6 */
ld1 {v7.16b},[x8],16 /* key7 */
aesd v2.16b,v16.16b
aesimc v2.16b,v2.16b
___
&aes192_aes256_dec_handle(1,"dec_mainloop",2,0);
$code.=<<___;
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/prev value */
/* read next aes block, update aes_ptr_in */
ld1 {v30.16b},[x0],16
/* aes xform 3, sha quad 3 (hash only) */
aesd v3.16b,v8.16b
aesimc v3.16b,v3.16b
/* save aes res, bump aes_out_ptr */
st1 {v2.16b},[x1],16
aesd v3.16b,v9.16b
aesimc v3.16b,v3.16b
ld1 {v26.16b},[x3],16 /* next w0 */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
aesd v3.16b,v10.16b
aesimc v3.16b,v3.16b
ld1 {v27.16b},[x3],16 /* next w1 */
aesd v3.16b,v11.16b
aesimc v3.16b,v3.16b
ld1 {v28.16b},[x3],16 /* next w2 */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
aesd v3.16b,v12.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v13.16b
aesimc v3.16b,v3.16b
ld1 {v29.16b},[x3],16 /* next w3 */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
aesd v3.16b,v14.16b
aesimc v3.16b,v3.16b
sub x15,x15,1 /* dec block count */
aesd v3.16b,v15.16b
aesimc v3.16b,v3.16b
ld1 {v0.16b},[x0] /* next aes block, no update */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
aesd v3.16b,v16.16b
aesimc v3.16b,v3.16b
___
&aes192_aes256_dec_handle(1,"dec_mainloop",3,0);
$code.=<<___;
add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
/* next aes block, update aes_ptr_in */
ld1 {v31.16b},[x0],16
add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
/* save aes res, bump aes_out_ptr */
st1 {v3.16b},[x1],16
/* subtract loaded bytes */
sub x5,x5,64
cbnz x15,.Ldec_main_loop /* loop if more to do */
/*
* Now the loop epilog. Since the reads for sha have already been done
* in advance, we have to have an extra unwind.
* This is why the test for the short cases is 16 and not 12.
*
* The unwind is just the main loop without the tests or final reads.
*/
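/*
 * Accounting note: the prolog hashed one sha block and this unwind hashes
 * the last pre-read one, hence main_blocks = total_blocks - 2 earlier.
 */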
rev32 v26.16b,v26.16b /* fix endian w0 */
mov v22.16b,v24.16b /* working ABCD <- ABCD */
rev32 v27.16b,v27.16b /* fix endian w1 */
/* pref next aes_ptr_out, streaming */
prfm PLDL1KEEP,[x1,64]
mov v23.16b,v25.16b /* working EFGH <- EFGH */
/* base address for sha round consts */
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
ld1 {v4.16b},[x8],16 /* key0 */
ld1 {v5.16b},[x8],16 /* key1 */
/*
* aes xform 0, sha quad 0
*/
aesd v0.16b,v8.16b
aesimc v0.16b,v0.16b
ld1 {v6.16b},[x8],16 /* key2 */
rev32 v28.16b,v28.16b /* fix endian w2 */
ld1 {v7.16b},[x8],16 /* key3 */
/* read next aes block, no update */
ld1 {v1.16b},[x0]
aesd v0.16b,v9.16b
aesimc v0.16b,v0.16b
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v26.4s,v27.4s
aesd v0.16b,v10.16b
aesimc v0.16b,v0.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
aesd v0.16b,v11.16b
aesimc v0.16b,v0.16b
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
rev32 v29.16b,v29.16b /* fix endian w3 */
sha256h2 q23, q21, v4.4s
aesd v0.16b,v12.16b
aesimc v0.16b,v0.16b
sha256su1 v26.4s,v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
ld1 {v4.16b},[x8],16 /* key4 */
sha256su0 v27.4s,v28.4s
aesd v0.16b,v13.16b
aesimc v0.16b,v0.16b
sha256h q22, q23, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256h2 q23, q21, v5.4s
aesd v0.16b,v14.16b
aesimc v0.16b,v0.16b
ld1 {v5.16b},[x8],16 /* key5 */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
aesd v0.16b,v15.16b
aesimc v0.16b,v0.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
sha256su1 v28.4s,v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256su0 v29.4s,v26.4s
sha256h q22, q23, v7.4s
___
&aes192_aes256_dec_handle(1,"dec_epilog",0,0);
$code.=<<___;
sha256h2 q23, q21, v7.4s
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
ld1 {v6.16b},[x8],16 /* key6 */
eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
/* read next aes block, update aes_ptr_in */
ld1 {v30.16b},[x0],16
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
/* aes xform 1, sha quad 1 */
sha256su0 v26.4s,v27.4s
ld1 {v7.16b},[x8],16 /* key7 */
mov v21.16b, v22.16b /* copy abcd */
/* save aes res, bump aes_out_ptr */
st1 {v0.16b},[x1],16
aesd v1.16b,v8.16b
aesimc v1.16b,v1.16b
sha256h q22, q23, v4.4s
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256h2 q23, q21, v4.4s
sha256su1 v26.4s,v28.4s,v29.4s
aesd v1.16b,v9.16b
aesimc v1.16b,v1.16b
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
aesd v1.16b,v10.16b
aesimc v1.16b,v1.16b
/* read next aes block, no update */
ld1 {v2.16b},[x0]
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v27.4s,v29.4s,v26.4s
ld1 {v4.16b},[x8],16 /* key4 */
aesd v1.16b,v11.16b
aesimc v1.16b,v1.16b
ld1 {v5.16b},[x8],16 /* key5 */
mov v21.16b, v22.16b /* copy abcd */
sha256su0 v28.4s,v29.4s
sha256h q22, q23, v6.4s
aesd v1.16b,v12.16b
aesimc v1.16b,v1.16b
sha256h2 q23, q21, v6.4s
ld1 {v6.16b},[x8],16 /* key6 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
aesd v1.16b,v13.16b
aesimc v1.16b,v1.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
aesd v1.16b,v14.16b
aesimc v1.16b,v1.16b
ld1 {v7.16b},[x8],16 /* key7 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
aesd v1.16b,v15.16b
aesimc v1.16b,v1.16b
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
___
&aes192_aes256_dec_handle(1,"dec_epilog",1,0);
$code.=<<___;
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */
/* read next aes block, update aes_ptr_in */
ld1 {v31.16b},[x0],16
/* mode op 2 */
/* aes xform 2, sha quad 2 */
sha256su0 v26.4s,v27.4s
aesd v2.16b,v8.16b
aesimc v2.16b,v2.16b
/* save aes res, bump aes_out_ptr */
st1 {v1.16b},[x1],16
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
aesd v2.16b,v9.16b
aesimc v2.16b,v2.16b
sha256su1 v26.4s,v28.4s,v29.4s
ld1 {v4.16b},[x8],16 /* key4 */
sha256su0 v27.4s,v28.4s
aesd v2.16b,v10.16b
aesimc v2.16b,v2.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
aesd v2.16b,v11.16b
aesimc v2.16b,v2.16b
sha256su1 v27.4s,v29.4s,v26.4s
ld1 {v5.16b},[x8],16 /* key5 */
sha256su0 v28.4s,v29.4s
aesd v2.16b,v12.16b
aesimc v2.16b,v2.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
aesd v2.16b,v13.16b
aesimc v2.16b,v2.16b
sha256su1 v28.4s,v26.4s,v27.4s
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v29.4s,v26.4s
/* read next aes block, no update */
ld1 {v3.16b},[x0]
aesd v2.16b,v14.16b
aesimc v2.16b,v2.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
aesd v2.16b,v15.16b
aesimc v2.16b,v2.16b
sha256su1 v29.4s,v27.4s,v28.4s
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
ld1 {v6.16b},[x8],16 /* key6 */
ld1 {v7.16b},[x8],16 /* key7 */
aesd v2.16b,v16.16b
aesimc v2.16b,v2.16b
___
&aes192_aes256_dec_handle(1,"dec_epilog",2,0);
$code.=<<___;
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/prev value */
/* read next aes block, update aes_ptr_in */
ld1 {v30.16b},[x0],16
/* mode op 3 */
/* aes xform 3, sha quad 3 (hash only) */
aesd v3.16b,v8.16b
aesimc v3.16b,v3.16b
/* save aes res, bump aes_out_ptr */
st1 {v2.16b},[x1],16
aesd v3.16b,v9.16b
aesimc v3.16b,v3.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
aesd v3.16b,v10.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v11.16b
aesimc v3.16b,v3.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
aesd v3.16b,v12.16b
aesimc v3.16b,v3.16b
/* read first aes block, no bump */
ld1 {v0.16b},[x0]
aesd v3.16b,v13.16b
aesimc v3.16b,v3.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
aesd v3.16b,v14.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v15.16b
aesimc v3.16b,v3.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
aesd v3.16b,v16.16b
aesimc v3.16b,v3.16b
___
&aes192_aes256_dec_handle(1,"dec_epilog",3,0);
$code.=<<___;
add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
eor v3.16b,v3.16b,v31.16b /* xor w/prev value */
/* read first aes block, bump aes_ptr_in */
ld1 {v31.16b},[x0],16
/*
* now we have to do the 4 aes blocks (b-2) that catch up to where sha is
*/
/* aes xform 0 */
aesd v0.16b,v8.16b
aesimc v0.16b,v0.16b
/* save aes res, bump aes_out_ptr */
st1 {v3.16b},[x1],16
aesd v0.16b,v9.16b
aesimc v0.16b,v0.16b
/* read next aes block, no update */
ld1 {v1.16b},[x0]
aesd v0.16b,v10.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v11.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v12.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v13.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v14.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v15.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
___
&aes192_aes256_dec_handle(1,"dec_catchup",0,0);
$code.=<<___;
eor v0.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */
/* read next aes block, update aes_ptr_in */
ld1 {v30.16b},[x0],16
/* aes xform 1 */
/* read next aes block, no update */
ld1 {v2.16b},[x0]
aesd v1.16b,v8.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v9.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v10.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v11.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v12.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v13.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v14.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v15.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
___
&aes192_aes256_dec_handle(1,"dec_catchup",1,0);
$code.=<<___;
eor v1.16b,v1.16b,v31.16b /* xor w/ ivec (modeop) */
/* read next aes block, update aes_ptr_in */
ld1 {v31.16b},[x0],16
/* aes xform 2 */
aesd v2.16b,v8.16b
aesimc v2.16b,v2.16b
/* read next aes block, no update */
ld1 {v3.16b},[x0]
aesd v2.16b,v9.16b
aesimc v2.16b,v2.16b
aesd v2.16b,v10.16b
aesimc v2.16b,v2.16b
aesd v2.16b,v11.16b
aesimc v2.16b,v2.16b
aesd v2.16b,v12.16b
aesimc v2.16b,v2.16b
aesd v2.16b,v13.16b
aesimc v2.16b,v2.16b
aesd v2.16b,v14.16b
aesimc v2.16b,v2.16b
aesd v2.16b,v15.16b
aesimc v2.16b,v2.16b
aesd v2.16b,v16.16b
aesimc v2.16b,v2.16b
___
&aes192_aes256_dec_handle(1,"dec_catchup",2,0);
$code.=<<___;
eor v2.16b,v2.16b,v30.16b /* xor w/ ivec (modeop) */
/* read next aes block, update aes_ptr_in */
ld1 {v30.16b},[x0],16
/* aes xform 3 */
aesd v3.16b,v8.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v9.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v10.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v11.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v12.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v13.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v14.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v15.16b
aesimc v3.16b,v3.16b
eor v26.16b,v26.16b,v26.16b
eor v27.16b,v27.16b,v27.16b
aesd v3.16b,v16.16b
aesimc v3.16b,v3.16b
eor v28.16b,v28.16b,v28.16b
eor v29.16b,v29.16b,v29.16b
___
&aes192_aes256_dec_handle(1,"dec_catchup",3,0);
$code.=<<___;
eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */
add x19,x19,4
/*
* Now there is the final b-1 sha256 padded block.
* This contains between 0 and 3 aes blocks. We take some pains to avoid
* reading past the defined data by only reading the blocks that are
* actually defined. This is also the final sha block code for the short cases.
*/
.Ljoin_common:
/* base address for sha round consts */
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
mov w15,0x80 /* that's the 1 of the pad */
.Lpost_loop_Q0:
/* assume this was final block */
mov v26.b[0],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
/* overwrite previous v26 value (0x80) */
mov v26.d[0],x2
/* assume this was final block */
mov v26.b[8],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
mov v26.d[1],x2
.Lpost_loop_Q1:
/* assume this is final block */
mov v27.b[0],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
/* overwrite previous v27 value (0x80) */
mov v27.d[0],x2
/* assume this was final block */
mov v27.b[8],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
mov v27.d[1],x2
.Lpost_loop_Q2:
/* assume this was final block */
mov v28.b[0],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
/* overwrite previous v28 value (0x80) */
mov v28.d[0],x2
/* assume this was final block */
mov v28.b[8],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
mov v28.d[1],x2
.Lpost_loop_Q3:
/* assume this was final block */
mov v29.b[3],w15
/* outstanding 8B blocks left */
cbz x5,.Lpost_loop
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
sub x5,x5,8
rev32 x2,x2
/* overwrite previous v29 value (0x80) */
mov v29.d[0],x2
/* assume this was final block */
mov v29.b[11],w15
/* outstanding 8B blocks left */
cbz x5,1f
/* at least 8B left to go, it is safe to fetch this data */
ldr x2,[x3],8
rev32 x2,x2
mov v29.d[1],x2
/*
* That is enough blocks; we allow up to 64 bytes in total.
* Now we do the sha256 for these four 16B blocks.
*/
1:
mov x9,x8
rev32 v26.16b,v26.16b
ld1 {v4.16b},[x9],16 /* key0 */
rev32 v27.16b,v27.16b
rev32 v28.16b,v28.16b
ld1 {v5.16b},[x9],16 /* key1 */
//rev32 v29.16b,v29.16b
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v26.4s,v27.4s
mov v22.16b,v24.16b /* working ABCD <- ABCD */
mov v23.16b,v25.16b /* working EFGH <- EFGH */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x9],16 /* key2 */
sha256h2 q23, q21, v4.4s
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x9],16 /* key3 */
sha256h2 q23, q21, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
ld1 {v4.16b},[x9],16 /* key4 */
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
ld1 {v5.16b},[x9],16 /* key5 */
sha256h2 q23, q21, v7.4s
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 1 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x9],16 /* key6 */
sha256h2 q23, q21, v4.4s
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x9],16 /* key7 */
sha256h2 q23, q21, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
ld1 {v4.16b},[x9],16 /* key4 */
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x9],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 2 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x9],16 /* key6 */
sha256h2 q23, q21, v4.4s
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x9],16 /* key7 */
sha256h2 q23, q21, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
ld1 {v4.16b},[x9],16 /* key4 */
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
ld1 {v5.16b},[x9],16 /* key5 */
sha256h2 q23, q21, v7.4s
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 3 */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x9],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
sha256h2 q23, q21, v4.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x9],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
sha256h2 q23, q21, v5.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
sha256h2 q23, q21, v6.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
sha256h2 q23, q21, v7.4s
add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
/* this was final block */
cbz x5,.Lpost_loop
subs x5,x5,8
/* loop if hash is not finished */
b.ne .Lpost_loop_Q0
/* set "1" of the padding if this was a final block */
mov v26.b[0],w15
.Lpost_loop:
/* Add outstanding bytes of digest source */
add x11,x11,x20
/* Add one SHA-2 block since hash is calculated including i_key_pad */
add x11,x11,#64
lsr x12,x11,32 /* len_hi */
and x14,x11,0xffffffff /* len_lo */
lsl x12,x12,3 /* len_hi in bits */
lsl x14,x14,3 /* len_lo in bits */
mov v29.s[3],w14 /* len_lo */
mov v29.s[2],w12 /* len_hi */
rev32 v26.16b,v26.16b /* fix endian w0 */
mov v22.16b,v24.16b /* working ABCD <- ABCD */
rev32 v27.16b,v27.16b /* fix endian w1 */
mov v23.16b,v25.16b /* working EFGH <- EFGH */
rev32 v28.16b,v28.16b /* fix endian w2 */
/* skip write back if there were fewer than 4 AES blocks */
cbz x19,1f
/*
* At this point all data should be fetched for SHA.
* Save remaining blocks without danger of overwriting SHA source.
*/
stp q0,q1,[x1],32
stp q2,q3,[x1],32
1:
/*
* final sha block
* the strategy is to combine the 0-3 aes blocks, which is faster but
* a little gourmand on code space.
*/
cbz x13,.Lzero_aes_blocks_left /* none to do */
/* read first aes block, bump aes_ptr_in */
ld1 {v0.16b},[x0]
ld1 {v31.16b},[x0],16
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
ld1 {v4.16b},[x8],16 /* key0 */
aesd v0.16b,v8.16b
aesimc v0.16b,v0.16b
ld1 {v5.16b},[x8],16 /* key1 */
ld1 {v6.16b},[x8],16 /* key2 */
aesd v0.16b,v9.16b
aesimc v0.16b,v0.16b
ld1 {v7.16b},[x8],16 /* key3 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
aesd v0.16b,v10.16b
aesimc v0.16b,v0.16b
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
aesd v0.16b,v11.16b
aesimc v0.16b,v0.16b
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
aesd v0.16b,v12.16b
aesimc v0.16b,v0.16b
sha256h2 q23, q21, v4.4s
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
aesd v0.16b,v13.16b
aesimc v0.16b,v0.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
aesd v0.16b,v14.16b
aesimc v0.16b,v0.16b
sha256su1 v27.4s,v29.4s,v26.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
aesd v0.16b,v15.16b
aesimc v0.16b,v0.16b
sha256h2 q23, q21, v6.4s
sha256su1 v28.4s,v26.4s,v27.4s
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
___
&aes192_aes256_dec_handle(1,"dec_final1",0,0);
$code.=<<___;
sha256su1 v29.4s,v27.4s,v28.4s
eor v3.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */
sub x13,x13,1 /* dec counter */
/* save aes res, bump aes_out_ptr */
st1 {v3.16b},[x1],16
cbz x13,.Lfrmquad1
/* aes xform 1 */
/* read first aes block, bump aes_ptr_in */
ld1 {v0.16b},[x0]
ld1 {v30.16b},[x0],16
ld1 {v4.16b},[x8],16 /* key4 */
ld1 {v5.16b},[x8],16 /* key5 */
ld1 {v6.16b},[x8],16 /* key6 */
ld1 {v7.16b},[x8],16 /* key7 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
aesd v0.16b,v8.16b
aesimc v0.16b,v0.16b
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
aesd v0.16b,v9.16b
aesimc v0.16b,v0.16b
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
aesd v0.16b,v10.16b
aesimc v0.16b,v0.16b
sha256h q22, q23, v4.4s
sha256h2 q23, q21, v4.4s
aesd v0.16b,v11.16b
aesimc v0.16b,v0.16b
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
aesd v0.16b,v12.16b
aesimc v0.16b,v0.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
sha256h2 q23, q21, v5.4s
aesd v0.16b,v13.16b
aesimc v0.16b,v0.16b
sha256su1 v27.4s,v29.4s,v26.4s
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su0 v28.4s,v29.4s
aesd v0.16b,v14.16b
aesimc v0.16b,v0.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
aesd v0.16b,v15.16b
aesimc v0.16b,v0.16b
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
___
&aes192_aes256_dec_handle(1,"dec_final2",0,0);
$code.=<<___;
sha256su1 v29.4s,v27.4s,v28.4s
eor v3.16b,v0.16b,v31.16b /* xor w/ ivec (modeop) */
sub x13,x13,1 /* dec counter */
/* save aes res, bump aes_out_ptr */
st1 {v3.16b},[x1],16
cbz x13,.Lfrmquad2
/* aes xform 2 */
/* read first aes block, bump aes_ptr_in */
ld1 {v0.16b},[x0],16
ld1 {v4.16b},[x8],16 /* key4 */
ld1 {v5.16b},[x8],16 /* key5 */
ld1 {v6.16b},[x8],16 /* key6 */
ld1 {v7.16b},[x8],16 /* key7 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
aesd v0.16b,v8.16b
aesimc v0.16b,v0.16b
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
aesd v0.16b,v9.16b
aesimc v0.16b,v0.16b
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
aesd v0.16b,v10.16b
aesimc v0.16b,v0.16b
sha256h2 q23, q21, v4.4s
sha256su1 v26.4s,v28.4s,v29.4s
aesd v0.16b,v11.16b
aesimc v0.16b,v0.16b
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
aesd v0.16b,v12.16b
aesimc v0.16b,v0.16b
sha256h2 q23, q21, v5.4s
sha256su1 v27.4s,v29.4s,v26.4s
aesd v0.16b,v13.16b
aesimc v0.16b,v0.16b
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
aesd v0.16b,v14.16b
aesimc v0.16b,v0.16b
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
aesd v0.16b,v15.16b
aesimc v0.16b,v0.16b
sha256su1 v28.4s,v26.4s,v27.4s
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
___
&aes192_aes256_dec_handle(1,"dec_final3",0,0);
$code.=<<___;
sha256h2 q23, q21, v7.4s
sha256su1 v29.4s,v27.4s,v28.4s
eor v3.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */
/* save aes res, bump aes_out_ptr */
st1 {v3.16b},[x1],16
b .Lfrmquad3
/*
* the final block with no aes component, i.e. from here on there are zero aes blocks
*/
.Lzero_aes_blocks_left:
/* base address for sha round consts */
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
ld1 {v4.16b},[x8],16 /* key0 */
ld1 {v5.16b},[x8],16 /* key1 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x8],16 /* key2 */
sha256h2 q23, q21, v4.4s
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x8],16 /* key3 */
sha256h2 q23, q21, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 1 */
.Lfrmquad1:
ld1 {v4.16b},[x8],16 /* key4 */
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
sha256h2 q23, q21, v4.4s
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
sha256h2 q23, q21, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 2 */
.Lfrmquad2:
ld1 {v4.16b},[x8],16 /* key4 */
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
sha256h2 q23, q21, v4.4s
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
sha256h2 q23, q21, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 3 */
.Lfrmquad3:
ld1 {v4.16b},[x8],16 /* key4 */
ld1 {v5.16b},[x8],16 /* key5 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256h2 q23, q21, v4.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
eor v26.16b,v26.16b,v26.16b /* zero reg */
sha256h2 q23, q21, v5.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
eor v27.16b,v27.16b,v27.16b /* zero reg */
sha256h2 q23, q21, v6.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
eor v28.16b,v28.16b,v28.16b /* zero reg */
sha256h2 q23, q21, v7.4s
add v26.4s,v24.4s,v22.4s /* ABCD += working copy */
eor v29.16b,v29.16b,v29.16b /* zero reg */
add v27.4s,v25.4s,v23.4s /* EFGH += working copy */
/*
* Calculate final HMAC
*/
/* base address for sha round consts */
adrp x8,.Lrcon
add x8,x8,:lo12:.Lrcon
/* load o_key_pad partial hash */
ld1 {v24.16b},[x7],16
ld1 {v25.16b},[x7]
mov v22.16b,v24.16b /* working ABCD <- ABCD */
mov v23.16b,v25.16b /* working EFGH <- EFGH */
/* Set padding 1 to the first reg */
mov w11, #0x80 /* that's the 1 of the pad */
mov v28.b[3], w11
/* size of o_key_pad + inner hash */
mov x11, #64+32
lsl x11, x11, 3
/* move length to the end of the block */
mov v29.s[3], w11
lsr x11, x11, 32
mov v29.s[2], w11 /* and the higher part */
ld1 {v4.16b},[x8],16 /* key0 */
ld1 {v5.16b},[x8],16 /* key1 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x8],16 /* key2 */
sha256h2 q23, q21, v4.4s
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x8],16 /* key3 */
sha256h2 q23, q21, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
ld1 {v4.16b},[x8],16 /* key4 */
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
ld1 {v5.16b},[x8],16 /* key5 */
sha256h2 q23, q21, v7.4s
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x8],16 /* key6 */
sha256h2 q23, q21, v4.4s
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x8],16 /* key7 */
sha256h2 q23, q21, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
ld1 {v4.16b},[x8],16 /* key8 */
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
ld1 {v5.16b},[x8],16 /* key9 */
sha256h2 q23, q21, v7.4s
add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x8],16 /* key10 */
sha256h2 q23, q21, v4.4s
add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x8],16 /* key11 */
sha256h2 q23, q21, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
ld1 {v4.16b},[x8],16 /* key12 */
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
ld1 {v5.16b},[x8],16 /* key13 */
sha256h2 q23, q21, v7.4s
add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x8],16 /* key14 */
add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */
sha256h2 q23, q21, v4.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x8],16 /* key15 */
add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */
sha256h2 q23, q21, v5.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */
ldp d10,d11,[sp,#16]
sha256h2 q23, q21, v6.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
ldp d12,d13,[sp,#32]
sha256h2 q23, q21, v7.4s
add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
ldp d14,d15,[sp,#48]
add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
rev32 v24.16b, v24.16b
ldp x19,x20,[sp,#64]
ldp d8,d9,[sp],#80
rev32 v25.16b, v25.16b
st1 {v24.4s},[x4],16
mov x0, xzr
st1 {v25.4s},[x4]
ret
/*
* These are the short cases (less efficient), here used for 1-15 aes blocks.
* x10 = aes_blocks
*/
.Ldec_short_cases:
ldp q8,q9,[x9],32
adrp x8,.Lrcon /* rcon */
add x8,x8,:lo12:.Lrcon
ldp q10,q11,[x9],32
lsl x11,x10,4 /* len=aes_blocks*16 */
ldp q12,q13,[x9],32
ldp q14,q15,[x9],32
ld1 {v30.16b},[x6] /* get ivec */
ldp q16,q17,[x9],32
ld1 {v18.16b},[x9]
/* get outstanding bytes of the digest */
sub x20,x5,x2
/* indicate AES blocks to write back */
mov x19,xzr
mov x2,x0
/*
* The digest source has to be at least as long as the cipher source,
* therefore it is safe to use x10 to indicate whether we can
* overtake cipher processing by 4 AES blocks here.
*/
cmp x10,4 /* check if 4 or more */
/* if less, bail to last block */
blt .Llast_sha_block
sub x5,x5,64
mov x9,x8 /* top of rcon */
/* quad 0 */
ld1 {v26.16b},[x3],16
ld1 {v4.16b},[x9],16 /* key0 */
ld1 {v27.16b},[x3],16
rev32 v26.16b,v26.16b
ld1 {v28.16b},[x3],16
rev32 v27.16b,v27.16b
ld1 {v29.16b},[x3],16
rev32 v28.16b,v28.16b
ld1 {v5.16b},[x9],16 /* key1 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
rev32 v29.16b,v29.16b
sha256su0 v26.4s,v27.4s
mov v22.16b,v24.16b /* working ABCD <- ABCD */
mov v23.16b,v25.16b /* working EFGH <- EFGH */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x9],16 /* key2 */
sha256h2 q23, q21, v4.4s
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x9],16 /* key3 */
sha256h2 q23, q21, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
ld1 {v4.16b},[x9],16 /* key4 */
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
ld1 {v5.16b},[x9],16 /* key5 */
sha256h2 q23, q21, v7.4s
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 1 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x9],16 /* key6 */
sha256h2 q23, q21, v4.4s
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x9],16 /* key7 */
sha256h2 q23, q21, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
ld1 {v4.16b},[x9],16 /* key4 */
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
ld1 {v5.16b},[x9],16 /* key5 */
sha256h2 q23, q21, v7.4s
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 2 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x9],16 /* key6 */
sha256h2 q23, q21, v4.4s
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x9],16 /* key7 */
sha256h2 q23, q21, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
ld1 {v4.16b},[x9],16 /* key4 */
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
ld1 {v5.16b},[x9],16 /* key5 */
sha256h2 q23, q21, v7.4s
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 3 */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x9],16 /* key6 */
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256h2 q23, q21, v4.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x9],16 /* key7 */
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256h2 q23, q21, v5.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256h2 q23, q21, v6.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
/* there were at least 4 AES blocks to process */
b .Lshort_loop_no_store
.Ldec_short_loop:
cmp x10,4 /* check if 4 or more */
/* if less, bail to last block */
blt .Llast_sha_block
stp q0,q1,[x1],32
stp q2,q3,[x1],32
sub x19,x19,4
.Lshort_loop_no_store:
ld1 {v31.16b},[x2] /* next w no update */
/* read next aes block, update aes_ptr_in */
ld1 {v0.16b},[x2],16
add x0,x0,64
/* aes xform 0 */
aesd v0.16b,v8.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v9.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v10.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v11.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v12.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v13.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v14.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v15.16b
aesimc v0.16b,v0.16b
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
___
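# key-size dependent final rounds for block 0 (AES-128/192/256)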
&aes192_aes256_dec_handle(1,"dec_short",0,0);
$code.=<<___;
eor v0.16b,v0.16b,v30.16b /* xor w/prev value */
ld1 {v30.16b},[x2] /* read no update */
/* read next aes block, update aes_ptr_in */
ld1 {v1.16b},[x2],16
/* aes xform 1 */
aesd v1.16b,v8.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v9.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v10.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v11.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v12.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v13.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v14.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v15.16b
aesimc v1.16b,v1.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
___
&aes192_aes256_dec_handle(1,"dec_short",1,0);
$code.=<<___;
eor v1.16b,v1.16b,v31.16b /* xor w/prev value */
ld1 {v31.16b},[x2] /* read no update */
/* read next aes block, update aes_ptr_in */
ld1 {v2.16b},[x2],16
/* aes xform 2 */
aesd v2.16b,v8.16b
aesimc v2.16b,v2.16b
aesd v2.16b,v9.16b
aesimc v2.16b,v2.16b
aesd v2.16b,v10.16b
aesimc v2.16b,v2.16b
aesd v2.16b,v11.16b
aesimc v2.16b,v2.16b
aesd v2.16b,v12.16b
aesimc v2.16b,v2.16b
aesd v2.16b,v13.16b
aesimc v2.16b,v2.16b
aesd v2.16b,v14.16b
aesimc v2.16b,v2.16b
aesd v2.16b,v15.16b
aesimc v2.16b,v2.16b
aesd v2.16b,v16.16b
aesimc v2.16b,v2.16b
___
&aes192_aes256_dec_handle(1,"dec_short",2,0);
$code.=<<___;
eor v2.16b,v2.16b,v30.16b /* xor w/prev value */
ld1 {v30.16b},[x2] /* read no update */
/* read next aes block, update aes_ptr_in */
ld1 {v3.16b},[x2],16
/* aes xform 3 */
aesd v3.16b,v8.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v9.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v10.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v11.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v12.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v13.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v14.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v15.16b
aesimc v3.16b,v3.16b
aesd v3.16b,v16.16b
aesimc v3.16b,v3.16b
___
&aes192_aes256_dec_handle(1,"dec_short",3,0);
$code.=<<___;
eor v3.16b,v3.16b,v31.16b /* xor w/prev value */
add x19,x19,4 /* 4 more blocks pending store */
sub x10,x10,4 /* 4 less */
cmp x5,64 /* a full 64-byte block left to hash? */
b.lt .Ldec_short_loop /* no - keep decrypting */
sub x5,x5,64 /* 64 bytes of hash input consumed */
mov x9,x8 /* top of rcon */
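/*
 * One 64-byte SHA-256 block over the hash input at x3: message
 * schedule in v26-v29, round constants from the rcon table via x9,
 * working state in v22/v23, running digest in v24/v25.
 */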
/* quad 0 */
ld1 {v26.16b},[x3],16
ld1 {v4.16b},[x9],16 /* key0 */
ld1 {v27.16b},[x3],16
rev32 v26.16b,v26.16b
ld1 {v28.16b},[x3],16
rev32 v27.16b,v27.16b
ld1 {v29.16b},[x3],16
rev32 v28.16b,v28.16b
ld1 {v5.16b},[x9],16 /* key1 */
add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
rev32 v29.16b,v29.16b
sha256su0 v26.4s,v27.4s
mov v22.16b,v24.16b /* working ABCD <- ABCD */
mov v23.16b,v25.16b /* working EFGH <- EFGH */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x9],16 /* key2 */
sha256h2 q23, q21, v4.4s
add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x9],16 /* key3 */
sha256h2 q23, q21, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
ld1 {v4.16b},[x9],16 /* key4 */
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
ld1 {v5.16b},[x9],16 /* key5 */
sha256h2 q23, q21, v7.4s
add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 1 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x9],16 /* key6 */
sha256h2 q23, q21, v4.4s
add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x9],16 /* key7 */
sha256h2 q23, q21, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
ld1 {v4.16b},[x9],16 /* key8 */
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
ld1 {v5.16b},[x9],16 /* key9 */
add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 2 */
sha256su0 v26.4s,v27.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x9],16 /* key10 */
sha256h2 q23, q21, v4.4s
add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */
sha256su1 v26.4s,v28.4s,v29.4s
sha256su0 v27.4s,v28.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x9],16 /* key11 */
sha256h2 q23, q21, v5.4s
add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */
sha256su1 v27.4s,v29.4s,v26.4s
sha256su0 v28.4s,v29.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
ld1 {v4.16b},[x9],16 /* key12 */
sha256h2 q23, q21, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */
sha256su1 v28.4s,v26.4s,v27.4s
sha256su0 v29.4s,v26.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
ld1 {v5.16b},[x9],16 /* key13 */
sha256h2 q23, q21, v7.4s
add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */
sha256su1 v29.4s,v27.4s,v28.4s
/* quad 3 */
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v4.4s
ld1 {v6.16b},[x9],16 /* key14 */
add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */
sha256h2 q23, q21, v4.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v5.4s
ld1 {v7.16b},[x9],16 /* key15 */
add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */
sha256h2 q23, q21, v5.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v6.4s
add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */
sha256h2 q23, q21, v6.4s
mov v21.16b, v22.16b /* copy abcd */
sha256h q22, q23, v7.4s
sha256h2 q23, q21, v7.4s
add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
b .Ldec_short_loop /* keep looping */
/*
* This is arranged so that we can join the common unwind code that does
* the last sha block and the final 0-3 aes blocks.
*/
.Llast_sha_block:
eor v26.16b,v26.16b,v26.16b /* zero the rest */
eor v27.16b,v27.16b,v27.16b /* zero the rest */
eor v28.16b,v28.16b,v28.16b /* zero the rest */
eor v29.16b,v29.16b,v29.16b /* zero the rest */
mov x13,x10 /* copy aes blocks for common */
b .Ljoin_common /* join common code */
.size asm_sha256_hmac_aescbc_dec, .-asm_sha256_hmac_aescbc_dec
___
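# for 64-bit flavours, expand backquoted expressions and emit the generated assembly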
if ($flavour =~ /64/) {
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
print $_,"\n";
}
}
close STDOUT or die "error closing STDOUT: $!";