mirror of https://github.com/openssl/openssl.git
1533 lines
26 KiB
Perl
Executable File
1533 lines
26 KiB
Perl
Executable File
#! /usr/bin/env perl
|
|
# Copyright 2023-2025 The OpenSSL Project Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License 2.0 (the "License"). You may not use
|
|
# this file except in compliance with the License. You can obtain a copy
|
|
# in the file LICENSE in the source distribution or at
|
|
# https://www.openssl.org/source/license.html
|
|
#
|
|
# ====================================================================
|
|
# Written by Danny Tsen <dtsen@us.ibm.com> # for the OpenSSL project.
|
|
#
|
|
# Copyright 2025- IBM Corp.
|
|
# ====================================================================
|
|
#
|
|
# p384 lower-level primitives for PPC64.
|
|
#
|
|
|
|
|
|
use strict;
|
|
use warnings;
|
|
|
|
my $flavour = shift;
|
|
my $output = "";
|
|
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
|
|
if (!$output) {
|
|
$output = "-";
|
|
}
|
|
|
|
my ($xlate, $dir);
|
|
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
|
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
|
|
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
|
|
die "can't locate ppc-xlate.pl";
|
|
|
|
open OUT,"| \"$^X\" $xlate $flavour $output";
|
|
*STDOUT=*OUT;
|
|
|
|
my $code = "";
|
|
|
|
$code.=<<___;
|
|
.machine "any"
|
|
.text
|
|
|
|
.globl p384_felem_mul
|
|
.type p384_felem_mul,\@function
|
|
.align 4
|
|
p384_felem_mul:
|
|
|
|
stdu 1, -176(1)
|
|
mflr 0
|
|
std 14, 56(1)
|
|
std 15, 64(1)
|
|
std 16, 72(1)
|
|
std 17, 80(1)
|
|
std 18, 88(1)
|
|
std 19, 96(1)
|
|
std 20, 104(1)
|
|
std 21, 112(1)
|
|
std 22, 120(1)
|
|
|
|
bl _p384_felem_mul_core
|
|
|
|
mtlr 0
|
|
ld 14, 56(1)
|
|
ld 15, 64(1)
|
|
ld 16, 72(1)
|
|
ld 17, 80(1)
|
|
ld 18, 88(1)
|
|
ld 19, 96(1)
|
|
ld 20, 104(1)
|
|
ld 21, 112(1)
|
|
ld 22, 120(1)
|
|
addi 1, 1, 176
|
|
blr
|
|
.size p384_felem_mul,.-p384_felem_mul
|
|
|
|
.globl p384_felem_square
|
|
.type p384_felem_square,\@function
|
|
.align 4
|
|
p384_felem_square:
|
|
|
|
stdu 1, -176(1)
|
|
mflr 0
|
|
std 14, 56(1)
|
|
std 15, 64(1)
|
|
std 16, 72(1)
|
|
std 17, 80(1)
|
|
|
|
bl _p384_felem_square_core
|
|
|
|
mtlr 0
|
|
ld 14, 56(1)
|
|
ld 15, 64(1)
|
|
ld 16, 72(1)
|
|
ld 17, 80(1)
|
|
addi 1, 1, 176
|
|
blr
|
|
.size p384_felem_square,.-p384_felem_square
|
|
|
|
#
|
|
# Felem mul core function -
|
|
# r3, r4 and r5 need to pre-loaded.
|
|
#
|
|
.type _p384_felem_mul_core,\@function
|
|
.align 4
|
|
_p384_felem_mul_core:
|
|
|
|
ld 6,0(4)
|
|
ld 14,0(5)
|
|
ld 7,8(4)
|
|
ld 15,8(5)
|
|
ld 8,16(4)
|
|
ld 16,16(5)
|
|
ld 9,24(4)
|
|
ld 17,24(5)
|
|
ld 10,32(4)
|
|
ld 18,32(5)
|
|
ld 11,40(4)
|
|
ld 19,40(5)
|
|
ld 12,48(4)
|
|
ld 20,48(5)
|
|
|
|
# out0
|
|
mulld 21, 14, 6
|
|
mulhdu 22, 14, 6
|
|
std 21, 0(3)
|
|
std 22, 8(3)
|
|
|
|
vxor 0, 0, 0
|
|
|
|
# out1
|
|
mtvsrdd 32+13, 14, 6
|
|
mtvsrdd 32+14, 7, 15
|
|
vmsumudm 1, 13, 14, 0
|
|
|
|
# out2
|
|
mtvsrdd 32+15, 15, 6
|
|
mtvsrdd 32+16, 7, 16
|
|
mtvsrdd 32+17, 0, 8
|
|
mtvsrdd 32+18, 0, 14
|
|
vmsumudm 19, 15, 16, 0
|
|
vmsumudm 2, 17, 18, 19
|
|
|
|
# out3
|
|
mtvsrdd 32+13, 16, 6
|
|
mtvsrdd 32+14, 7, 17
|
|
mtvsrdd 32+15, 14, 8
|
|
mtvsrdd 32+16, 9, 15
|
|
vmsumudm 19, 13, 14, 0
|
|
vmsumudm 3, 15, 16, 19
|
|
|
|
# out4
|
|
mtvsrdd 32+13, 17, 6
|
|
mtvsrdd 32+14, 7, 18
|
|
mtvsrdd 32+15, 15, 8
|
|
mtvsrdd 32+16, 9, 16
|
|
mtvsrdd 32+17, 0, 10
|
|
mtvsrdd 32+18, 0, 14
|
|
vmsumudm 19, 13, 14, 0
|
|
vmsumudm 4, 15, 16, 19
|
|
vmsumudm 4, 17, 18, 4
|
|
|
|
# out5
|
|
mtvsrdd 32+13, 18, 6
|
|
mtvsrdd 32+14, 7, 19
|
|
mtvsrdd 32+15, 16, 8
|
|
mtvsrdd 32+16, 9, 17
|
|
mtvsrdd 32+17, 14, 10
|
|
mtvsrdd 32+18, 11, 15
|
|
vmsumudm 19, 13, 14, 0
|
|
vmsumudm 5, 15, 16, 19
|
|
vmsumudm 5, 17, 18, 5
|
|
|
|
stxv 32+1, 16(3)
|
|
stxv 32+2, 32(3)
|
|
stxv 32+3, 48(3)
|
|
stxv 32+4, 64(3)
|
|
stxv 32+5, 80(3)
|
|
|
|
# out6
|
|
mtvsrdd 32+13, 19, 6
|
|
mtvsrdd 32+14, 7, 20
|
|
mtvsrdd 32+15, 17, 8
|
|
mtvsrdd 32+16, 9, 18
|
|
mtvsrdd 32+17, 15, 10
|
|
mtvsrdd 32+18, 11, 16
|
|
vmsumudm 19, 13, 14, 0
|
|
vmsumudm 6, 15, 16, 19
|
|
mtvsrdd 32+13, 0, 12
|
|
mtvsrdd 32+14, 0, 14
|
|
vmsumudm 19, 17, 18, 6
|
|
vmsumudm 6, 13, 14, 19
|
|
|
|
# out7
|
|
mtvsrdd 32+13, 19, 7
|
|
mtvsrdd 32+14, 8, 20
|
|
mtvsrdd 32+15, 17, 9
|
|
mtvsrdd 32+16, 10, 18
|
|
mtvsrdd 32+17, 15, 11
|
|
mtvsrdd 32+18, 12, 16
|
|
vmsumudm 19, 13, 14, 0
|
|
vmsumudm 7, 15, 16, 19
|
|
vmsumudm 7, 17, 18, 7
|
|
|
|
# out8
|
|
mtvsrdd 32+13, 19, 8
|
|
mtvsrdd 32+14, 9, 20
|
|
mtvsrdd 32+15, 17, 10
|
|
mtvsrdd 32+16, 11, 18
|
|
mtvsrdd 32+17, 0, 12
|
|
mtvsrdd 32+18, 0, 16
|
|
vmsumudm 19, 13, 14, 0
|
|
vmsumudm 8, 15, 16, 19
|
|
vmsumudm 8, 17, 18, 8
|
|
|
|
# out9
|
|
mtvsrdd 32+13, 19, 9
|
|
mtvsrdd 32+14, 10, 20
|
|
mtvsrdd 32+15, 17, 11
|
|
mtvsrdd 32+16, 12, 18
|
|
vmsumudm 19, 13, 14, 0
|
|
vmsumudm 9, 15, 16, 19
|
|
|
|
# out10
|
|
mtvsrdd 32+13, 19, 10
|
|
mtvsrdd 32+14, 11, 20
|
|
mtvsrdd 32+15, 0, 12
|
|
mtvsrdd 32+16, 0, 18
|
|
vmsumudm 19, 13, 14, 0
|
|
vmsumudm 10, 15, 16, 19
|
|
|
|
# out11
|
|
mtvsrdd 32+17, 19, 11
|
|
mtvsrdd 32+18, 12, 20
|
|
vmsumudm 11, 17, 18, 0
|
|
|
|
stxv 32+6, 96(3)
|
|
stxv 32+7, 112(3)
|
|
stxv 32+8, 128(3)
|
|
stxv 32+9, 144(3)
|
|
stxv 32+10, 160(3)
|
|
stxv 32+11, 176(3)
|
|
|
|
# out12
|
|
mulld 21, 20, 12
|
|
mulhdu 22, 20, 12 # out12
|
|
|
|
std 21, 192(3)
|
|
std 22, 200(3)
|
|
|
|
blr
|
|
.size _p384_felem_mul_core,.-_p384_felem_mul_core
|
|
|
|
#
|
|
# Felem square core function -
|
|
# r3 and r4 need to pre-loaded.
|
|
#
|
|
.type _p384_felem_square_core,\@function
|
|
.align 4
|
|
_p384_felem_square_core:
|
|
|
|
ld 6, 0(4)
|
|
ld 7, 8(4)
|
|
ld 8, 16(4)
|
|
ld 9, 24(4)
|
|
ld 10, 32(4)
|
|
ld 11, 40(4)
|
|
ld 12, 48(4)
|
|
|
|
vxor 0, 0, 0
|
|
|
|
# out0
|
|
mulld 14, 6, 6
|
|
mulhdu 15, 6, 6
|
|
std 14, 0(3)
|
|
std 15, 8(3)
|
|
|
|
# out1
|
|
add 14, 6, 6
|
|
mtvsrdd 32+13, 0, 14
|
|
mtvsrdd 32+14, 0, 7
|
|
vmsumudm 1, 13, 14, 0
|
|
|
|
# out2
|
|
mtvsrdd 32+15, 7, 14
|
|
mtvsrdd 32+16, 7, 8
|
|
vmsumudm 2, 15, 16, 0
|
|
|
|
# out3
|
|
add 15, 7, 7
|
|
mtvsrdd 32+13, 8, 14
|
|
mtvsrdd 32+14, 15, 9
|
|
vmsumudm 3, 13, 14, 0
|
|
|
|
# out4
|
|
mtvsrdd 32+13, 9, 14
|
|
mtvsrdd 32+14, 15, 10
|
|
mtvsrdd 32+15, 0, 8
|
|
vmsumudm 4, 13, 14, 0
|
|
vmsumudm 4, 15, 15, 4
|
|
|
|
# out5
|
|
mtvsrdd 32+13, 10, 14
|
|
mtvsrdd 32+14, 15, 11
|
|
add 16, 8, 8
|
|
mtvsrdd 32+15, 0, 16
|
|
mtvsrdd 32+16, 0, 9
|
|
vmsumudm 5, 13, 14, 0
|
|
vmsumudm 5, 15, 16, 5
|
|
|
|
stxv 32+1, 16(3)
|
|
stxv 32+2, 32(3)
|
|
stxv 32+3, 48(3)
|
|
stxv 32+4, 64(3)
|
|
|
|
# out6
|
|
mtvsrdd 32+13, 11, 14
|
|
mtvsrdd 32+14, 15, 12
|
|
mtvsrdd 32+15, 9, 16
|
|
mtvsrdd 32+16, 9, 10
|
|
stxv 32+5, 80(3)
|
|
vmsumudm 19, 13, 14, 0
|
|
vmsumudm 6, 15, 16, 19
|
|
|
|
# out7
|
|
add 17, 9, 9
|
|
mtvsrdd 32+13, 11, 15
|
|
mtvsrdd 32+14, 16, 12
|
|
mtvsrdd 32+15, 0, 17
|
|
mtvsrdd 32+16, 0, 10
|
|
vmsumudm 19, 13, 14, 0
|
|
vmsumudm 7, 15, 16, 19
|
|
|
|
# out8
|
|
mtvsrdd 32+13, 11, 16
|
|
mtvsrdd 32+14, 17, 12
|
|
mtvsrdd 32+15, 0, 10
|
|
vmsumudm 19, 13, 14, 0
|
|
vmsumudm 8, 15, 15, 19
|
|
|
|
# out9
|
|
add 14, 10, 10
|
|
mtvsrdd 32+13, 11, 17
|
|
mtvsrdd 32+14, 14, 12
|
|
vmsumudm 9, 13, 14, 0
|
|
|
|
# out10
|
|
mtvsrdd 32+13, 11, 14
|
|
mtvsrdd 32+14, 11, 12
|
|
vmsumudm 10, 13, 14, 0
|
|
|
|
stxv 32+6, 96(3)
|
|
stxv 32+7, 112(3)
|
|
|
|
# out11
|
|
#add 14, 11, 11
|
|
#mtvsrdd 32+13, 0, 14
|
|
#mtvsrdd 32+14, 0, 12
|
|
#vmsumudm 11, 13, 14, 0
|
|
|
|
mulld 6, 12, 11
|
|
mulhdu 7, 12, 11
|
|
addc 8, 6, 6
|
|
adde 9, 7, 7
|
|
|
|
stxv 32+8, 128(3)
|
|
stxv 32+9, 144(3)
|
|
stxv 32+10, 160(3)
|
|
#stxv 32+11, 176(3)
|
|
|
|
# out12
|
|
mulld 14, 12, 12
|
|
mulhdu 15, 12, 12
|
|
|
|
std 8, 176(3)
|
|
std 9, 184(3)
|
|
std 14, 192(3)
|
|
std 15, 200(3)
|
|
|
|
blr
|
|
.size _p384_felem_square_core,.-_p384_felem_square_core
|
|
|
|
#
|
|
# widefelem (128 bits) * 8
|
|
#
|
|
.macro F128_X_8 _off1 _off2
|
|
ld 9,\\_off1(3)
|
|
ld 8,\\_off2(3)
|
|
srdi 10,9,61
|
|
rldimi 10,8,3,0
|
|
sldi 9,9,3
|
|
std 9,\\_off1(3)
|
|
std 10,\\_off2(3)
|
|
.endm
|
|
|
|
.globl p384_felem128_mul_by_8
|
|
.type p384_felem128_mul_by_8, \@function
|
|
.align 4
|
|
p384_felem128_mul_by_8:
|
|
|
|
F128_X_8 0, 8
|
|
|
|
F128_X_8 16, 24
|
|
|
|
F128_X_8 32, 40
|
|
|
|
F128_X_8 48, 56
|
|
|
|
F128_X_8 64, 72
|
|
|
|
F128_X_8 80, 88
|
|
|
|
F128_X_8 96, 104
|
|
|
|
F128_X_8 112, 120
|
|
|
|
F128_X_8 128, 136
|
|
|
|
F128_X_8 144, 152
|
|
|
|
F128_X_8 160, 168
|
|
|
|
F128_X_8 176, 184
|
|
|
|
F128_X_8 192, 200
|
|
|
|
blr
|
|
.size p384_felem128_mul_by_8,.-p384_felem128_mul_by_8
|
|
|
|
#
|
|
# widefelem (128 bits) * 2
|
|
#
|
|
.macro F128_X_2 _off1 _off2
|
|
ld 9,\\_off1(3)
|
|
ld 8,\\_off2(3)
|
|
srdi 10,9,63
|
|
rldimi 10,8,1,0
|
|
sldi 9,9,1
|
|
std 9,\\_off1(3)
|
|
std 10,\\_off2(3)
|
|
.endm
|
|
|
|
.globl p384_felem128_mul_by_2
|
|
.type p384_felem128_mul_by_2, \@function
|
|
.align 4
|
|
p384_felem128_mul_by_2:
|
|
|
|
F128_X_2 0, 8
|
|
|
|
F128_X_2 16, 24
|
|
|
|
F128_X_2 32, 40
|
|
|
|
F128_X_2 48, 56
|
|
|
|
F128_X_2 64, 72
|
|
|
|
F128_X_2 80, 88
|
|
|
|
F128_X_2 96, 104
|
|
|
|
F128_X_2 112, 120
|
|
|
|
F128_X_2 128, 136
|
|
|
|
F128_X_2 144, 152
|
|
|
|
F128_X_2 160, 168
|
|
|
|
F128_X_2 176, 184
|
|
|
|
F128_X_2 192, 200
|
|
|
|
blr
|
|
.size p384_felem128_mul_by_2,.-p384_felem128_mul_by_2
|
|
|
|
.globl p384_felem_diff128
|
|
.type p384_felem_diff128, \@function
|
|
.align 4
|
|
p384_felem_diff128:
|
|
|
|
addis 5, 2, .LConst_two127\@toc\@ha
|
|
addi 5, 5, .LConst_two127\@toc\@l
|
|
|
|
ld 10, 0(3)
|
|
ld 8, 8(3)
|
|
li 9, 0
|
|
addc 10, 10, 9
|
|
li 7, -1
|
|
rldicr 7, 7, 0, 0 # two127
|
|
adde 8, 8, 7
|
|
ld 11, 0(4)
|
|
ld 12, 8(4)
|
|
subfc 11, 11, 10
|
|
subfe 12, 12, 8
|
|
std 11, 0(3) # out0
|
|
std 12, 8(3)
|
|
|
|
# two127m71 = (r10, r9)
|
|
ld 8, 16(3)
|
|
ld 7, 24(3)
|
|
ld 10, 24(5) # two127m71
|
|
addc 8, 8, 9
|
|
adde 7, 7, 10
|
|
ld 11, 16(4)
|
|
ld 12, 24(4)
|
|
subfc 11, 11, 8
|
|
subfe 12, 12, 7
|
|
std 11, 16(3) # out1
|
|
std 12, 24(3)
|
|
|
|
ld 8, 32(3)
|
|
ld 7, 40(3)
|
|
addc 8, 8, 9
|
|
adde 7, 7, 10
|
|
ld 11, 32(4)
|
|
ld 12, 40(4)
|
|
subfc 11, 11, 8
|
|
subfe 12, 12, 7
|
|
std 11, 32(3) # out2
|
|
std 12, 40(3)
|
|
|
|
ld 8, 48(3)
|
|
ld 7, 56(3)
|
|
addc 8, 8, 9
|
|
adde 7, 7, 10
|
|
ld 11, 48(4)
|
|
ld 12, 56(4)
|
|
subfc 11, 11, 8
|
|
subfe 12, 12, 7
|
|
std 11, 48(3) # out3
|
|
std 12, 56(3)
|
|
|
|
ld 8, 64(3)
|
|
ld 7, 72(3)
|
|
addc 8, 8, 9
|
|
adde 7, 7, 10
|
|
ld 11, 64(4)
|
|
ld 12, 72(4)
|
|
subfc 11, 11, 8
|
|
subfe 12, 12, 7
|
|
std 11, 64(3) # out4
|
|
std 12, 72(3)
|
|
|
|
ld 8, 80(3)
|
|
ld 7, 88(3)
|
|
addc 8, 8, 9
|
|
adde 7, 7, 10
|
|
ld 11, 80(4)
|
|
ld 12, 88(4)
|
|
subfc 11, 11, 8
|
|
subfe 12, 12, 7
|
|
std 11, 80(3) # out5
|
|
std 12, 88(3)
|
|
|
|
ld 8, 96(3)
|
|
ld 7, 104(3)
|
|
ld 6, 40(5) # two127p111m79m71
|
|
addc 8, 8, 9
|
|
adde 7, 7, 6
|
|
ld 11, 96(4)
|
|
ld 12, 104(4)
|
|
subfc 11, 11, 8
|
|
subfe 12, 12, 7
|
|
std 11, 96(3) # out6
|
|
std 12, 104(3)
|
|
|
|
ld 8, 112(3)
|
|
ld 7, 120(3)
|
|
ld 6, 56(5) # two127m119m71
|
|
addc 8, 8, 9
|
|
adde 7, 7, 6
|
|
ld 11, 112(4)
|
|
ld 12, 120(4)
|
|
subfc 11, 11, 8
|
|
subfe 12, 12, 7
|
|
std 11, 112(3) # out7
|
|
std 12, 120(3)
|
|
|
|
ld 8, 128(3)
|
|
ld 7, 136(3)
|
|
ld 6, 72(5) # two127m95m71
|
|
addc 8, 8, 9
|
|
adde 7, 7, 6
|
|
ld 11, 128(4)
|
|
ld 12, 136(4)
|
|
subfc 11, 11, 8
|
|
subfe 12, 12, 7
|
|
std 11, 128(3) # out8
|
|
std 12, 136(3)
|
|
|
|
ld 8, 144(3)
|
|
ld 7, 152(3)
|
|
addc 8, 8, 9
|
|
adde 7, 7, 10
|
|
ld 11, 144(4)
|
|
ld 12, 152(4)
|
|
subfc 11, 11, 8
|
|
subfe 12, 12, 7
|
|
std 11, 144(3) # out9
|
|
std 12, 152(3)
|
|
|
|
ld 8, 160(3)
|
|
ld 7, 168(3)
|
|
addc 8, 8, 9
|
|
adde 7, 7, 10
|
|
ld 11, 160(4)
|
|
ld 12, 168(4)
|
|
subfc 11, 11, 8
|
|
subfe 12, 12, 7
|
|
std 11, 160(3) # out10
|
|
std 12, 168(3)
|
|
|
|
ld 8, 176(3)
|
|
ld 7, 184(3)
|
|
addc 8, 8, 9
|
|
adde 7, 7, 10
|
|
ld 11, 176(4)
|
|
ld 12, 184(4)
|
|
subfc 11, 11, 8
|
|
subfe 12, 12, 7
|
|
std 11, 176(3) # out11
|
|
std 12, 184(3)
|
|
|
|
ld 8, 192(3)
|
|
ld 7, 200(3)
|
|
addc 8, 8, 9
|
|
adde 7, 7, 10
|
|
ld 11, 192(4)
|
|
ld 12, 200(4)
|
|
subfc 11, 11, 8
|
|
subfe 12, 12, 7
|
|
std 11, 192(3) # out12
|
|
std 12, 200(3)
|
|
|
|
blr
|
|
.size p384_felem_diff128,.-p384_felem_diff128
|
|
|
|
.data
|
|
.align 4
|
|
.LConst_two127:
|
|
#two127
|
|
.long 0x00000000, 0x00000000, 0x00000000, 0x80000000
|
|
#two127m71
|
|
.long 0x00000000, 0x00000000, 0xffffff80, 0x7fffffff
|
|
#two127p111m79m71
|
|
.long 0x00000000, 0x00000000, 0xffff7f80, 0x80007fff
|
|
#two127m119m71
|
|
.long 0x00000000, 0x00000000, 0xffffff80, 0x7f7fffff
|
|
#two127m95m71
|
|
.long 0x00000000, 0x00000000, 0x7fffff80, 0x7fffffff
|
|
|
|
.text
|
|
|
|
.globl p384_felem_diff_128_64
|
|
.type p384_felem_diff_128_64, \@function
|
|
.align 4
|
|
p384_felem_diff_128_64:
|
|
addis 5, 2, .LConst_128_two64\@toc\@ha
|
|
addi 5, 5, .LConst_128_two64\@toc\@l
|
|
|
|
ld 9, 0(3)
|
|
ld 10, 8(3)
|
|
ld 8, 48(5) # two64p48m16
|
|
li 7, 0
|
|
addc 9, 9, 8
|
|
li 6, 1
|
|
adde 10, 10, 6
|
|
ld 11, 0(4)
|
|
subfc 8, 11, 9
|
|
subfe 12, 7, 10
|
|
std 8, 0(3) # out0
|
|
std 12, 8(3)
|
|
|
|
ld 9, 16(3)
|
|
ld 10, 24(3)
|
|
ld 8, 0(5) # two64m56m8
|
|
addc 9, 9, 8
|
|
addze 10, 10
|
|
ld 11, 8(4)
|
|
subfc 11, 11, 9
|
|
subfe 12, 7, 10
|
|
std 11, 16(3) # out1
|
|
std 12, 24(3)
|
|
|
|
ld 9, 32(3)
|
|
ld 10, 40(3)
|
|
ld 8, 16(5) # two64m32m8
|
|
addc 9, 9, 8
|
|
addze 10, 10
|
|
ld 11, 16(4)
|
|
subfc 11, 11, 9
|
|
subfe 12, 7, 10
|
|
std 11, 32(3) # out2
|
|
std 12, 40(3)
|
|
|
|
ld 10, 48(3)
|
|
ld 8, 56(3)
|
|
#ld 9, 32(5) # two64m8
|
|
li 9, -256 # two64m8
|
|
addc 10, 10, 9
|
|
addze 8, 8
|
|
ld 11, 24(4)
|
|
subfc 11, 11, 10
|
|
subfe 12, 7, 8
|
|
std 11, 48(3) # out3
|
|
std 12, 56(3)
|
|
|
|
ld 10, 64(3)
|
|
ld 8, 72(3)
|
|
addc 10, 10, 9
|
|
addze 8, 8
|
|
ld 11, 32(4)
|
|
subfc 11, 11, 10
|
|
subfe 12, 7, 8
|
|
std 11, 64(3) # out4
|
|
std 12, 72(3)
|
|
|
|
ld 10, 80(3)
|
|
ld 8, 88(3)
|
|
addc 10, 10, 9
|
|
addze 8, 8
|
|
ld 11, 40(4)
|
|
subfc 11, 11, 10
|
|
subfe 12, 7, 8
|
|
std 11, 80(3) # out5
|
|
std 12, 88(3)
|
|
|
|
ld 10, 96(3)
|
|
ld 8, 104(3)
|
|
addc 10, 10, 9
|
|
addze 9, 8
|
|
ld 11, 48(4)
|
|
subfc 11, 11, 10
|
|
subfe 12, 7, 9
|
|
std 11, 96(3) # out6
|
|
std 12, 104(3)
|
|
|
|
blr
|
|
.size p384_felem_diff_128_64,.-p384_felem_diff_128_64
|
|
|
|
.data
|
|
.align 4
|
|
.LConst_128_two64:
|
|
#two64m56m8
|
|
.long 0xffffff00, 0xfeffffff, 0x00000000, 0x00000000
|
|
#two64m32m8
|
|
.long 0xffffff00, 0xfffffffe, 0x00000000, 0x00000000
|
|
#two64m8
|
|
.long 0xffffff00, 0xffffffff, 0x00000000, 0x00000000
|
|
#two64p48m16
|
|
.long 0xffff0000, 0x0000ffff, 0x00000001, 0x00000000
|
|
|
|
.LConst_two60:
|
|
#two60m52m4
|
|
.long 0xfffffff0, 0x0fefffff, 0x0, 0x0
|
|
#two60p44m12
|
|
.long 0xfffff000, 0x10000fff, 0x0, 0x0
|
|
#two60m28m4
|
|
.long 0xeffffff0, 0x0fffffff, 0x0, 0x0
|
|
#two60m4
|
|
.long 0xfffffff0, 0x0fffffff, 0x0, 0x0
|
|
|
|
.text
|
|
#
|
|
# static void felem_diff64(felem out, const felem in)
|
|
#
|
|
.globl p384_felem_diff64
|
|
.type p384_felem_diff64, \@function
|
|
.align 4
|
|
p384_felem_diff64:
|
|
addis 5, 2, .LConst_two60\@toc\@ha
|
|
addi 5, 5, .LConst_two60\@toc\@l
|
|
|
|
ld 9, 0(3)
|
|
ld 8, 16(5) # two60p44m12
|
|
li 7, 0
|
|
add 9, 9, 8
|
|
ld 11, 0(4)
|
|
subf 8, 11, 9
|
|
std 8, 0(3) # out0
|
|
|
|
ld 9, 8(3)
|
|
ld 8, 0(5) # two60m52m4
|
|
add 9, 9, 8
|
|
ld 11, 8(4)
|
|
subf 11, 11, 9
|
|
std 11, 8(3) # out1
|
|
|
|
ld 9, 16(3)
|
|
ld 8, 32(5) # two60m28m4
|
|
add 9, 9, 8
|
|
ld 11, 16(4)
|
|
subf 11, 11, 9
|
|
std 11, 16(3) # out2
|
|
|
|
ld 10, 24(3)
|
|
ld 9, 48(5) # two60m4
|
|
add 10, 10, 9
|
|
ld 12, 24(4)
|
|
subf 12, 12, 10
|
|
std 12, 24(3) # out3
|
|
|
|
ld 10, 32(3)
|
|
add 10, 10, 9
|
|
ld 11, 32(4)
|
|
subf 11, 11, 10
|
|
std 11, 32(3) # out4
|
|
|
|
ld 10, 40(3)
|
|
add 10, 10, 9
|
|
ld 12, 40(4)
|
|
subf 12, 12, 10
|
|
std 12, 40(3) # out5
|
|
|
|
ld 10, 48(3)
|
|
add 10, 10, 9
|
|
ld 11, 48(4)
|
|
subf 11, 11, 10
|
|
std 11, 48(3) # out6
|
|
|
|
blr
|
|
.size p384_felem_diff64,.-p384_felem_diff64
|
|
|
|
.text
|
|
#
|
|
# Shift 128 bits right <nbits>
|
|
#
|
|
.macro SHR o_h o_l in_h in_l nbits
|
|
srdi \\o_l, \\in_l, \\nbits # shift lower right <nbits>
|
|
rldimi \\o_l, \\in_h, 64-\\nbits, 0 # insert <64-nbits> from hi
|
|
srdi \\o_h, \\in_h, \\nbits # shift higher right <nbits>
|
|
.endm
|
|
|
|
#
|
|
# static void felem_reduce(felem out, const widefelem in)
|
|
#
|
|
.global p384_felem_reduce
|
|
.type p384_felem_reduce,\@function
|
|
.align 4
|
|
p384_felem_reduce:
|
|
|
|
stdu 1, -208(1)
|
|
mflr 0
|
|
std 14, 56(1)
|
|
std 15, 64(1)
|
|
std 16, 72(1)
|
|
std 17, 80(1)
|
|
std 18, 88(1)
|
|
std 19, 96(1)
|
|
std 20, 104(1)
|
|
std 21, 112(1)
|
|
std 22, 120(1)
|
|
std 23, 128(1)
|
|
std 24, 136(1)
|
|
std 25, 144(1)
|
|
std 26, 152(1)
|
|
std 27, 160(1)
|
|
std 28, 168(1)
|
|
std 29, 176(1)
|
|
std 30, 184(1)
|
|
std 31, 192(1)
|
|
|
|
bl _p384_felem_reduce_core
|
|
|
|
mtlr 0
|
|
ld 14, 56(1)
|
|
ld 15, 64(1)
|
|
ld 16, 72(1)
|
|
ld 17, 80(1)
|
|
ld 18, 88(1)
|
|
ld 19, 96(1)
|
|
ld 20, 104(1)
|
|
ld 21, 112(1)
|
|
ld 22, 120(1)
|
|
ld 23, 128(1)
|
|
ld 24, 136(1)
|
|
ld 25, 144(1)
|
|
ld 26, 152(1)
|
|
ld 27, 160(1)
|
|
ld 28, 168(1)
|
|
ld 29, 176(1)
|
|
ld 30, 184(1)
|
|
ld 31, 192(1)
|
|
addi 1, 1, 208
|
|
blr
|
|
.size p384_felem_reduce,.-p384_felem_reduce
|
|
|
|
#
|
|
# Felem reduction core function -
|
|
# r3 and r4 need to pre-loaded.
|
|
#
|
|
.type _p384_felem_reduce_core,\@function
|
|
.align 4
|
|
_p384_felem_reduce_core:
|
|
addis 12, 2, .LConst\@toc\@ha
|
|
addi 12, 12, .LConst\@toc\@l
|
|
|
|
# load constat p
|
|
ld 11, 8(12) # hi - two124m68
|
|
|
|
# acc[6] = in[6] + two124m68;
|
|
ld 26, 96(4) # in[6].l
|
|
ld 27, 96+8(4) # in[6].h
|
|
add 27, 27, 11
|
|
|
|
# acc[5] = in[5] + two124m68;
|
|
ld 24, 80(4) # in[5].l
|
|
ld 25, 80+8(4) # in[5].h
|
|
add 25, 25, 11
|
|
|
|
# acc[4] = in[4] + two124m68;
|
|
ld 22, 64(4) # in[4].l
|
|
ld 23, 64+8(4) # in[4].h
|
|
add 23, 23, 11
|
|
|
|
# acc[3] = in[3] + two124m68;
|
|
ld 20, 48(4) # in[3].l
|
|
ld 21, 48+8(4) # in[3].h
|
|
add 21, 21, 11
|
|
|
|
ld 11, 48+8(12) # hi - two124m92m68
|
|
|
|
# acc[2] = in[2] + two124m92m68;
|
|
ld 18, 32(4) # in[2].l
|
|
ld 19, 32+8(4) # in[2].h
|
|
add 19, 19, 11
|
|
|
|
ld 11, 16+8(12) # high - two124m116m68
|
|
|
|
# acc[1] = in[1] + two124m116m68;
|
|
ld 16, 16(4) # in[1].l
|
|
ld 17, 16+8(4) # in[1].h
|
|
add 17, 17, 11
|
|
|
|
ld 11, 32+8(12) # high - two124p108m76
|
|
|
|
# acc[0] = in[0] + two124p108m76;
|
|
ld 14, 0(4) # in[0].l
|
|
ld 15, 0+8(4) # in[0].h
|
|
add 15, 15, 11
|
|
|
|
# compute mask
|
|
li 7, -1
|
|
|
|
# Eliminate in[12]
|
|
|
|
# acc[8] += in[12] >> 32;
|
|
ld 5, 192(4) # in[12].l
|
|
ld 6, 192+8(4) # in[12].h
|
|
SHR 9, 10, 6, 5, 32
|
|
ld 30, 128(4) # in[8].l
|
|
ld 31, 136(4) # in[8].h
|
|
addc 30, 30, 10
|
|
adde 31, 31, 9
|
|
|
|
# acc[7] += (in[12] & 0xffffffff) << 24;
|
|
srdi 11, 7, 32 # 0xffffffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 24 # << 24
|
|
ld 28, 112(4) # in[7].l
|
|
ld 29, 120(4) # in[7].h
|
|
addc 28, 28, 11
|
|
addze 29, 29
|
|
|
|
# acc[7] += in[12] >> 8;
|
|
SHR 9, 10, 6, 5, 8
|
|
addc 28, 28, 10
|
|
adde 29, 29, 9
|
|
|
|
# acc[6] += (in[12] & 0xff) << 48;
|
|
andi. 11, 5, 0xff
|
|
sldi 11, 11, 48
|
|
addc 26, 26, 11
|
|
addze 27, 27
|
|
|
|
# acc[6] -= in[12] >> 16;
|
|
SHR 9, 10, 6, 5, 16
|
|
subfc 26, 10, 26
|
|
subfe 27, 9, 27
|
|
|
|
# acc[5] -= (in[12] & 0xffff) << 40;
|
|
srdi 11, 7, 48 # 0xffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 40 # << 40
|
|
li 9, 0
|
|
subfc 24, 11, 24
|
|
subfe 25, 9, 25
|
|
|
|
# acc[6] += in[12] >> 48;
|
|
SHR 9, 10, 6, 5, 48
|
|
addc 26, 26, 10
|
|
adde 27, 27, 9
|
|
|
|
# acc[5] += (in[12] & 0xffffffffffff) << 8;
|
|
srdi 11, 7, 16 # 0xffffffffffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 8 # << 8
|
|
addc 24, 24, 11
|
|
addze 25, 25
|
|
|
|
# Eliminate in[11]
|
|
|
|
# acc[7] += in[11] >> 32;
|
|
ld 5, 176(4) # in[11].l
|
|
ld 6, 176+8(4) # in[11].h
|
|
SHR 9, 10, 6, 5, 32
|
|
addc 28, 28, 10
|
|
adde 29, 29, 9
|
|
|
|
# acc[6] += (in[11] & 0xffffffff) << 24;
|
|
srdi 11, 7, 32 # 0xffffffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 24 # << 24
|
|
addc 26, 26, 11
|
|
addze 27, 27
|
|
|
|
# acc[6] += in[11] >> 8;
|
|
SHR 9, 10, 6, 5, 8
|
|
addc 26, 26, 10
|
|
adde 27, 27, 9
|
|
|
|
# acc[5] += (in[11] & 0xff) << 48;
|
|
andi. 11, 5, 0xff
|
|
sldi 11, 11, 48
|
|
addc 24, 24, 11
|
|
addze 25, 25
|
|
|
|
# acc[5] -= in[11] >> 16;
|
|
SHR 9, 10, 6, 5, 16
|
|
subfc 24, 10, 24
|
|
subfe 25, 9, 25
|
|
|
|
# acc[4] -= (in[11] & 0xffff) << 40;
|
|
srdi 11, 7, 48 # 0xffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 40 # << 40
|
|
li 9, 0
|
|
subfc 22, 11, 22
|
|
subfe 23, 9, 23
|
|
|
|
# acc[5] += in[11] >> 48;
|
|
SHR 9, 10, 6, 5, 48
|
|
addc 24, 24, 10
|
|
adde 25, 25, 9
|
|
|
|
# acc[4] += (in[11] & 0xffffffffffff) << 8;
|
|
srdi 11, 7, 16 # 0xffffffffffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 8 # << 8
|
|
addc 22, 22, 11
|
|
addze 23, 23
|
|
|
|
# Eliminate in[10]
|
|
|
|
# acc[6] += in[10] >> 32;
|
|
ld 5, 160(4) # in[10].l
|
|
ld 6, 160+8(4) # in[10].h
|
|
SHR 9, 10, 6, 5, 32
|
|
addc 26, 26, 10
|
|
adde 27, 27, 9
|
|
|
|
# acc[5] += (in[10] & 0xffffffff) << 24;
|
|
srdi 11, 7, 32 # 0xffffffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 24 # << 24
|
|
addc 24, 24, 11
|
|
addze 25, 25
|
|
|
|
# acc[5] += in[10] >> 8;
|
|
SHR 9, 10, 6, 5, 8
|
|
addc 24, 24, 10
|
|
adde 25, 25, 9
|
|
|
|
# acc[4] += (in[10] & 0xff) << 48;
|
|
andi. 11, 5, 0xff
|
|
sldi 11, 11, 48
|
|
addc 22, 22, 11
|
|
addze 23, 23
|
|
|
|
# acc[4] -= in[10] >> 16;
|
|
SHR 9, 10, 6, 5, 16
|
|
subfc 22, 10, 22
|
|
subfe 23, 9, 23
|
|
|
|
# acc[3] -= (in[10] & 0xffff) << 40;
|
|
srdi 11, 7, 48 # 0xffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 40 # << 40
|
|
li 9, 0
|
|
subfc 20, 11, 20
|
|
subfe 21, 9, 21
|
|
|
|
# acc[4] += in[10] >> 48;
|
|
SHR 9, 10, 6, 5, 48
|
|
addc 22, 22, 10
|
|
adde 23, 23, 9
|
|
|
|
# acc[3] += (in[10] & 0xffffffffffff) << 8;
|
|
srdi 11, 7, 16 # 0xffffffffffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 8 # << 8
|
|
addc 20, 20, 11
|
|
addze 21, 21
|
|
|
|
# Eliminate in[9]
|
|
|
|
# acc[5] += in[9] >> 32;
|
|
ld 5, 144(4) # in[9].l
|
|
ld 6, 144+8(4) # in[9].h
|
|
SHR 9, 10, 6, 5, 32
|
|
addc 24, 24, 10
|
|
adde 25, 25, 9
|
|
|
|
# acc[4] += (in[9] & 0xffffffff) << 24;
|
|
srdi 11, 7, 32 # 0xffffffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 24 # << 24
|
|
addc 22, 22, 11
|
|
addze 23, 23
|
|
|
|
# acc[4] += in[9] >> 8;
|
|
SHR 9, 10, 6, 5, 8
|
|
addc 22, 22, 10
|
|
adde 23, 23, 9
|
|
|
|
# acc[3] += (in[9] & 0xff) << 48;
|
|
andi. 11, 5, 0xff
|
|
sldi 11, 11, 48
|
|
addc 20, 20, 11
|
|
addze 21, 21
|
|
|
|
# acc[3] -= in[9] >> 16;
|
|
SHR 9, 10, 6, 5, 16
|
|
subfc 20, 10, 20
|
|
subfe 21, 9, 21
|
|
|
|
# acc[2] -= (in[9] & 0xffff) << 40;
|
|
srdi 11, 7, 48 # 0xffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 40 # << 40
|
|
li 9, 0
|
|
subfc 18, 11, 18
|
|
subfe 19, 9, 19
|
|
|
|
# acc[3] += in[9] >> 48;
|
|
SHR 9, 10, 6, 5, 48
|
|
addc 20, 20, 10
|
|
adde 21, 21, 9
|
|
|
|
# acc[2] += (in[9] & 0xffffffffffff) << 8;
|
|
srdi 11, 7, 16 # 0xffffffffffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 8 # << 8
|
|
addc 18, 18, 11
|
|
addze 19, 19
|
|
|
|
# Eliminate acc[8]
|
|
|
|
# acc[4] += acc[8] >> 32;
|
|
mr 5, 30 # acc[8].l
|
|
mr 6, 31 # acc[8].h
|
|
SHR 9, 10, 6, 5, 32
|
|
addc 22, 22, 10
|
|
adde 23, 23, 9
|
|
|
|
# acc[3] += (acc[8] & 0xffffffff) << 24;
|
|
srdi 11, 7, 32 # 0xffffffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 24 # << 24
|
|
addc 20, 20, 11
|
|
addze 21, 21
|
|
|
|
# acc[3] += acc[8] >> 8;
|
|
SHR 9, 10, 6, 5, 8
|
|
addc 20, 20, 10
|
|
adde 21, 21, 9
|
|
|
|
# acc[2] += (acc[8] & 0xff) << 48;
|
|
andi. 11, 5, 0xff
|
|
sldi 11, 11, 48
|
|
addc 18, 18, 11
|
|
addze 19, 19
|
|
|
|
# acc[2] -= acc[8] >> 16;
|
|
SHR 9, 10, 6, 5, 16
|
|
subfc 18, 10, 18
|
|
subfe 19, 9, 19
|
|
|
|
# acc[1] -= (acc[8] & 0xffff) << 40;
|
|
srdi 11, 7, 48 # 0xffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 40 # << 40
|
|
li 9, 0
|
|
subfc 16, 11, 16
|
|
subfe 17, 9, 17
|
|
|
|
#acc[2] += acc[8] >> 48;
|
|
SHR 9, 10, 6, 5, 48
|
|
addc 18, 18, 10
|
|
adde 19, 19, 9
|
|
|
|
# acc[1] += (acc[8] & 0xffffffffffff) << 8;
|
|
srdi 11, 7, 16 # 0xffffffffffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 8 # << 8
|
|
addc 16, 16, 11
|
|
addze 17, 17
|
|
|
|
# Eliminate acc[7]
|
|
|
|
# acc[3] += acc[7] >> 32;
|
|
mr 5, 28 # acc[7].l
|
|
mr 6, 29 # acc[7].h
|
|
SHR 9, 10, 6, 5, 32
|
|
addc 20, 20, 10
|
|
adde 21, 21, 9
|
|
|
|
# acc[2] += (acc[7] & 0xffffffff) << 24;
|
|
srdi 11, 7, 32 # 0xffffffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 24 # << 24
|
|
addc 18, 18, 11
|
|
addze 19, 19
|
|
|
|
# acc[2] += acc[7] >> 8;
|
|
SHR 9, 10, 6, 5, 8
|
|
addc 18, 18, 10
|
|
adde 19, 19, 9
|
|
|
|
# acc[1] += (acc[7] & 0xff) << 48;
|
|
andi. 11, 5, 0xff
|
|
sldi 11, 11, 48
|
|
addc 16, 16, 11
|
|
addze 17, 17
|
|
|
|
# acc[1] -= acc[7] >> 16;
|
|
SHR 9, 10, 6, 5, 16
|
|
subfc 16, 10, 16
|
|
subfe 17, 9, 17
|
|
|
|
# acc[0] -= (acc[7] & 0xffff) << 40;
|
|
srdi 11, 7, 48 # 0xffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 40 # << 40
|
|
li 9, 0
|
|
subfc 14, 11, 14
|
|
subfe 15, 9, 15
|
|
|
|
# acc[1] += acc[7] >> 48;
|
|
SHR 9, 10, 6, 5, 48
|
|
addc 16, 16, 10
|
|
adde 17, 17, 9
|
|
|
|
# acc[0] += (acc[7] & 0xffffffffffff) << 8;
|
|
srdi 11, 7, 16 # 0xffffffffffff
|
|
and 11, 11, 5
|
|
sldi 11, 11, 8 # << 8
|
|
addc 14, 14, 11
|
|
addze 15, 15
|
|
|
|
#
|
|
# Carry 4 -> 5 -> 6
|
|
#
|
|
# acc[5] += acc[4] >> 56;
|
|
# acc[4] &= 0x00ffffffffffffff;
|
|
SHR 9, 10, 23, 22, 56
|
|
addc 24, 24, 10
|
|
adde 25, 25, 9
|
|
srdi 11, 7, 8 # 0x00ffffffffffffff
|
|
and 22, 22, 11
|
|
li 23, 0
|
|
|
|
# acc[6] += acc[5] >> 56;
|
|
# acc[5] &= 0x00ffffffffffffff;
|
|
SHR 9, 10, 25, 24, 56
|
|
addc 26, 26, 10
|
|
adde 27, 27, 9
|
|
and 24, 24, 11
|
|
li 25, 0
|
|
|
|
# [3]: Eliminate high bits of acc[6] */
|
|
# temp = acc[6] >> 48;
|
|
# acc[6] &= 0x0000ffffffffffff;
|
|
SHR 31, 30, 27, 26, 48 # temp = acc[6] >> 48
|
|
srdi 11, 7, 16 # 0x0000ffffffffffff
|
|
and 26, 26, 11
|
|
li 27, 0
|
|
|
|
# temp < 2^80
|
|
# acc[3] += temp >> 40;
|
|
SHR 9, 10, 31, 30, 40
|
|
addc 20, 20, 10
|
|
adde 21, 21, 9
|
|
|
|
# acc[2] += (temp & 0xffffffffff) << 16;
|
|
srdi 11, 7, 24 # 0xffffffffff
|
|
and 10, 30, 11
|
|
sldi 10, 10, 16
|
|
addc 18, 18, 10
|
|
addze 19, 19
|
|
|
|
# acc[2] += temp >> 16;
|
|
SHR 9, 10, 31, 30, 16
|
|
addc 18, 18, 10
|
|
adde 19, 19, 9
|
|
|
|
# acc[1] += (temp & 0xffff) << 40;
|
|
srdi 11, 7, 48 # 0xffff
|
|
and 10, 30, 11
|
|
sldi 10, 10, 40
|
|
addc 16, 16, 10
|
|
addze 17, 17
|
|
|
|
# acc[1] -= temp >> 24;
|
|
SHR 9, 10, 31, 30, 24
|
|
subfc 16, 10, 16
|
|
subfe 17, 9, 17
|
|
|
|
# acc[0] -= (temp & 0xffffff) << 32;
|
|
srdi 11, 7, 40 # 0xffffff
|
|
and 10, 30, 11
|
|
sldi 10, 10, 32
|
|
li 9, 0
|
|
subfc 14, 10, 14
|
|
subfe 15, 9, 15
|
|
|
|
# acc[0] += temp;
|
|
addc 14, 14, 30
|
|
adde 15, 15, 31
|
|
|
|
# Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6
|
|
#
|
|
# acc[1] += acc[0] >> 56; /* acc[1] < acc_old[1] + 2^72 */
|
|
SHR 9, 10, 15, 14, 56
|
|
addc 16, 16, 10
|
|
adde 17, 17, 9
|
|
|
|
# acc[0] &= 0x00ffffffffffffff;
|
|
srdi 11, 7, 8 # 0x00ffffffffffffff
|
|
and 14, 14, 11
|
|
li 15, 0
|
|
|
|
# acc[2] += acc[1] >> 56; /* acc[2] < acc_old[2] + 2^72 + 2^16 */
|
|
SHR 9, 10, 17, 16, 56
|
|
addc 18, 18, 10
|
|
adde 19, 19, 9
|
|
|
|
# acc[1] &= 0x00ffffffffffffff;
|
|
and 16, 16, 11
|
|
li 17, 0
|
|
|
|
# acc[3] += acc[2] >> 56; /* acc[3] < acc_old[3] + 2^72 + 2^16 */
|
|
SHR 9, 10, 19, 18, 56
|
|
addc 20, 20, 10
|
|
adde 21, 21, 9
|
|
|
|
# acc[2] &= 0x00ffffffffffffff;
|
|
and 18, 18, 11
|
|
li 19, 0
|
|
|
|
# acc[4] += acc[3] >> 56;
|
|
SHR 9, 10, 21, 20, 56
|
|
addc 22, 22, 10
|
|
adde 23, 23, 9
|
|
|
|
# acc[3] &= 0x00ffffffffffffff;
|
|
and 20, 20, 11
|
|
li 21, 0
|
|
|
|
# acc[5] += acc[4] >> 56;
|
|
SHR 9, 10, 23, 22, 56
|
|
addc 24, 24, 10
|
|
adde 25, 25, 9
|
|
|
|
# acc[4] &= 0x00ffffffffffffff;
|
|
and 22, 22, 11
|
|
|
|
# acc[6] += acc[5] >> 56;
|
|
SHR 9, 10, 25, 24, 56
|
|
addc 26, 26, 10
|
|
adde 27, 27, 9
|
|
|
|
# acc[5] &= 0x00ffffffffffffff;
|
|
and 24, 24, 11
|
|
|
|
std 14, 0(3)
|
|
std 16, 8(3)
|
|
std 18, 16(3)
|
|
std 20, 24(3)
|
|
std 22, 32(3)
|
|
std 24, 40(3)
|
|
std 26, 48(3)
|
|
blr
|
|
.size _p384_felem_reduce_core,.-_p384_felem_reduce_core
|
|
|
|
.data
|
|
.align 4
|
|
.LConst:
|
|
# two124m68:
|
|
.long 0x0, 0x0, 0xfffffff0, 0xfffffff
|
|
# two124m116m68:
|
|
.long 0x0, 0x0, 0xfffffff0, 0xfefffff
|
|
#two124p108m76:
|
|
.long 0x0, 0x0, 0xfffff000, 0x10000fff
|
|
#two124m92m68:
|
|
.long 0x0, 0x0, 0xeffffff0, 0xfffffff
|
|
|
|
.text
|
|
|
|
#
|
|
# void p384_felem_square_reduce(felem out, const felem in)
|
|
#
|
|
.global p384_felem_square_reduce
|
|
.type p384_felem_square_reduce,\@function
|
|
.align 4
|
|
p384_felem_square_reduce:
|
|
stdu 1, -512(1)
|
|
mflr 0
|
|
std 14, 56(1)
|
|
std 15, 64(1)
|
|
std 16, 72(1)
|
|
std 17, 80(1)
|
|
std 18, 88(1)
|
|
std 19, 96(1)
|
|
std 20, 104(1)
|
|
std 21, 112(1)
|
|
std 22, 120(1)
|
|
std 23, 128(1)
|
|
std 24, 136(1)
|
|
std 25, 144(1)
|
|
std 26, 152(1)
|
|
std 27, 160(1)
|
|
std 28, 168(1)
|
|
std 29, 176(1)
|
|
std 30, 184(1)
|
|
std 31, 192(1)
|
|
|
|
std 3, 496(1)
|
|
addi 3, 1, 208
|
|
bl _p384_felem_square_core
|
|
|
|
mr 4, 3
|
|
ld 3, 496(1)
|
|
bl _p384_felem_reduce_core
|
|
|
|
ld 14, 56(1)
|
|
ld 15, 64(1)
|
|
ld 16, 72(1)
|
|
ld 17, 80(1)
|
|
ld 18, 88(1)
|
|
ld 19, 96(1)
|
|
ld 20, 104(1)
|
|
ld 21, 112(1)
|
|
ld 22, 120(1)
|
|
ld 23, 128(1)
|
|
ld 24, 136(1)
|
|
ld 25, 144(1)
|
|
ld 26, 152(1)
|
|
ld 27, 160(1)
|
|
ld 28, 168(1)
|
|
ld 29, 176(1)
|
|
ld 30, 184(1)
|
|
ld 31, 192(1)
|
|
addi 1, 1, 512
|
|
mtlr 0
|
|
blr
|
|
.size p384_felem_square_reduce,.-p384_felem_square_reduce
|
|
|
|
#
|
|
# void p384_felem_mul_reduce(felem out, const felem in1, const felem in2)
|
|
#
|
|
.global p384_felem_mul_reduce
|
|
.type p384_felem_mul_reduce,\@function
|
|
.align 5
|
|
p384_felem_mul_reduce:
|
|
stdu 1, -512(1)
|
|
mflr 0
|
|
std 14, 56(1)
|
|
std 15, 64(1)
|
|
std 16, 72(1)
|
|
std 17, 80(1)
|
|
std 18, 88(1)
|
|
std 19, 96(1)
|
|
std 20, 104(1)
|
|
std 21, 112(1)
|
|
std 22, 120(1)
|
|
std 23, 128(1)
|
|
std 24, 136(1)
|
|
std 25, 144(1)
|
|
std 26, 152(1)
|
|
std 27, 160(1)
|
|
std 28, 168(1)
|
|
std 29, 176(1)
|
|
std 30, 184(1)
|
|
std 31, 192(1)
|
|
|
|
std 3, 496(1)
|
|
addi 3, 1, 208
|
|
bl _p384_felem_mul_core
|
|
|
|
mr 4, 3
|
|
ld 3, 496(1)
|
|
bl _p384_felem_reduce_core
|
|
|
|
ld 14, 56(1)
|
|
ld 15, 64(1)
|
|
ld 16, 72(1)
|
|
ld 17, 80(1)
|
|
ld 18, 88(1)
|
|
ld 19, 96(1)
|
|
ld 20, 104(1)
|
|
ld 21, 112(1)
|
|
ld 22, 120(1)
|
|
ld 23, 128(1)
|
|
ld 24, 136(1)
|
|
ld 25, 144(1)
|
|
ld 26, 152(1)
|
|
ld 27, 160(1)
|
|
ld 28, 168(1)
|
|
ld 29, 176(1)
|
|
ld 30, 184(1)
|
|
ld 31, 192(1)
|
|
addi 1, 1, 512
|
|
mtlr 0
|
|
blr
|
|
.size p384_felem_mul_reduce,.-p384_felem_mul_reduce
|
|
___
|
|
|
|
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
|
print $code;
|
|
close STDOUT or die "error closing STDOUT: $!";
|