mirror of https://github.com/alibaba/MNN.git
140 lines
2.9 KiB
ArmAsm
140 lines
2.9 KiB
ArmAsm
//
|
|
// MNNGelu.S
|
|
// MNN
|
|
//
|
|
// Created by MNN on 2023/2/27.
|
|
// Copyright © 2018, Alibaba Group Holding Limited
|
|
//
|
|
|
|
#ifdef __aarch64__
|
|
|
|
#include "MNNAsmGlobal.h"
|
|
.text
|
|
.align 5
|
|
|
|
asm_function MNNGelu
|
|
//void MNNGelu(float* dst, const float* src, size_t size, float* parameters);
|
|
|
|
//Auto Load:
|
|
//x0:dst, x1:src, x2:size, x3: parameters
|
|
|
|
stp d14, d15, [sp, #-64]!
|
|
stp d12, d13, [sp, #16]
|
|
stp d10, d11, [sp, #32]
|
|
stp d8, d9, [sp, #48]
|
|
|
|
cmp x2, #0
|
|
beq GeluEnd
|
|
|
|
ldr w4, [x3, #0] // w4, 0.044715f
|
|
ldr w5, [x3, #4] // w5, 0.79788458f
|
|
ldr w6, [x3, #8] // w6, 378.f
|
|
ldr w7, [x3, #12] // w7, 17325.f
|
|
ldr w8, [x3, #16] // w8, 135135.f
|
|
ldr w9, [x3, #20] // w9, 28.f
|
|
ldr w10, [x3, #24] // w10, 3150.f
|
|
ldr w11, [x3, #28] // w11, 62370.f
|
|
|
|
|
|
dup v15.4s, w4 // v15: [0.044715f]x4
|
|
dup v14.4s, w5 // v14: [0.79788458f]x4
|
|
dup v13.4s, w6 // v13: [378.f]x4
|
|
dup v12.4s, w7 // v12: [17325.f]x4
|
|
dup v11.4s, w8 // v11: [135135.f]x4
|
|
dup v10.4s, w9 // v10: [28.f]x4
|
|
dup v9.4s, w10 // v9: [3150.f]x4
|
|
dup v8.4s, w11 // v8: [62370.f]x4
|
|
|
|
mov w4, #5
|
|
mov w5, #-5
|
|
|
|
dup v30.4s, w4
|
|
dup v31.4s, w5
|
|
|
|
scvtf v30.4s, v30.4s
|
|
scvtf v31.4s, v31.4s
|
|
|
|
GeluZLoop:
|
|
|
|
ld1 {v0.4s, v1.4s}, [x1], #32 // v0, v1: fp32x4
|
|
|
|
fmul v2.4s, v0.4s, v0.4s
|
|
fmul v3.4s, v1.4s, v1.4s
|
|
fmul v2.4s, v2.4s, v0.4s
|
|
fmul v3.4s, v3.4s, v1.4s
|
|
|
|
fmul v2.4s, v2.4s, v15.4s
|
|
fadd v2.4s, v2.4s, v0.4s
|
|
fmul v3.4s, v3.4s, v15.4s
|
|
fadd v3.4s, v3.4s, v1.4s
|
|
|
|
fmul v2.4s, v2.4s, v14.4s
|
|
fmul v3.4s, v3.4s, v14.4s
|
|
|
|
// tanh(value)
|
|
fmax v2.4s, v31.4s, v2.4s
|
|
fmax v3.4s, v31.4s, v3.4s
|
|
fmin v2.4s, v30.4s, v2.4s
|
|
fmin v3.4s, v30.4s, v3.4s
|
|
|
|
fmul v4.4s, v2.4s, v2.4s // q4: value*value
|
|
fmul v5.4s, v3.4s, v3.4s // q5: value*value
|
|
// a
|
|
fadd v6.4s, v4.4s, v13.4s
|
|
fadd v7.4s, v5.4s, v13.4s
|
|
fmul v6.4s, v6.4s, v4.4s
|
|
fmul v7.4s, v7.4s, v5.4s
|
|
fadd v6.4s, v6.4s, v12.4s
|
|
fadd v7.4s, v7.4s, v12.4s
|
|
fmul v6.4s, v6.4s, v4.4s
|
|
fmul v7.4s, v7.4s, v5.4s
|
|
fadd v6.4s, v6.4s, v11.4s
|
|
fadd v7.4s, v7.4s, v11.4s
|
|
fmul v6.4s, v6.4s, v2.4s
|
|
fmul v7.4s, v7.4s, v3.4s
|
|
//b
|
|
fmul v2.4s, v4.4s, v10.4s
|
|
fmul v3.4s, v5.4s, v10.4s
|
|
fadd v2.4s, v2.4s, v9.4s
|
|
fadd v3.4s, v3.4s, v9.4s
|
|
fmul v2.4s, v2.4s, v4.4s
|
|
fmul v3.4s, v3.4s, v5.4s
|
|
fadd v2.4s, v2.4s, v8.4s
|
|
fadd v3.4s, v3.4s, v8.4s
|
|
fmul v2.4s, v2.4s, v4.4s
|
|
fmul v3.4s, v3.4s, v5.4s
|
|
fadd v2.4s, v2.4s, v11.4s
|
|
fadd v3.4s, v3.4s, v11.4s
|
|
//a/b
|
|
fdiv v6.4s, v6.4s, v2.4s
|
|
fdiv v7.4s, v7.4s, v3.4s
|
|
// border case
|
|
fmov v2.4s, #1.0
|
|
fmov v3.4s, #-1.0
|
|
fmov v4.4s, #0.5
|
|
fmin v6.4s, v6.4s, v2.4s
|
|
fmin v7.4s, v7.4s, v2.4s
|
|
fmax v6.4s, v6.4s, v3.4s
|
|
fmax v7.4s, v7.4s, v3.4s
|
|
// tanh(value)
|
|
|
|
fadd v6.4s, v6.4s, v2.4s
|
|
fadd v7.4s, v7.4s, v2.4s
|
|
fmul v6.4s, v6.4s, v0.4s
|
|
fmul v7.4s, v7.4s, v1.4s
|
|
fmul v6.4s, v6.4s, v4.4s
|
|
fmul v7.4s, v7.4s, v4.4s
|
|
|
|
st1 {v6.4s, v7.4s}, [x0], #32
|
|
|
|
subs x2, x2, #1
|
|
bne GeluZLoop
|
|
|
|
GeluEnd:
|
|
ldp d8, d9, [sp, #48]
|
|
ldp d10, d11, [sp, #32]
|
|
ldp d12, d13, [sp, #16]
|
|
ldp d14, d15, [sp], #64
|
|
ret
|
|
#endif
|