// File: MNN/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S
// (page-extraction metadata, kept as comments: 1144 lines, 35 KiB, ArmAsm)

//
// MNNGemmInt8AddBiasScale_ARMV82_Unit.S
// MNN
//
// Created by MNN on 2019/12/17.
// Copyright © 2018, Alibaba Group Holding Limited
//
#if defined(__aarch64__)
#include "MNNAsmGlobal.h"
.text
.align 5
// Broadcast-add one f32x4 bias vector (z0) into four f32x4 accumulators (d0..d3).
.macro ADD_BIAS_FLOAT d0, d1, d2, d3, z0
fadd \d0\().4s, \d0\().4s, \z0\().4s
fadd \d1\().4s, \d1\().4s, \z0\().4s
fadd \d2\().4s, \d2\().4s, \z0\().4s
fadd \d3\().4s, \d3\().4s, \z0\().4s
.endm
// Element-wise f32x4 add of four accumulator/source vector pairs: dN += sN.
// Used to accumulate partial results already stored in the destination buffer.
.macro ADD_FLOAT d0, d1, d2, d3, s0, s1, s2, s3
fadd \d0\().4s, \d0\().4s, \s0\().4s
fadd \d1\().4s, \d1\().4s, \s1\().4s
fadd \d2\().4s, \d2\().4s, \s2\().4s
fadd \d3\().4s, \d3\().4s, \s3\().4s
.endm
// Zero four vector registers (accumulator initialization before the sdot loop).
.macro SET_BIAS d0, d1, d2, d3
movi \d0\().16b, #0
movi \d1\().16b, #0
movi \d2\().16b, #0
movi \d3\().16b, #0
.endm
// In-place signed int32x4 -> float32x4 conversion of four accumulators.
.macro Int32ToFloat z0, z1, z2, z3
scvtf \z0\().4s, \z0\().4s
scvtf \z1\().4s, \z1\().4s
scvtf \z2\().4s, \z2\().4s
scvtf \z3\().4s, \z3\().4s
.endm
// Multiply four f32x4 accumulators by one per-output-channel scale vector s.
.macro MUL_SCALE s, d0, d1, d2, d3
fmul \d0\().4s, \d0\().4s, \s\().4s
fmul \d1\().4s, \d1\().4s, \s\().4s
fmul \d2\().4s, \d2\().4s, \s\().4s
fmul \d3\().4s, \d3\().4s, \s\().4s
.endm
// Multiply each accumulator by a per-tile scalar taken from successive lanes
// of s (lane i scales the accumulator for tile i).
.macro MUL_EXTRA_SCALE s, d0, d1, d2, d3
fmul \d0\().4s, \d0\().4s, \s\().s[0]
fmul \d1\().4s, \d1\().4s, \s\().s[1]
fmul \d2\().4s, \d2\().4s, \s\().s[2]
fmul \d3\().4s, \d3\().4s, \s\().s[3]
.endm
// In-place float32x4 -> int32x4 conversion, rounding to nearest with ties
// away from zero (fcvtas), matching round() semantics for quantization.
.macro FloatToInt32 z0, z1, z2, z3
fcvtas \z0\().4s, \z0\().4s
fcvtas \z1\().4s, \z1\().4s
fcvtas \z2\().4s, \z2\().4s
fcvtas \z3\().4s, \z3\().4s
.endm
// Saturating narrow: pack four int32x4 (s0..s3) into two int16x8 (d0, d1).
.macro Int32ToInt16 s0, s1, s2, s3, d0, d1
sqxtn \d0\().4h, \s0\().4s
sqxtn2 \d0\().8h, \s1\().4s
sqxtn \d1\().4h, \s2\().4s
sqxtn2 \d1\().8h, \s3\().4s
.endm
// Saturating narrow: pack two int16x8 (s0, s1) into one int8x16 (d0).
.macro Int16ToInt8_ONE s0, s1, d0
sqxtn \d0\().8b, \s0\().8h
sqxtn2 \d0\().16b, \s1\().8h
.endm
// Saturating narrow: pack four int16x8 (s0..s3) into two int8x16 (d0, d1).
.macro Int16ToInt8 s0, s1, s2, s3, d0, d1
Int16ToInt8_ONE \s0, \s1, \d0
Int16ToInt8_ONE \s2, \s3, \d1
.endm
// d0 += s1 * s0[idx]: fold the weight-zero-point correction into an
// accumulator. s0 holds per-tile kernel sums (lane idx selects the tile),
// s1 holds the per-output-channel weight quant bias.
.macro MLA_WEIGHTZERO d0, s0, s1, idx // idx for xKernelSum
fmla \d0\().4s, \s1\().4s, \s0\().s[\idx]
.endm
// Clamp four f32x4 vectors into [z0, z1] (z0 = min bound, z1 = max bound):
// first cap at max, then raise to min.
.macro ReLU_FP32 s0, s1, s2, s3, z0, z1 // z0:min z1:max
fmin \s0\().4s, \s0\().4s, \z1\().4s
fmin \s1\().4s, \s1\().4s, \z1\().4s
fmin \s2\().4s, \s2\().4s, \z1\().4s
fmin \s3\().4s, \s3\().4s, \z1\().4s
fmax \s0\().4s, \s0\().4s, \z0\().4s
fmax \s1\().4s, \s1\().4s, \z0\().4s
fmax \s2\().4s, \s2\().4s, \z0\().4s
fmax \s3\().4s, \s3\().4s, \z0\().4s
.endm
asm_function MNNGemmInt8AddBiasScale_ARMV82_Unit
/*
struct QuanPostTreatParameters {
const float* scale;
const float* biasFloat;
int32_t maxValue;
int32_t minValue;
int32_t useInt8 = 1; // Save result as int8_t dataType; otherwise float32.
float roundValuePos = 0.5f;
float roundValueNeg = -0.5f;
float* srcKernelSum;
float* weightQuanBias;
float* fp32minmax;
ssize_t blockNum;
const int32_t* bias;
float* extraScale;
};
*/
//void MNNGemmInt8AddBiasScale_ARMV82_Unit(int8_t* dst, const int8_t* src,
// const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad,
// const QuanPostTreatParameters* parameters, size_t realDstCount);
//Auto: x0:dst, x1:src, x2:weight, x3:src_depth_quad, x4:dst_step
//x5:dst_depth_quad, x6: parameters, x7: realDstCount
//Load from x6: x8: scale, x9: bias, w28: useInt8, x25: xKernelSum, x26: weightQuantBias, x23: fp32minmax
// x24: extraScale
ldr x8, [x6, #0]  // parameters->scale
ldr x9, [x6, #8]  // parameters->biasFloat
// Save callee-saved registers: 9 x 16-byte slots. Per AAPCS64 only the low
// 64 bits of v8-v15 must be preserved, hence d-register pairs.
stp d14, d15, [sp, #(-16 * 9)]!
stp d12, d13, [sp, #(16 * 1)]
stp d10, d11, [sp, #(16 * 2)]
stp d8, d9, [sp, #(16 * 3)]
stp x21, x22, [sp, #(16 * 4)]
stp x19, x20, [sp, #(16 * 5)]
stp x27, x28, [sp, #(16 * 6)]
stp x25, x26, [sp, #(16 * 7)]
stp x23, x24, [sp, #(16 * 8)]
lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
ldr w28, [x6, #24] // useInt8
ldr x25, [x6, #40] // xKernelSum
ldr x26, [x6, #48] // weightQuantBias
ldr x24, [x6, #80] // extraScale
// Int8 output path: x23 points at maxValue/minValue inside the parameter
// struct; x21 is the per-tile dst stride in bytes (pack=4 int8).
add x23, x6, #16 // int8 max ptr
mov x21, #4 // sizeof(int8_t) * pack
cbnz w28, Start
// Float output path: 4 floats per pack, clamp bounds come from fp32minmax.
mov x21, #16 // sizeof(float) * pack
ldr x23, [x6, #56] // fp32minmax
Start:
lsl x22, x7, #2 // eDest * SRC_UNIT
// ---------------------------------------------------------------------------
// TILE_12: handle 12 source columns (tiles) at once. The L8 path produces
// 8 output channels per dz iteration (two 4-channel groups, accumulators
// v8-v31); the L4 path handles a trailing odd dst_depth_quad (4 channels,
// v8-v19). Falls through to TILE_8 when fewer than 12 tiles remain.
// ---------------------------------------------------------------------------
TILE_12:
cmp x7, #12
blt TILE_8
cmp x5, #2
blt L4LoopDz_TILE_12
L8LoopDz_TILE_12:
mov x11, x1   // src cursor for this dz block
mov x13, x3   // src_depth_quad counter
mov x20, x0 // tag dst address
mov x27, x2   // remember weight base for this dz block
SET_BIAS v8, v9, v10, v11
SET_BIAS v12, v13, v14, v15
SET_BIAS v16, v17, v18, v19
SET_BIAS v20, v21, v22, v23
SET_BIAS v24, v25, v26, v27
SET_BIAS v28, v29, v30, v31
// Inner reduction: each iteration consumes 32 weight bytes (8 oc x 4 lp)
// and 48 src bytes (12 tiles x 4 lp). The .inst words are hand-encoded
// Armv8.2 sdot instructions (see the trailing comments) so the file
// assembles without requiring the +dotprod feature flag.
L8LoopSz_TILE_12:
ld1 {v3.16b, v4.16b}, [x2], #32 // weight
ld1 {v0.16b, v1.16b, v2.16b}, [x11], #48 // src
.inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
.inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
.inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
.inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3]
.inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0]
.inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1]
.inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2]
.inst 0x4fa1e86f // sdot v15.4s, v3.16b, v1.4b[3]
.inst 0x4f82e070 // sdot v16.4s, v3.16b, v2.4b[0]
.inst 0x4fa2e071 // sdot v17.4s, v3.16b, v2.4b[1]
.inst 0x4f82e872 // sdot v18.4s, v3.16b, v2.4b[2]
.inst 0x4fa2e873 // sdot v19.4s, v3.16b, v2.4b[3]
.inst 0x4f80e094 // sdot v20.4s, v4.16b, v0.4b[0]
.inst 0x4fa0e095 // sdot v21.4s, v4.16b, v0.4b[1]
.inst 0x4f80e896 // sdot v22.4s, v4.16b, v0.4b[2]
.inst 0x4fa0e897 // sdot v23.4s, v4.16b, v0.4b[3]
.inst 0x4f81e098 // sdot v24.4s, v4.16b, v1.4b[0]
.inst 0x4fa1e099 // sdot v25.4s, v4.16b, v1.4b[1]
.inst 0x4f81e89a // sdot v26.4s, v4.16b, v1.4b[2]
.inst 0x4fa1e89b // sdot v27.4s, v4.16b, v1.4b[3]
subs x13, x13, #1
.inst 0x4f82e09c // sdot v28.4s, v4.16b, v2.4b[0]
.inst 0x4fa2e09d // sdot v29.4s, v4.16b, v2.4b[1]
.inst 0x4f82e89e // sdot v30.4s, v4.16b, v2.4b[2]
.inst 0x4fa2e89f // sdot v31.4s, v4.16b, v2.4b[3]
bne L8LoopSz_TILE_12
L8LoopSzEnd_TILE_12:
add x2, x27, x15   // advance weight base by one 8-oc super-block
sub x5, x5, #2     // consumed two dz (8 output channels)
// Dequantize: int32 -> float, apply per-channel scale, optional per-tile
// extra scale, then the weight zero-point correction via kernel sums.
L8Tile12Quan:
ld1 {v0.4s, v1.4s}, [x8], #32 // scale
ld1 {v2.4s, v3.4s, v4.4s}, [x25] // x kernel sum
ld1 {v5.4s, v6.4s}, [x26], #32 // weight quan zeropoint
Int32ToFloat v8, v9, v10, v11
Int32ToFloat v12, v13, v14, v15
Int32ToFloat v16, v17, v18, v19
Int32ToFloat v20, v21, v22, v23
Int32ToFloat v24, v25, v26, v27
Int32ToFloat v28, v29, v30, v31
MUL_SCALE v0, v8, v9, v10, v11
MUL_SCALE v0, v12, v13, v14, v15
MUL_SCALE v0, v16, v17, v18, v19
MUL_SCALE v1, v20, v21, v22, v23
MUL_SCALE v1, v24, v25, v26, v27
MUL_SCALE v1, v28, v29, v30, v31
cbz x24, TILE12_L8_MLA
ld1 {v0.4s, v1.4s}, [x24], #32   // extraScale lanes for tiles 0-7
ld1 {v7.4s}, [x24]               // extraScale lanes for tiles 8-11
MUL_EXTRA_SCALE v0, v8, v9, v10, v11
MUL_EXTRA_SCALE v1, v12, v13, v14, v15
MUL_EXTRA_SCALE v7, v16, v17, v18, v19
MUL_EXTRA_SCALE v0, v20, v21, v22, v23
MUL_EXTRA_SCALE v1, v24, v25, v26, v27
MUL_EXTRA_SCALE v7, v28, v29, v30, v31
sub x24, x24, #32   // rewind; x24 is advanced once per tile group at TileXEnd
TILE12_L8_MLA:
MLA_WEIGHTZERO v8, v2, v5, 0 // tile:0, oc:0-3
MLA_WEIGHTZERO v9, v2, v5, 1 // tile:1, oc:0-3
MLA_WEIGHTZERO v10, v2, v5, 2 // tile:2, oc:0-3
MLA_WEIGHTZERO v11, v2, v5, 3 // tile:3, oc:0-3
MLA_WEIGHTZERO v12, v3, v5, 0 // tile:4, oc:0-3
MLA_WEIGHTZERO v13, v3, v5, 1 // tile:5, oc:0-3
MLA_WEIGHTZERO v14, v3, v5, 2 // tile:6, oc:0-3
MLA_WEIGHTZERO v15, v3, v5, 3 // tile:7, oc:0-3
MLA_WEIGHTZERO v16, v4, v5, 0 // tile:8, oc:0-3
MLA_WEIGHTZERO v17, v4, v5, 1 // tile:9, oc:0-3
MLA_WEIGHTZERO v18, v4, v5, 2 // tile:10, oc:0-3
MLA_WEIGHTZERO v19, v4, v5, 3 // tile:11, oc:0-3
MLA_WEIGHTZERO v20, v2, v6, 0 // tile:0, oc:4-7
MLA_WEIGHTZERO v21, v2, v6, 1 // tile:1, oc:4-7
MLA_WEIGHTZERO v22, v2, v6, 2 // tile:2, oc:4-7
MLA_WEIGHTZERO v23, v2, v6, 3 // tile:3, oc:4-7
MLA_WEIGHTZERO v24, v3, v6, 0 // tile:4, oc:4-7
MLA_WEIGHTZERO v25, v3, v6, 1 // tile:5, oc:4-7
MLA_WEIGHTZERO v26, v3, v6, 2 // tile:6, oc:4-7
MLA_WEIGHTZERO v27, v3, v6, 3 // tile:7, oc:4-7
MLA_WEIGHTZERO v28, v4, v6, 0 // tile:8, oc:4-7
MLA_WEIGHTZERO v29, v4, v6, 1 // tile:9, oc:4-7
MLA_WEIGHTZERO v30, v4, v6, 2 // tile:10, oc:4-7
MLA_WEIGHTZERO v31, v4, v6, 3 // tile:11, oc:4-7
cmp w28, #1
beq L8Tile12QuanUseInt8
// Float32 output path. Each channel-group row is 12 tiles = 192 bytes,
// written as two 64-byte stores plus one strided store; pre-adjust x4 by
// 128 so "st1 ..., x4" lands at the next row (restored below).
sub x4, x4, #128
cbz x9, TILE12_ADD_DSTV
TILE12_ADD_BIAS:
ld1 {v0.4s, v1.4s}, [x9], #32
ADD_BIAS_FLOAT v8, v9, v10, v11, v0
ADD_BIAS_FLOAT v12, v13, v14, v15, v0
ADD_BIAS_FLOAT v16, v17, v18, v19, v0
ADD_BIAS_FLOAT v20, v21, v22, v23, v1
ADD_BIAS_FLOAT v24, v25, v26, v27, v1
ADD_BIAS_FLOAT v28, v29, v30, v31, v1
b TILE12_POST
TILE12_ADD_DSTV:
// No bias: accumulate onto the existing dst contents (x20 was tagged at
// loop entry), e.g. for multi-block accumulation.
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x20], #64
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x20], #64
ADD_FLOAT v8, v9, v10, v11, v0, v1, v2, v3
ADD_FLOAT v12, v13, v14, v15, v4, v5, v6, v7
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x20], x4
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x20], #64
ADD_FLOAT v16, v17, v18, v19, v0, v1, v2, v3
ADD_FLOAT v20, v21, v22, v23, v4, v5, v6, v7
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x20], #64
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x20]
ADD_FLOAT v24, v25, v26, v27, v0, v1, v2, v3
ADD_FLOAT v28, v29, v30, v31, v4, v5, v6, v7
TILE12_POST:
cbz x23, TILE12_STORE
ld1r {v0.4s}, [x23], #4 // f32 min
ld1r {v1.4s}, [x23] // f32 max
ReLU_FP32 v8, v9, v10, v11, v0, v1
ReLU_FP32 v12, v13, v14, v15, v0, v1
ReLU_FP32 v16, v17, v18, v19, v0, v1
ReLU_FP32 v20, v21, v22, v23, v0, v1
ReLU_FP32 v24, v25, v26, v27, v0, v1
ReLU_FP32 v28, v29, v30, v31, v0, v1
sub x23, x23, #4   // rewind fp32minmax pointer for the next block
TILE12_STORE:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x4
st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64
st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], x4
add x4, x4, #128   // restore dst_step
b L8Tile12LoopCheck
L8Tile12QuanUseInt8:
// Int8 output path: add bias, round (fcvtas), saturating-narrow to int8
// and clamp to [minValue, maxValue] from the parameter struct.
ld1r {v7.4s}, [x23], #4 // int8 max
ld1r {v6.4s}, [x23] // int8 min
ld1 {v0.4s, v1.4s}, [x9], #32
dup v7.16b, v7.b[0]
dup v6.16b, v6.b[0]
ADD_BIAS_FLOAT v8, v9, v10, v11, v0
ADD_BIAS_FLOAT v12, v13, v14, v15, v0
ADD_BIAS_FLOAT v16, v17, v18, v19, v0
ADD_BIAS_FLOAT v20, v21, v22, v23, v1
ADD_BIAS_FLOAT v24, v25, v26, v27, v1
ADD_BIAS_FLOAT v28, v29, v30, v31, v1
sub x23, x23, #4
FloatToInt32 v8, v9, v10, v11
FloatToInt32 v12, v13, v14, v15
FloatToInt32 v16, v17, v18, v19
FloatToInt32 v20, v21, v22, v23
FloatToInt32 v24, v25, v26, v27
FloatToInt32 v28, v29, v30, v31
Int32ToInt16 v8, v9, v10, v11, v0, v1
Int32ToInt16 v12, v13, v14, v15, v2, v3
Int32ToInt16 v16, v17, v18, v19, v4, v5
Int32ToInt16 v20, v21, v22, v23, v8, v9
Int32ToInt16 v24, v25, v26, v27, v10, v11
Int32ToInt16 v28, v29, v30, v31, v12, v13
Int16ToInt8 v0, v1, v2, v3, v16, v17
Int16ToInt8 v4, v5, v8, v9, v18, v19
Int16ToInt8 v10, v11, v12, v13, v20, v21
smax v16.16b, v6.16b, v16.16b
smax v17.16b, v6.16b, v17.16b
smax v18.16b, v6.16b, v18.16b
smax v19.16b, v6.16b, v19.16b
smax v20.16b, v6.16b, v20.16b
smax v21.16b, v6.16b, v21.16b
smin v16.16b, v7.16b, v16.16b
smin v17.16b, v7.16b, v17.16b
smin v18.16b, v7.16b, v18.16b
smin v19.16b, v7.16b, v19.16b
smin v20.16b, v7.16b, v20.16b
smin v21.16b, v7.16b, v21.16b
st1 {v16.16b, v17.16b, v18.16b}, [x0], x4   // oc 0-3, 12 tiles
st1 {v19.16b, v20.16b, v21.16b}, [x0], x4   // oc 4-7, 12 tiles
L8Tile12LoopCheck:
cmp x5, #1
bgt L8LoopDz_TILE_12
cbz x5, End
// Tail: one remaining dz (4 output channels) for the full 12-tile width.
L4LoopDz_TILE_12:
SET_BIAS v8, v9, v10, v11
SET_BIAS v12, v13, v14, v15
SET_BIAS v16, v17, v18, v19
L4LoopSz_TILE_12:
ld1 {v3.16b}, [x2] // weight
ld1 {v0.16b, v1.16b, v2.16b}, [x1], #48 // src
.inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
.inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
.inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
.inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3]
add x2, x2, #32 // weight offset=lp*hp=32
subs x3, x3, #1
.inst 0x4f82e070 // sdot v16.4s, v3.16b, v2.4b[0]
.inst 0x4fa2e071 // sdot v17.4s, v3.16b, v2.4b[1]
.inst 0x4f82e872 // sdot v18.4s, v3.16b, v2.4b[2]
.inst 0x4fa2e873 // sdot v19.4s, v3.16b, v2.4b[3]
bne L4LoopSz_TILE_12
L4LoopSzEnd_TILE_12:
L4Tile12Quan:
ld1 {v0.4s}, [x8] // scale
ld1 {v2.4s, v3.4s, v4.4s}, [x25]// x kernel sum
ld1 {v5.4s}, [x26], #16 // weight quan zeropoint
Int32ToFloat v8, v9, v10, v11
Int32ToFloat v12, v13, v14, v15
Int32ToFloat v16, v17, v18, v19
MUL_SCALE v0, v8, v9, v10, v11
MUL_SCALE v0, v12, v13, v14, v15
MUL_SCALE v0, v16, v17, v18, v19
cbz x24, TILE12_L4_MLA
ld1 {v0.4s, v1.4s}, [x24], #32
ld1 {v7.4s}, [x24]
MUL_EXTRA_SCALE v0, v8, v9, v10, v11
MUL_EXTRA_SCALE v1, v12, v13, v14, v15
MUL_EXTRA_SCALE v7, v16, v17, v18, v19
sub x24, x24, #32
TILE12_L4_MLA:
MLA_WEIGHTZERO v8, v2, v5, 0 // tile:0, oc:0-3
MLA_WEIGHTZERO v9, v2, v5, 1 // tile:1, oc:0-3
MLA_WEIGHTZERO v10, v2, v5, 2 // tile:2, oc:0-3
MLA_WEIGHTZERO v11, v2, v5, 3 // tile:3, oc:0-3
MLA_WEIGHTZERO v12, v3, v5, 0 // tile:4, oc:0-3
MLA_WEIGHTZERO v13, v3, v5, 1 // tile:5, oc:0-3
MLA_WEIGHTZERO v14, v3, v5, 2 // tile:6, oc:0-3
MLA_WEIGHTZERO v15, v3, v5, 3 // tile:7, oc:0-3
MLA_WEIGHTZERO v16, v4, v5, 0 // tile:8, oc:0-3
MLA_WEIGHTZERO v17, v4, v5, 1 // tile:9, oc:0-3
MLA_WEIGHTZERO v18, v4, v5, 2 // tile:10, oc:0-3
MLA_WEIGHTZERO v19, v4, v5, 3 // tile:11, oc:0-3
cmp w28, #1
beq L4Tile12QuanUseInt8
sub x4, x4, #128
TILE12_L4_ADD_BIAS:
cbz x9, TILE12_L4_ADD_DSTV
ld1 {v0.4s}, [x9] // bias
ADD_BIAS_FLOAT v8, v9, v10, v11, v0
ADD_BIAS_FLOAT v12, v13, v14, v15, v0
ADD_BIAS_FLOAT v16, v17, v18, v19, v0
b TILE12_L4_POST
TILE12_L4_ADD_DSTV:
ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64
ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0]
sub x0, x0, #128   // rewind dst after the accumulate reads
ADD_FLOAT v8, v9, v10, v11, v20, v21, v22, v23
ADD_FLOAT v12, v13, v14, v15, v24, v25, v26, v27
ADD_FLOAT v16, v17, v18, v19, v28, v29, v30, v31
TILE12_L4_POST:
cbz x23, TILE12_L4_STORE
ld1r {v6.4s}, [x23], #4 // f32 min
ld1r {v7.4s}, [x23] // f32 max
ReLU_FP32 v8, v9, v10, v11, v6, v7
ReLU_FP32 v12, v13, v14, v15, v6, v7
ReLU_FP32 v16, v17, v18, v19, v6, v7
sub x23, x23, #4
TILE12_L4_STORE:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x4
add x4, x4, #128
b End
L4Tile12QuanUseInt8:
ld1r {v7.4s}, [x23], #4 // int8 max
ld1r {v6.4s}, [x23] // int8 min
ld1 {v0.4s}, [x9] // bias
dup v7.16b, v7.b[0]
dup v6.16b, v6.b[0]
ADD_BIAS_FLOAT v8, v9, v10, v11, v0
ADD_BIAS_FLOAT v12, v13, v14, v15, v0
ADD_BIAS_FLOAT v16, v17, v18, v19, v0
sub x23, x23, #4
FloatToInt32 v8, v9, v10, v11
FloatToInt32 v12, v13, v14, v15
FloatToInt32 v16, v17, v18, v19
Int32ToInt16 v8, v9, v10, v11, v0, v1
Int32ToInt16 v12, v13, v14, v15, v2, v3
Int32ToInt16 v16, v17, v18, v19, v4, v5
Int16ToInt8 v0, v1, v2, v3, v16, v17
Int16ToInt8_ONE v4, v5, v18
smax v16.16b, v6.16b, v16.16b
smax v17.16b, v6.16b, v17.16b
smax v18.16b, v6.16b, v18.16b
smin v16.16b, v7.16b, v16.16b
smin v17.16b, v7.16b, v17.16b
smin v18.16b, v7.16b, v18.16b
st1 {v16.16b, v17.16b, v18.16b}, [x0], x4
b End
// ---------------------------------------------------------------------------
// TILE_8: handle 8 source columns. Uses shadow cursors (x10 dst, x12 weight,
// x14 dz counter, x19 scale, x20 bias, x6 weightQuantBias) so the originals
// survive for the narrower tails; falls through to TILE_4 when < 8 remain.
// ---------------------------------------------------------------------------
TILE_8:
cmp x7, #8
blt TILE_4
mov x10, x0
mov x12, x2
mov x14, x5
mov x19, x8 // scale
mov x20, x9 // bias
mov x6, x26 // weightQuantBias
cmp x5, #2
blt L4LoopDz_TILE_8
L8LoopDz_TILE_8:
mov x11, x1
mov x13, x3
mov x27, x12
SET_BIAS v8, v9, v10, v11
SET_BIAS v12, v13, v14, v15
SET_BIAS v16, v17, v18, v19
SET_BIAS v20, v21, v22, v23
// Src advances by x22 (= realDstCount*4) per step so the 8-tile slice is
// read out of the full realDstCount-wide packed source row.
L8LoopSz_TILE_8:
ld1 {v3.16b, v4.16b}, [x12], #32 // weight
ld1 {v0.16b, v1.16b}, [x11], x22 // src
.inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
.inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
.inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
.inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3]
.inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0]
.inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1]
.inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2]
.inst 0x4fa1e86f // sdot v15.4s, v3.16b, v1.4b[3]
.inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]
.inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]
.inst 0x4f80e892 // sdot v18.4s, v4.16b, v0.4b[2]
.inst 0x4fa0e893 // sdot v19.4s, v4.16b, v0.4b[3]
subs x13, x13, #1
.inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]
.inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]
.inst 0x4f81e896 // sdot v22.4s, v4.16b, v1.4b[2]
.inst 0x4fa1e897 // sdot v23.4s, v4.16b, v1.4b[3]
bne L8LoopSz_TILE_8
L8LoopSzEnd_TILE_8:
add x12, x27, x15
sub x14, x14, #2
L8Tile8Quan:
ld1 {v0.4s, v1.4s}, [x19], #32 // scale
ld1 {v2.4s, v3.4s}, [x25] // x kernel sum
ld1 {v24.4s, v25.4s}, [x6], #32 // weight quan zeropoint
Int32ToFloat v8, v9, v10, v11
Int32ToFloat v12, v13, v14, v15
Int32ToFloat v16, v17, v18, v19
Int32ToFloat v20, v21, v22, v23
MUL_SCALE v0, v8, v9, v10, v11
MUL_SCALE v0, v12, v13, v14, v15
MUL_SCALE v1, v16, v17, v18, v19
MUL_SCALE v1, v20, v21, v22, v23
cbz x24, TILE8_L8_MLA
ld1 {v0.4s, v1.4s}, [x24]   // extraScale for tiles 0-7 (x24 advanced at Tile8End)
MUL_EXTRA_SCALE v0, v8, v9, v10, v11
MUL_EXTRA_SCALE v1, v12, v13, v14, v15
MUL_EXTRA_SCALE v0, v16, v17, v18, v19
MUL_EXTRA_SCALE v1, v20, v21, v22, v23
TILE8_L8_MLA:
MLA_WEIGHTZERO v8, v2, v24, 0 // tile:0, oc:0-3
MLA_WEIGHTZERO v9, v2, v24, 1 // tile:1, oc:0-3
MLA_WEIGHTZERO v10, v2, v24, 2 // tile:2, oc:0-3
MLA_WEIGHTZERO v11, v2, v24, 3 // tile:3, oc:0-3
MLA_WEIGHTZERO v12, v3, v24, 0 // tile:4, oc:0-3
MLA_WEIGHTZERO v13, v3, v24, 1 // tile:5, oc:0-3
MLA_WEIGHTZERO v14, v3, v24, 2 // tile:6, oc:0-3
MLA_WEIGHTZERO v15, v3, v24, 3 // tile:7, oc:0-3
MLA_WEIGHTZERO v16, v2, v25, 0 // tile:0, oc:4-7
MLA_WEIGHTZERO v17, v2, v25, 1 // tile:1, oc:4-7
MLA_WEIGHTZERO v18, v2, v25, 2 // tile:2, oc:4-7
MLA_WEIGHTZERO v19, v2, v25, 3 // tile:3, oc:4-7
MLA_WEIGHTZERO v20, v3, v25, 0 // tile:4, oc:4-7
MLA_WEIGHTZERO v21, v3, v25, 1 // tile:5, oc:4-7
MLA_WEIGHTZERO v22, v3, v25, 2 // tile:6, oc:4-7
MLA_WEIGHTZERO v23, v3, v25, 3 // tile:7, oc:4-7
cmp w28, #1
beq L8Tile8QuanUseInt8
// Float path: a channel-group row is 8 tiles = 128 bytes, written as one
// 64-byte store + one strided store; pre-adjust x4 by 64 (restored below).
sub x4, x4, #64
cbz x9, TILE8_ADD_DSTV
TILE8_ADD_BIAS:
ld1 {v0.4s, v1.4s}, [x20], #32
ADD_BIAS_FLOAT v8, v9, v10, v11, v0
ADD_BIAS_FLOAT v12, v13, v14, v15, v0
ADD_BIAS_FLOAT v16, v17, v18, v19, v1
ADD_BIAS_FLOAT v20, v21, v22, v23, v1
b TILE8_POST
TILE8_ADD_DSTV:
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x10], #64
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], x4
ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x10], #64
ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x10]
ADD_FLOAT v8, v9, v10, v11, v0, v1, v2, v3
ADD_FLOAT v12, v13, v14, v15, v4, v5, v6, v7
ADD_FLOAT v16, v17, v18, v19, v24, v25, v26, v27
ADD_FLOAT v20, v21, v22, v23, v28, v29, v30, v31
sub x10, x10, #128   // rewind dst cursor back to row start
sub x10, x10, x4
TILE8_POST:
cbz x23, TILE8_STORE
ld1r {v0.4s}, [x23], #4 // f32 min
ld1r {v1.4s}, [x23] // f32 max
ReLU_FP32 v8, v9, v10, v11, v0, v1
ReLU_FP32 v12, v13, v14, v15, v0, v1
ReLU_FP32 v16, v17, v18, v19, v0, v1
ReLU_FP32 v20, v21, v22, v23, v0, v1
sub x23, x23, #4
TILE8_STORE:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], x4
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x10], #64
st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x10], x4
add x4, x4, #64   // restore dst_step
b L8Tile8LoopCheck
L8Tile8QuanUseInt8:
ld1r {v7.4s}, [x23], #4 // int8 max
ld1r {v6.4s}, [x23] // int8 min
ld1 {v0.4s, v1.4s}, [x20], #32
dup v7.16b, v7.b[0]
dup v6.16b, v6.b[0]
ADD_BIAS_FLOAT v8, v9, v10, v11, v0
ADD_BIAS_FLOAT v12, v13, v14, v15, v0
ADD_BIAS_FLOAT v16, v17, v18, v19, v1
ADD_BIAS_FLOAT v20, v21, v22, v23, v1
sub x23, x23, #4
FloatToInt32 v8, v9, v10, v11
FloatToInt32 v12, v13, v14, v15
FloatToInt32 v16, v17, v18, v19
FloatToInt32 v20, v21, v22, v23
Int32ToInt16 v8, v9, v10, v11, v0, v1
Int32ToInt16 v12, v13, v14, v15, v2, v3
Int32ToInt16 v16, v17, v18, v19, v4, v5
Int32ToInt16 v20, v21, v22, v23, v8, v9
Int16ToInt8 v0, v1, v2, v3, v16, v17
Int16ToInt8 v4, v5, v8, v9, v18, v19
smax v16.16b, v6.16b, v16.16b
smax v17.16b, v6.16b, v17.16b
smax v18.16b, v6.16b, v18.16b
smax v19.16b, v6.16b, v19.16b
smin v16.16b, v7.16b, v16.16b
smin v17.16b, v7.16b, v17.16b
smin v18.16b, v7.16b, v18.16b
smin v19.16b, v7.16b, v19.16b
st1 {v16.16b, v17.16b}, [x10], x4   // oc 0-3, 8 tiles
st1 {v18.16b, v19.16b}, [x10], x4   // oc 4-7, 8 tiles
L8Tile8LoopCheck:
cmp x14, #1
bgt L8LoopDz_TILE_8
cbz x14, Tile8End
// Tail: one remaining dz (4 output channels) for the 8-tile width.
L4LoopDz_TILE_8:
mov x11, x1
mov x13, x3
SET_BIAS v8, v9, v10, v11
SET_BIAS v12, v13, v14, v15
L4LoopSz_TILE_8:
ld1 {v3.16b}, [x12] // weight
ld1 {v0.16b, v1.16b}, [x11], x22 // src
.inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
.inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
.inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
.inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3]
add x12, x12, #32 // weight offset=lp*hp
subs x13, x13, #1
.inst 0x4f81e06c // sdot v12.4s, v3.16b, v1.4b[0]
.inst 0x4fa1e06d // sdot v13.4s, v3.16b, v1.4b[1]
.inst 0x4f81e86e // sdot v14.4s, v3.16b, v1.4b[2]
.inst 0x4fa1e86f // sdot v15.4s, v3.16b, v1.4b[3]
bne L4LoopSz_TILE_8
L4LoopSzEnd_TILE_8:
L4Tile8Quan:
ld1 {v0.4s}, [x19], #16 // scale
ld1 {v2.4s, v3.4s}, [x25] // x kernel sum
ld1 {v24.4s}, [x6], #16 // weight quan zeropoint
Int32ToFloat v8, v9, v10, v11
Int32ToFloat v12, v13, v14, v15
MUL_SCALE v0, v8, v9, v10, v11
MUL_SCALE v0, v12, v13, v14, v15
cbz x24, TILE8_L4_MLA
ld1 {v0.4s, v1.4s}, [x24]
MUL_EXTRA_SCALE v0, v8, v9, v10, v11
MUL_EXTRA_SCALE v1, v12, v13, v14, v15
TILE8_L4_MLA:
MLA_WEIGHTZERO v8, v2, v24, 0 // tile:0, oc:0-3
MLA_WEIGHTZERO v9, v2, v24, 1 // tile:1, oc:0-3
MLA_WEIGHTZERO v10, v2, v24, 2 // tile:2, oc:0-3
MLA_WEIGHTZERO v11, v2, v24, 3 // tile:3, oc:0-3
MLA_WEIGHTZERO v12, v3, v24, 0 // tile:4, oc:0-3
MLA_WEIGHTZERO v13, v3, v24, 1 // tile:5, oc:0-3
MLA_WEIGHTZERO v14, v3, v24, 2 // tile:6, oc:0-3
MLA_WEIGHTZERO v15, v3, v24, 3 // tile:7, oc:0-3
cmp w28, #1
beq L4Tile8QuanUseInt8
sub x4, x4, #64
cbz x9, TILE8_L4_ADD_DSTV
TILE8_L4_ADD_BIAS:
ld1 {v4.4s}, [x20], #16
ADD_BIAS_FLOAT v8, v9, v10, v11, v4
ADD_BIAS_FLOAT v12, v13, v14, v15, v4
b TILE8_L4_POST
TILE8_L4_ADD_DSTV:
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], #64
ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x10]
sub x10, x10, #64
ADD_FLOAT v8, v9, v10, v11, v4, v5, v6, v7
ADD_FLOAT v12, v13, v14, v15, v16, v17, v18, v19
TILE8_L4_POST:
cbz x23, TILE8_L4_STORE
ld1r {v0.4s}, [x23], #4 // f32 min
ld1r {v1.4s}, [x23] // f32 max
ReLU_FP32 v8, v9, v10, v11, v0, v1
ReLU_FP32 v12, v13, v14, v15, v0, v1
sub x23, x23, #4
TILE8_L4_STORE:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], x4
add x4, x4, #64
b Tile8End
L4Tile8QuanUseInt8:
ld1r {v7.4s}, [x23], #4 // int8 max
ld1r {v6.4s}, [x23] // int8 min
ld1 {v4.4s}, [x20], #16
dup v7.16b, v7.b[0]
dup v6.16b, v6.b[0]
ADD_BIAS_FLOAT v8, v9, v10, v11, v4
ADD_BIAS_FLOAT v12, v13, v14, v15, v4
sub x23, x23, #4
FloatToInt32 v8, v9, v10, v11
FloatToInt32 v12, v13, v14, v15
Int32ToInt16 v8, v9, v10, v11, v0, v1
Int32ToInt16 v12, v13, v14, v15, v2, v3
Int16ToInt8 v0, v1, v2, v3, v16, v17
smax v16.16b, v6.16b, v16.16b
smax v17.16b, v6.16b, v17.16b
smin v16.16b, v7.16b, v16.16b
smin v17.16b, v7.16b, v17.16b
st1 {v16.16b, v17.16b}, [x10], x4
Tile8End:
// Consume 8 tiles: bump extraScale (8 floats), dst (8 * pack-stride),
// src (8 tiles * 4 bytes) and kernel-sum pointers.
cbz x24, Tile8_End_Offset
add x24, x24, #32
Tile8_End_Offset:
sub x7, x7, #8
add x0, x0, x21, LSL #3
add x1, x1, #32
add x25, x25, #32
// ---------------------------------------------------------------------------
// TILE_4: handle 4 source columns; same structure as TILE_8 with half the
// accumulators. Falls through to TILE_1 when fewer than 4 tiles remain.
// ---------------------------------------------------------------------------
TILE_4:
cmp x7, #4
blt TILE_1
mov x10, x0
mov x12, x2
mov x14, x5
mov x19, x8
mov x20, x9
mov x6, x26 // weightQuantBias
cmp x5, #2
blt L4LoopDz_TILE_4
L8LoopDz_TILE_4:
//ld1 {v0.4s, v1.4s}, [x20], #32 // bias
mov x11, x1
mov x13, x3
mov x27, x12
SET_BIAS v8, v9, v10, v11
SET_BIAS v12, v13, v14, v15
L8LoopSz_TILE_4:
ld1 {v3.16b, v4.16b}, [x12], #32 // weight
ld1 {v0.16b}, [x11], x22 // src
.inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
.inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
.inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
.inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3]
subs x13, x13, #1
.inst 0x4f80e08c // sdot v12.4s, v4.16b, v0.4b[0]
.inst 0x4fa0e08d // sdot v13.4s, v4.16b, v0.4b[1]
.inst 0x4f80e88e // sdot v14.4s, v4.16b, v0.4b[2]
.inst 0x4fa0e88f // sdot v15.4s, v4.16b, v0.4b[3]
bne L8LoopSz_TILE_4
L8LoopSzEnd_TILE_4:
add x12, x27, x15
sub x14, x14, #2
L8Tile4Quan:
ld1 {v0.4s, v1.4s}, [x19], #32 // scale
ld1 {v2.4s}, [x25] // x kernel sum
ld1 {v24.4s, v25.4s}, [x6], #32 // weight quan zeropoint
Int32ToFloat v8, v9, v10, v11
Int32ToFloat v12, v13, v14, v15
MUL_SCALE v0, v8, v9, v10, v11
MUL_SCALE v1, v12, v13, v14, v15
cbz x24, TILE4_L8_MLA
ld1 {v0.4s}, [x24]
MUL_EXTRA_SCALE v0, v8, v9, v10, v11
MUL_EXTRA_SCALE v0, v12, v13, v14, v15
TILE4_L8_MLA:
MLA_WEIGHTZERO v8, v2, v24, 0 // tile:0, oc:0-3
MLA_WEIGHTZERO v9, v2, v24, 1 // tile:1, oc:0-3
MLA_WEIGHTZERO v10, v2, v24, 2 // tile:2, oc:0-3
MLA_WEIGHTZERO v11, v2, v24, 3 // tile:3, oc:0-3
MLA_WEIGHTZERO v12, v2, v25, 0 // tile:0, oc:4-7
MLA_WEIGHTZERO v13, v2, v25, 1 // tile:1, oc:4-7
MLA_WEIGHTZERO v14, v2, v25, 2 // tile:2, oc:4-7
MLA_WEIGHTZERO v15, v2, v25, 3 // tile:3, oc:4-7
cmp w28, #1
beq L8Tile4QuanUseInt8
// 4 tiles = 64 bytes per channel-group row: fits one strided store, so x4
// needs no pre-adjustment here.
cbz x9, TILE4_ADD_DSTV
TILE4_ADD_BIAS:
ld1 {v4.4s, v5.4s}, [x20], #32
ADD_BIAS_FLOAT v8, v9, v10, v11, v4
ADD_BIAS_FLOAT v12, v13, v14, v15, v5
b TILE4_POST
TILE4_ADD_DSTV:
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x10], x4
ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x10]
sub x10, x10, x4
ADD_FLOAT v8, v9, v10, v11, v4, v5, v6, v7
ADD_FLOAT v12, v13, v14, v15, v16, v17, v18, v19
TILE4_POST:
cbz x23, TILE4_STORE
ld1r {v26.4s}, [x23], #4 // f32 min
ld1r {v27.4s}, [x23] // f32 max
ReLU_FP32 v8, v9, v10, v11, v26, v27
ReLU_FP32 v12, v13, v14, v15, v26, v27
sub x23, x23, #4
TILE4_STORE:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], x4
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], x4
b L8Tile4LoopCheck
L8Tile4QuanUseInt8:
ld1r {v7.4s}, [x23], #4 // int8 max
ld1r {v6.4s}, [x23] // int8 min
ld1 {v4.4s, v5.4s}, [x20], #32
dup v7.16b, v7.b[0]
dup v6.16b, v6.b[0]
ADD_BIAS_FLOAT v8, v9, v10, v11, v4
ADD_BIAS_FLOAT v12, v13, v14, v15, v5
sub x23, x23, #4
FloatToInt32 v8, v9, v10, v11
FloatToInt32 v12, v13, v14, v15
Int32ToInt16 v8, v9, v10, v11, v0, v1
Int32ToInt16 v12, v13, v14, v15, v2, v3
Int16ToInt8 v0, v1, v2, v3, v16, v17
smax v16.16b, v6.16b, v16.16b
smax v17.16b, v6.16b, v17.16b
smin v16.16b, v7.16b, v16.16b
smin v17.16b, v7.16b, v17.16b
st1 {v16.16b}, [x10], x4   // oc 0-3, 4 tiles
st1 {v17.16b}, [x10], x4   // oc 4-7, 4 tiles
L8Tile4LoopCheck:
cmp x14, #1
bgt L8LoopDz_TILE_4
cbz x14, Tile4End
// Tail: one remaining dz (4 output channels) for the 4-tile width.
L4LoopDz_TILE_4:
mov x11, x1
mov x13, x3
SET_BIAS v8, v9, v10, v11
L4LoopSz_TILE_4:
ld1 {v3.16b}, [x12] // weight
ld1 {v0.16b}, [x11], x22 // src
subs x13, x13, #1
add x12, x12, #32 // weight offset = lp*hp
.inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
.inst 0x4fa0e069 // sdot v9.4s, v3.16b, v0.4b[1]
.inst 0x4f80e86a // sdot v10.4s, v3.16b, v0.4b[2]
.inst 0x4fa0e86b // sdot v11.4s, v3.16b, v0.4b[3]
bne L4LoopSz_TILE_4
L4LoopSzEnd_TILE_4:
L4Tile4Quan:
ld1 {v0.4s}, [x19], #16 // scale
ld1 {v2.4s}, [x25] // x kernel sum
ld1 {v24.4s}, [x6], #16 // weight quan zeropoint
Int32ToFloat v8, v9, v10, v11
MUL_SCALE v0, v8, v9, v10, v11
cbz x24, TILE4_L4_MLA
ld1 {v0.4s}, [x24]
MUL_EXTRA_SCALE v0, v8, v9, v10, v11
TILE4_L4_MLA:
MLA_WEIGHTZERO v8, v2, v24, 0 // tile:0, oc:0-3
MLA_WEIGHTZERO v9, v2, v24, 1 // tile:1, oc:0-3
MLA_WEIGHTZERO v10, v2, v24, 2 // tile:2, oc:0-3
MLA_WEIGHTZERO v11, v2, v24, 3 // tile:3, oc:0-3
cmp w28, #1
beq L4Tile4QuanUseInt8
cbz x9, TILE4_L4_ADD_DSTV
TILE4_L4_ADD_BIAS:
ld1 {v3.4s}, [x20], #16
ADD_BIAS_FLOAT v8, v9, v10, v11, v3
b TILE4_L4_POST
TILE4_L4_ADD_DSTV:
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10]
ADD_FLOAT v8, v9, v10, v11, v12, v13, v14, v15
TILE4_L4_POST:
cbz x23, TILE4_L4_STORE
ld1r {v26.4s}, [x23], #4 // f32 min
ld1r {v27.4s}, [x23] // f32 max
ReLU_FP32 v8, v9, v10, v11, v26, v27
sub x23, x23, #4
TILE4_L4_STORE:
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], x4
b Tile4End
L4Tile4QuanUseInt8:
ld1r {v7.4s}, [x23], #4 // int8 max
ld1r {v6.4s}, [x23] // int8 min
ld1 {v3.4s}, [x20], #16
dup v7.16b, v7.b[0]
dup v6.16b, v6.b[0]
ADD_BIAS_FLOAT v8, v9, v10, v11, v3
sub x23, x23, #4
FloatToInt32 v8, v9, v10, v11
Int32ToInt16 v8, v9, v10, v11, v0, v1
Int16ToInt8_ONE v0, v1, v16
smax v16.16b, v6.16b, v16.16b
smin v16.16b, v7.16b, v16.16b
st1 {v16.16b}, [x10], x4
Tile4End:
// Consume 4 tiles: advance extraScale (4 floats), dst (4 * pack-stride),
// src (4 tiles * 4 bytes) and kernel-sum pointers.
cbz x24, Tile4_End_Offset
add x24, x24, #16
Tile4_End_Offset:
sub x7, x7, #4
add x0, x0, x21, LSL #2
add x1, x1, #16
add x25, x25, #16
// ---------------------------------------------------------------------------
// TILE_1: scalar-column loop for the remaining 1..3 tiles, one tile per
// pass (loops via the bne at the end). Jumps to End when x7 reaches 0.
// ---------------------------------------------------------------------------
TILE_1:
cbz x7, End
mov x10, x0
mov x12, x2
mov x14, x5
mov x19, x8
mov x20, x9
mov x6, x26 // weightQuantBias
cmp x5, #2
blt L4LoopDz_TILE_1
L8LoopDz_TILE_1:
mov x11, x1
mov x13, x3
mov x27, x12
movi v8.16b, #0   // accumulator oc 0-3
movi v9.16b, #0   // accumulator oc 4-7
L8LoopSz_TILE_1:
ld1 {v3.16b, v4.16b}, [x12], #32 // weight
ld1 {v0.s}[0], [x11], x22 // src
.inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
subs x13, x13, #1
.inst 0x4f80e089 // sdot v9.4s, v4.16b, v0.4b[0]
bne L8LoopSz_TILE_1
L8LoopSzEnd_TILE_1:
add x12, x27, x15
sub x14, x14, #2
L8Tile1Quan:
ld1 {v0.4s, v1.4s}, [x19], #32 // scale
ld1 {v2.s}[0], [x25] // x kernel sum
ld1 {v24.4s, v25.4s}, [x6], #32 // weight quan zeropoint
scvtf v8.4s, v8.4s
scvtf v9.4s, v9.4s
fmul v8.4s, v8.4s, v0.4s
fmul v9.4s, v9.4s, v1.4s
cbz x24, TILE1_L8_MLA
ld1 {v0.s}[0], [x24]
fmul v8.4s, v8.4s, v0.s[0]
fmul v9.4s, v9.4s, v0.s[0]
TILE1_L8_MLA:
MLA_WEIGHTZERO v8, v2, v24, 0 // tile:0, oc:0-3
MLA_WEIGHTZERO v9, v2, v25, 0 // tile:0, oc:4-7
cmp w28, #1
beq L8Tile1QuanUseInt8
cbz x9, TILE1_ADD_DSTV
TILE1_ADD_BIAS:
ld1 {v10.4s, v11.4s}, [x20], #32
fadd v8.4s, v8.4s, v10.4s
fadd v9.4s, v9.4s, v11.4s
b TILE1_POST
TILE1_ADD_DSTV:
ld1 {v10.4s}, [x10], x4
ld1 {v11.4s}, [x10]
sub x10, x10, x4   // rewind dst cursor back to row start
fadd v8.4s, v8.4s, v10.4s
fadd v9.4s, v9.4s, v11.4s
TILE1_POST:
cbz x23, TILE1_STORE
ld1r {v26.4s}, [x23], #4 // f32 min
ld1r {v27.4s}, [x23] // f32 max
sub x23, x23, #4
fmin v8.4s, v8.4s, v27.4s
fmin v9.4s, v9.4s, v27.4s
fmax v8.4s, v8.4s, v26.4s
fmax v9.4s, v9.4s, v26.4s
TILE1_STORE:
st1 {v8.4s}, [x10], x4
st1 {v9.4s}, [x10], x4
b L8Tile1LoopCheck
L8Tile1QuanUseInt8:
ld1r {v7.4s}, [x23], #4 // int8 max
ld1r {v6.4s}, [x23] // int8 min
ld1 {v10.4s, v11.4s}, [x20], #32
dup v7.16b, v7.b[0]
dup v6.16b, v6.b[0]
fadd v8.4s, v8.4s, v10.4s
fadd v9.4s, v9.4s, v11.4s
sub x23, x23, #4
fcvtas v8.4s, v8.4s
fcvtas v9.4s, v9.4s
sqxtn v0.4h, v8.4s
sqxtn2 v0.8h, v9.4s
sqxtn v16.8b, v0.8h   // v16.b[0-3]=oc0-3, v16.b[4-7]=oc4-7
smax v16.16b, v6.16b, v16.16b
smin v16.16b, v7.16b, v16.16b
st1 {v16.s}[0], [x10], x4
st1 {v16.s}[1], [x10], x4
L8Tile1LoopCheck:
cmp x14, #1
bgt L8LoopDz_TILE_1
cbz x14, Tile1End
// Tail: one remaining dz (4 output channels) for a single tile.
L4LoopDz_TILE_1:
mov x11, x1
mov x13, x3
movi v8.16b, #0
L4LoopSz_TILE_1:
ld1 {v3.16b}, [x12] // weight
ld1 {v0.s}[0], [x11], x22 // src
subs x13, x13, #1
add x12, x12, #32 // weight offset = lp*hp
.inst 0x4f80e068 // sdot v8.4s, v3.16b, v0.4b[0]
bne L4LoopSz_TILE_1
L4LoopSzEnd_TILE_1:
L4Tile1Quan:
ld1 {v0.4s}, [x19], #16 // scale
ld1 {v2.s}[0], [x25] // x kernel sum
ld1 {v24.4s}, [x6], #16 // weight quan zeropoint
scvtf v8.4s, v8.4s
fmul v8.4s, v8.4s, v0.4s
cbz x24, TILE1_L4_MLA
ld1 {v0.s}[0], [x24]
fmul v8.4s, v8.4s, v0.s[0]
TILE1_L4_MLA:
MLA_WEIGHTZERO v8, v2, v24, 0 // tile:0, oc:0-3
cmp w28, #1
beq L4Tile1QuanUseInt8
cbz x9, TILE1_L4_ADD_DSTV
TILE1_L4_ADD_BIAS:
ld1 {v4.4s}, [x20], #16
fadd v8.4s, v8.4s, v4.4s
b TILE1_L4_POST
TILE1_L4_ADD_DSTV:
ld1 {v4.4s}, [x10]
fadd v8.4s, v8.4s, v4.4s
TILE1_L4_POST:
cbz x23, TILE1_L4_STORE
ld1r {v26.4s}, [x23], #4 // f32 min
ld1r {v27.4s}, [x23] // f32 max
sub x23, x23, #4
fmax v8.4s, v8.4s, v26.4s
fmin v8.4s, v8.4s, v27.4s
TILE1_L4_STORE:
st1 {v8.4s}, [x10], x4
b Tile1End
L4Tile1QuanUseInt8:
ld1r {v7.4s}, [x23], #4 // int8 max
ld1r {v6.4s}, [x23] // int8 min
ld1 {v4.4s}, [x20], #16
fadd v8.4s, v8.4s, v4.4s
sub x23, x23, #4
dup v7.16b, v7.b[0]
dup v6.16b, v6.b[0]
fcvtas v8.4s, v8.4s
sqxtn v0.4h, v8.4s
sqxtn v16.8b, v0.8h
smax v16.8b, v6.8b, v16.8b
smin v16.8b, v7.8b, v16.8b
st1 {v16.s}[0], [x10], x4
Tile1End:
// Consume one tile: advance extraScale (1 float), dst (pack-stride),
// src (4 bytes) and kernel-sum pointer; loop while tiles remain.
cbz x24, Tile1_End_Offset
add x24, x24, #4
Tile1_End_Offset:
subs x7, x7, #1
add x0, x0, x21
add x1, x1, #4
add x25, x25, #4
bne TILE_1
End:
// Restore callee-saved registers in reverse order of the prologue and
// release the 144-byte frame.
ldp x23, x24, [sp, #(16 * 8)]
ldp x25, x26, [sp, #(16 * 7)]
ldp x27, x28, [sp, #(16 * 6)]
ldp x19, x20, [sp, #(16 * 5)]
ldp x21, x22, [sp, #(16 * 4)]
ldp d8, d9, [sp, #(16 * 3)]
ldp d10, d11, [sp, #(16 * 2)]
ldp d12, d13, [sp, #(16 * 1)]
ldp d14, d15, [sp], #(16 * 9)
ret
#endif // __aarch64__