mirror of https://github.com/alibaba/MNN.git
MNN:Sync: Sync Internal 2.9.6
parent f830294eef
commit 860fceb3ab
@@ -25,6 +25,14 @@ set(CMAKE_MODULE_PATH
   ${CMAKE_MODULE_PATH}
   "${CMAKE_CURRENT_LIST_DIR}/cmake"
 )
+
+if(WIN32)
+  if(NOT MSVC)
+    set(CMAKE_MSVC_RUNTIME_LIBRARY "")
+    set(MSVC_RUNTIME_LIBRARY "")
+  endif()
+endif()
+
 # build options
 option(MNN_USE_SYSTEM_LIB "For opencl and vulkan, use system lib or use dlopen" OFF)
 option(MNN_BUILD_HARD "Build -mfloat-abi=hard or not" OFF)
@@ -198,7 +206,7 @@ option(MNN_METAL "Enable Metal" OFF)
 option(MNN_OPENCL "Enable OpenCL" OFF)
 option(MNN_OPENGL "Enable OpenGL" OFF)
 option(MNN_VULKAN "Enable Vulkan" OFF)
-option(MNN_ARM82 "Enable ARM82" OFF)
+option(MNN_ARM82 "Enable ARMv8.2's FP16 Compute" ON)
 option(MNN_ONEDNN "Enable oneDNN" OFF)
 option(MNN_AVX512 "Enable AVX512" OFF)
 option(MNN_CUDA "Enable CUDA" OFF)
@@ -452,6 +460,12 @@ set(MNN_EXTRA_DEPENDS "")
 # Add Thread dependency
 find_package(Threads)
 list(APPEND MNN_EXTRA_DEPENDS ${CMAKE_THREAD_LIBS_INIT})
+if(WIN32)
+  if(NOT MSVC)
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=lld-link -lmsvcrt")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=lld-link -lmsvcrt")
+  endif()
+endif()
 
 if (NOT APPLE)
     if(MNN_OPENMP)
MNN.sln (deleted)
@@ -1,36 +0,0 @@
-
-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio Version 17
-VisualStudioVersion = 17.5.002.0
-MinimumVisualStudioVersion = 10.0.40219.1
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "3rd_party", "3rd_party", "{5CD18987-C4CA-49D5-942F-14B15F46B1ED}"
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "flatbuffers", "flatbuffers", "{89B04BB7-86CB-4D4F-B65C-C3D0995DBD36}"
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{797AC14A-1653-469D-A240-76EF0F36E60A}"
-EndProject
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FlatBuffers.Test", "3rd_party\flatbuffers\tests\FlatBuffers.Test\FlatBuffers.Test.csproj", "{E5A80CC7-62B1-4887-B637-455F34CCC9B3}"
-EndProject
-Global
-    GlobalSection(SolutionConfigurationPlatforms) = preSolution
-        Debug|Any CPU = Debug|Any CPU
-        Release|Any CPU = Release|Any CPU
-    EndGlobalSection
-    GlobalSection(ProjectConfigurationPlatforms) = postSolution
-        {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-        {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Debug|Any CPU.Build.0 = Debug|Any CPU
-        {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Release|Any CPU.ActiveCfg = Release|Any CPU
-        {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Release|Any CPU.Build.0 = Release|Any CPU
-    EndGlobalSection
-    GlobalSection(SolutionProperties) = preSolution
-        HideSolutionNode = FALSE
-    EndGlobalSection
-    GlobalSection(NestedProjects) = preSolution
-        {89B04BB7-86CB-4D4F-B65C-C3D0995DBD36} = {5CD18987-C4CA-49D5-942F-14B15F46B1ED}
-        {797AC14A-1653-469D-A240-76EF0F36E60A} = {89B04BB7-86CB-4D4F-B65C-C3D0995DBD36}
-        {E5A80CC7-62B1-4887-B637-455F34CCC9B3} = {797AC14A-1653-469D-A240-76EF0F36E60A}
-    EndGlobalSection
-    GlobalSection(ExtensibilityGlobals) = postSolution
-        SolutionGuid = {11D826E1-518B-4BC2-8E45-03F5F48170D6}
-    EndGlobalSection
-EndGlobal
@@ -1,77 +0,0 @@
-//
-//  NEON_MNNConvRunForUnitDepthWise_BF16.S
-//  MNN
-//
-//  Created by MNN on 2021/03/09.
-//  Copyright © 2018-2021 Alibaba Group Holding Limited
-//
-
-#ifdef __arm__
-#ifndef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function NEON_MNNConvRunForUnitDepthWise_BF16
-//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
-
-//Auto: r0:dst, r1:src, r2:weight, r3:fw
-
-push {r4-r8, lr}
-
-//Load from sp:
-//r5:fh, r6:weight_y_step, r7:dilate_x_step, r8:dilate_y_step
-mov r4, r3
-ldr r5, [sp, #24]
-ldr r6, [sp, #28]
-ldr r7, [sp, #32]
-ldr r8, [sp, #36]
-
-cmp r4, #0
-vmov.i32 q0, #0
-beq UnitEnd
-cmp r5, #0
-beq UnitEnd
-
-mov lr, #2
-mul r6, lr, r6 // x6(weight_y_step in byte) = sizeof(int16_t) * weight_y_step
-mul r7, lr, r7 // x7(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step
-mul r8, lr, r8 // x8(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step
-
-//dilate_y_step -> dilate_y_step - dilate_x_step*fw
-mul lr, r4, r7
-sub r8, r8, lr
-
-//weight_y_step -> weight_y_step - 4*sizeof(float)*fw
-mov lr, #8
-mul lr, r4, lr
-sub r6, r6, lr
-
-
-UnitLoopH:
-mov lr, r4
-UnitLoopW:
-vld1.16 {d2}, [r1], r7
-vld1.16 {d4}, [r2]!
-vshll.s16 q1, d2, #16
-vshll.s16 q2, d4, #16
-
-vmla.f32 q0, q1, q2
-subs lr, lr, #1
-bne UnitLoopW
-subs r5, r5, #1
-add r1, r1, r8
-add r2, r2, r6
-bne UnitLoopH
-
-
-UnitEnd:
-vshrn.i32 d0, q0, #16
-vst1.16 {d0}, [r0]
-
-pop {r4-r8, pc}
-
-#endif
-#endif
@@ -1,66 +0,0 @@
-//
-//  NEON_MNNConvRunForUnitDepthWise_BF16.S
-//  MNN
-//
-//  Created by MNN on 2021/03/09.
-//  Copyright © 2018-2021 Alibaba Group Holding Limited
-//
-
-#ifdef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function NEON_MNNConvRunForUnitDepthWise_BF16
-//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
-
-//Auto: x0:dst, x1:src, x2:weight, x3:fw
-//x4:fh, x5:weight_y_step, x6:dilate_x_step, x7:dilate_y_step
-
-cmp x3, #0
-movi v0.4s, #0
-beq UnitEnd
-cmp x4, #0
-beq UnitEnd
-
-mov x9, #2
-mul x5, x9, x5 // x5(weight_y_step in byte) = sizeof(int16_t) * weight_y_step
-mul x6, x9, x6 // x6(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step
-mul x7, x9, x7 // x7(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step
-
-//dilate_y_step -> dilate_y_step - dilate_x_step*fw
-mul x9, x3, x6
-sub x7, x7, x9 // because x1 has already been auto-increased at 'ld1 {v1.4h}, [x1], x6', here we should rewind by x6*fw
-
-//weight_y_step -> weight_y_step - 4*sizeof(int16_t)*fw
-mov x9, #8
-mul x9, x3, x9
-sub x5, x5, x9
-
-
-UnitLoopH:
-mov x9, x3
-UnitLoopW:
-ld1 {v1.4h}, [x1], x6
-ld1 {v2.4h}, [x2], #8 // 4 * sizeof(int16_t)
-shll v1.4s, v1.4h, #16
-shll v2.4s, v2.4h, #16
-
-fmla v0.4s, v1.4s, v2.4s
-subs x9, x9, #1
-bne UnitLoopW
-subs x4, x4, #1
-add x1, x1, x7
-add x2, x2, x5
-bne UnitLoopH
-
-
-UnitEnd:
-shrn v0.4h, v0.4s, #16
-st1 {v0.4h}, [x0]
-
-ret
-
-#endif
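Both deleted kernels rely on the bf16 trick visible in the shll/shrn instructions above: a bf16 value is exactly the upper 16 bits of an IEEE fp32 value, so widening is a 16-bit left shift and narrowing is a truncating 16-bit right shift. A scalar C++ sketch of the same conversion pair (illustration only, not part of this commit):

```cpp
#include <cstdint>
#include <cstring>

// Widen bf16 -> fp32: place the 16 payload bits in the high half (shll #16).
static inline float bf16_to_fp32(uint16_t h) {
    uint32_t u = static_cast<uint32_t>(h) << 16;
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}

// Narrow fp32 -> bf16 by truncation, matching vshrn/shrn #16 in the
// deleted assembly (no rounding).
static inline uint16_t fp32_to_bf16(float f) {
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    return static_cast<uint16_t>(u >> 16);
}
```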
@@ -76,23 +76,6 @@ static void _MNNLowpToFp32(const int16_t* src, float* dst, size_t size) {
         ::memcpy(dst, dstTemp, sizeRemain * sizeof(float));
     }
 }
-static void MNNConvRunForUnitDepthWiseBF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                           size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
-    int fx, fy;
-    BFVec4 dstValue(0.0f);
-    const int16_t* src_z    = (const int16_t*)src;
-    const int16_t* weight_z = (const int16_t*)weight;
-    for (fy = 0; fy < fh; ++fy) {
-        const auto src_y    = src_z + fy * dilateY_step;
-        const auto weight_y = weight_z + fy * weight_y_step;
-        for (fx = 0; fx < fw; ++fx) {
-            const auto weight_x = weight_y + 4 * fx;
-            const auto src_x    = src_y + fx * dilateX_step;
-            dstValue = dstValue + BFVec4::load(src_x) * BFVec4::load(weight_x);
-        }
-    }
-    BFVec4::save((int16_t*)dst, dstValue);
-}
-
 static void MNNConvRunForLineDepthwiseBF16(float* dstO, const float* srcO, const float* weightO, size_t width, size_t src_w_setup,
                                            size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
@@ -823,7 +806,6 @@ static CoreFunctions* gInstance = nullptr;
 bool BF16Functions::init() {
     gInstance = new CoreFunctions;
     gInstance->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwiseBF16;
-    gInstance->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWiseBF16;
     gInstance->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnitBF16;
     gInstance->MNNFp32ToLowp = _MNNFp32ToLowp;
     gInstance->MNNLowpToFp32 = _MNNLowpToFp32;
@@ -890,7 +872,6 @@ bool BF16Functions::init() {
     gInstance->MNNPackedMatMul = NEON_MNNPackedMatMul_BF16;
     gInstance->MNNPackedMatMulRemain = NEON_MNNPackedMatMulRemain_BF16;
     gInstance->MNNConvRunForLineDepthwise = NEON_MNNConvRunForLineDepthwise_BF16;
-    gInstance->MNNConvRunForUnitDepthWise = NEON_MNNConvRunForUnitDepthWise_BF16;
     gInstance->MNNAxByClampBroadcastUnit = NEON_MNNAxByClampBroadcastC4_BF16;
 #ifdef __aarch64__
     cpuinfo_arm_isa gCPUInfo;
@@ -38,7 +38,7 @@ MNN uses CMake to build the project; the CMake macro definitions are listed below:
 | MNN_OPENCL | Whether to build the `OpenCL` backend, defaults to `OFF` |
 | MNN_OPENGL | Whether to build the `OpenGL` backend, defaults to `OFF` |
 | MNN_VULKAN | Whether to build the `Vulkan` backend, defaults to `OFF` |
-| MNN_ARM82 | Whether to build the `Armv8.2` backend, defaults to `OFF` |
+| MNN_ARM82 | When compiling for ARM, whether to build the `Armv8.2` backend for FP16 compute, defaults to `ON` |
 | MNN_ONEDNN | Whether to use `oneDNN`, defaults to `OFF` |
 | MNN_AVX512 | Whether to build the `avx512` backend, defaults to `OFF` |
 | MNN_CUDA | Whether to build the `Cuda` backend, defaults to `OFF` |
@@ -22,37 +22,45 @@
 ```bash
 mkdir build && cd build && cmake .. && make -j8
 ```
-## Windows
+## Windows (non-ARM)
 - Requirements
   - Microsoft Visual Studio >= 2017
   - cmake >= 3.13
-  - powershell
   - Ninja
 - Build options
   - Same as `Linux/MacOS`
 - Steps
-  1. opencl/vulkan
-     - *(optional)* Download GPU Caps Viewer; you can use this tool to inspect the local device's details (opencl, opengl, vulkan, etc.)
-     - SDK and driver preparation
-       - [opencl sdk](https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases): add the opencl sdk directory to the AMDAPPSDKROOT environment variable
-       - [vulkan sdk](https://vulkan.lunarg.com/): add the vulkan sdk path to the VULKAN_SDK environment variable so cmake can find it
-       - [AMD opencl driver](https://www.amd.com/zh-hans/support)
-       - [NVIDIA opencl driver](https://developer.nvidia.com/opencl)
-       - [AMD vulkan driver](https://community.amd.com/community/gaming/blog/2016/02/16/radeon-gpus-are-ready-for-the-vulkan-graphics-api)
-  2. Build
   - 64-bit build: find vcvars64.bat (x64 Native Tools Command Prompt for VS 2017) in Settings and click it to open the VS environment for building x64 programs
   - 32-bit build: find vcvarsamd64_x86.bat (x64_x86 Cross Tools Command Prompt for VS 2017) in Settings and click it to open the VS environment for cross-compiling x86 programs
   - Run the following build commands in that environment:
 ```bash
 cd /path/to/MNN
 ./schema/generate.ps1 # optional
 mkdir build && cd build
 cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF
 ninja
 ```
 - To build the model converter, add -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON to the cmake command
 - To build MNN CUDA, MNN_WIN_RUNTIME_MT and MNN_BUILD_SHARED_LIBS must both be ON, plus -DMNN_CUDA=ON: cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON -DMNN_WIN_RUNTIME_MT=ON -DMNN_CUDA=ON
 - On Windows, use Interpreter::destroy, Tensor::destroy, Module::destroy, etc. to release MNN objects; do not call delete directly (direct delete breaks when -DMNN_WIN_RUNTIME_MT=ON)
+
+## Windows (ARM)
+- Requirements
+  - Microsoft Visual Studio >= 2017
+  - cmake >= 3.13
+  - Ninja
+  - Clang
+    - Clang installation reference: https://learn.microsoft.com/en-us/cpp/build/clang-support-msbuild?view=msvc-170#install-1
+- Build options
+  - Same as `Linux/MacOS`
+- Steps
+  - Open the VS ARM64 command-line tool
+  - Enter the MNN root directory
+  - mkdir build && cd build
+  - cmake .. -G Ninja -DCMAKE_C_COMPILER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\clang.exe" -DCMAKE_CXX_COMPILER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\clang++.exe" -DCMAKE_LINKER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\lld.exe" -DCMAKE_BUILD_TYPE=Release
+  - If your Visual Studio is installed at a different path, adjust the command accordingly
+  - ninja -j16
+
 ## Android
 - Requirements
   - cmake >= 3.10
@@ -39,8 +39,43 @@ MNN now also provides mnncompress, a TensorFlow/Pytorch-based model compression tool; see
 | ADMM | Weight quantization via the ADMM method |
 
 ## Special notes on parameters for multi-input models (MNN currently only supports multi-input models whose inputs are not images)
-| input_type | `str` | Type of the input data, "sequence" |
-| path | `str` | Directory holding the input data used to calibrate the feature quantization coefficients |; for example, the directory contains two input data sets, input_0 and input_1; the subdirectories input_0 and input_1 hold the model's input data plus an input.json file. input_0 and input_1 are input/output information folders that can be generated with scripts such as testMNNFromOnnx.py; see the correctness-check part of model conversion.
+| Parameter to set specially | Value |
+|--------------------|------|
+| input_type | `str`: type of the input data, "sequence" |
+| path | `str`: directory holding the input data used to calibrate the feature quantization coefficients |
+
+For example, with "path": "/home/data/inputs_dir/" in the quant.json file, you build two calibration data sets stored in the input_0 and input_1 subdirectories, i.e. "/home/data/inputs_dir/input_0" and "/home/data/inputs_dir/input_1". The GetMNNInfo tool reports the model's input and output names; say the model has three inputs data0, data1, data2 and two outputs out1, out2. Then input_0 and input_1 each contain six files: data0.txt, data1.txt, data2.txt, out1.txt, out2.txt, input.json. Five of the file names must match the model's input/output names, while the last file, input.json, describes the input names and their shapes:
+```json
+{
+    "inputs": [
+        {
+            "name": "data0",
+            "shape": [
+                2,
+                4,
+                64,
+                64
+            ]
+        },
+        {
+            "name": "data1",
+            "shape": [
+                1
+            ]
+        },
+        {
+            "name": "data2",
+            "shape": [
+                2,
+                512,
+                768
+            ]
+        }
+    ],
+    "outputs": [
+        "out1", "out2"
+    ]
+}
+```
 
 ## Using a quantized model
 Used the same way as a float model; inputs and outputs remain float
@@ -40,13 +40,16 @@ python llmexport.py \
 ├── llm.mnn
 ├── llm.mnn.json
 ├── llm.mnn.weight
-├── llm.onnx
+├── onnx/
+│   ├── llm.onnx
+│   ├── llm.onnx.data
 ├── llm_config.json
 └── tokenizer.txt
 ```
 
 ### Features
-- Supports exporting the model as an onnx or mnn model, via `--export onnx` or `--export mnn`
+- Convert the model to onnx first via `--export onnx`, then convert the onnx model to mnn with the ./MNNConvert tool: ./MNNConvert --modelFile ../transformers/llm/export/model/onnx/llm.onnx --MNNModel llm.mnn --keepInputFormat --weightQuantBits=4 -f ONNX --transformerFuse=1 --allowCustomOp
+- Faster route: convert directly to an mnn model via `--export mnn`. Note that you must either have pymnn installed or point at the MNNConvert tool with the --mnnconvert option; one of the two is required. If pymnn is not installed and --mnnconvert is not given, the llmexport.py script looks for MNNConvert under "../../../build/", and the MNNConvert binary must exist there.
 - Supports chat-testing the model; `--test $query` returns the llm's reply
 - By default onnx-slim is used to optimize the onnx model; skip this step with `--skip_slim`
 - Supports exporting after merging lora weights; specify the lora weight directory with `--lora_path`
@@ -32,80 +32,64 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig&
         ScheduleConfig sConfig;
         sConfig.type = type;
         type = Schedule::getApprociateType(sConfig);
-        auto creator = MNNGetExtraRuntimeCreator(type);
-        MNN_ASSERT(nullptr != creator);
-        Backend::Info info;
-        info.type = type;
-        info.mode = Backend::Info::DIRECT;
-        info.numThread = numberThread;
-        if(type == MNN_FORWARD_OPENCL || type == MNN_FORWARD_METAL) {
-            info.numThread = 4;
-        }
-        mAttr->firstType = type;
-        auto firstIter = mRuntimes.find(mAttr->firstType);
-        if (firstIter == mRuntimes.end()) {
-            info.user = (BackendConfig*)&config;
-            std::shared_ptr<Runtime> bn(creator->onCreate(info));
-            mRuntimes[mAttr->firstType] = bn;
-        } else {
-            firstIter->second->onReset(numberThread, &config, true);
-        }
-    } else {
-        auto creator = MNNGetExtraRuntimeCreator(type);
-        if (nullptr == creator) {
-            MNN_ERROR("Error to find creator of %d, set CPU default\n", type);
-            type = MNN_FORWARD_CPU;
-            creator = MNNGetExtraRuntimeCreator(type);
-        }
-        MNN_ASSERT(nullptr != creator);
-        Backend::Info info;
-        info.type = type;
-        mAttr->firstType = type;
-        auto firstIter = mRuntimes.find(mAttr->firstType);
-        if (firstIter == mRuntimes.end()) {
-            info.mode = Backend::Info::DIRECT;
-            info.numThread = numberThread;
-            info.user = (BackendConfig*)&config;
-            std::shared_ptr<Runtime> bn(creator->onCreate(info));
-            mRuntimes[mAttr->firstType] = bn;
-        } else {
-            firstIter->second->onReset(numberThread, &config, true);
-        }
     }
-    _refreshRuntime();
+    auto rt = _getOrCreateRuntime(type, &config, numberThread);
+    if (rt == nullptr) {
+        type = MNN_FORWARD_CPU;
+        numberThread = 1;
+        rt = _getOrCreateRuntime(type, &config, numberThread);
+    }
+    MNN_ASSERT(nullptr != rt);
+    mAttr->firstType = type;
 }
 
 int Executor::getCurrentRuntimeStatus(RuntimeStatus statusEnum) {
-    return mRuntimes[mAttr->firstType]->onGetRuntimeStatus(statusEnum);
+    return mRuntimeInfo.first[mAttr->firstType]->onGetRuntimeStatus(statusEnum);
+}
+std::shared_ptr<Runtime> Executor::_getOrCreateRuntime(MNNForwardType type, const BackendConfig* config, int numberThread, bool reset) {
+    auto iter = mRuntimeInfo.first.find(type);
+    if (iter != mRuntimeInfo.first.end()) {
+        iter->second->onReset(numberThread, config, reset);
+        return iter->second;
+    }
+    // Create Backend
+    auto cre = MNNGetExtraRuntimeCreator(type);
+    if (nullptr == cre) {
+        return nullptr;
+    }
+    Backend::Info info;
+    info.type = type;
+    info.mode = Backend::Info::DIRECT;
+    info.numThread = numberThread;
+    info.user = (BackendConfig*)config;
+    std::shared_ptr<Runtime> rt(cre->onCreate(info));
+    if (nullptr != rt) {
+        mRuntimeInfo.first.insert(std::make_pair(type, rt));
+    }
+    return rt;
 }
 
 void Executor::gc(GCFlag flag) {
     int level = flag == FULL ? 100 : 0;
-    for (auto& iter : mRuntimes) {
+    for (auto& iter : mRuntimeInfo.first) {
         iter.second->onGabageCollect(level);
     }
 }
 
-Executor::Executor(std::shared_ptr<Runtime> backend, MNNForwardType type, int numberThread) {
-    mRuntimes.insert(std::make_pair(type, backend));
+Executor::Executor(std::shared_ptr<Runtime> runtime, MNNForwardType type, int numberThread) {
+    mRuntimeInfo.first.insert(std::make_pair(type, runtime));
     mAttr.reset(new ExecutorAttr);
     mAttr->firstType = type;
-    if (MNN_FORWARD_CPU != type) {
-        // Create Backup Backend
-        Backend::Info info;
-        info.type = MNN_FORWARD_CPU;
-        auto cre = MNNGetExtraRuntimeCreator(MNN_FORWARD_CPU);
-        info.mode = Backend::Info::DIRECT;
-        info.numThread = 1;
-        std::shared_ptr<Runtime> backupRt(cre->onCreate(info));
-        mRuntimes.insert(std::make_pair(DEFAULT_BACKUP_RUNTIME_KEY, backupRt));
+    if (type == MNN_FORWARD_CPU) {
+        mRuntimeInfo.second = runtime;
+    } else {
+        mRuntimeInfo.second = _getOrCreateRuntime(MNN_FORWARD_CPU, nullptr, 1);
     }
     mDebug.reset(new DebugTools);
     BackendConfig defaultConfig;
     defaultConfig.flags = 4;
-    std::shared_ptr<Backend> defaultBackend(mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY]->onCreate(&defaultConfig));
+    std::shared_ptr<Backend> defaultBackend(mRuntimeInfo.second->onCreate(&defaultConfig));
     mAttr->constantBackend = defaultBackend;
-    _refreshRuntime();
 }
 Executor::~Executor(){
     // Do nothing
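A hedged usage sketch of the consolidated path (assumed call site, not part of this commit): after the refactor every runtime request funnels through `_getOrCreateRuntime`, and a backend whose creator is unavailable now degrades to single-threaded CPU instead of asserting:

```cpp
#include <MNN/expr/Executor.hpp>

int main() {
    MNN::BackendConfig config;
    config.precision = MNN::BackendConfig::Precision_Low;
    // If no OpenCL runtime creator is registered, the executor falls back
    // to MNN_FORWARD_CPU with numberThread = 1 (see the nullptr check above).
    MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(
        MNN_FORWARD_OPENCL, config, 4);
    return 0;
}
```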
@@ -176,21 +160,6 @@ std::shared_ptr<Executor> Executor::newExecutor(MNNForwardType type,
     auto executor = new Executor(runtime, type, numberThread);
     return std::shared_ptr<Executor>(executor);
 }
-void Executor::_refreshRuntime() {
-    mRuntimeInfo.first.clear();
-    mRuntimeInfo.second = mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY];
-    auto firstIter = mRuntimes.find(getAttr()->firstType);
-    if (firstIter != mRuntimes.end()) {
-        mRuntimeInfo.first.insert(std::make_pair(firstIter->first, firstIter->second));
-    } else {
-        MNN_ASSERT(false);
-    }
-    for (auto& iter : mRuntimes) {
-        if (iter.first != getAttr()->firstType) {
-            mRuntimeInfo.first.insert(std::make_pair(iter.first, iter.second));
-        }
-    }
-}
-
 RuntimeInfo Executor::getRuntime() {
     auto glo = ExecutorScope::Current();
@@ -297,43 +266,26 @@ Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const S
     auto res = new RuntimeManager;
     auto glo = ExecutorScope::Current();
     std::lock_guard<std::mutex> _l(glo->mMutex);
-    auto& originRt = glo->mRuntimes;
-    Backend::Info compute;
-    compute.type      = Schedule::getApprociateType(config);
-    compute.numThread = config.numThread;
+    auto& originRt = glo->mRuntimeInfo;
+    auto type = Schedule::getApprociateType(config);
+    int numThread = config.numThread;
     if(config.type == MNN_FORWARD_AUTO) {
-        if(compute.type == MNN_FORWARD_OPENCL || compute.type == MNN_FORWARD_METAL) {
+        if(type == MNN_FORWARD_OPENCL || type == MNN_FORWARD_METAL) {
             // AUTO set default gpu-mode MNN_GPU_TUNING_FAST
-            compute.numThread = 16;
+            numThread = 16;
         }
     }
-    compute.user = config.backendConfig;
-    auto iter = originRt.find(compute.type);
-    if (iter == originRt.end()) {
-        auto creator = MNNGetExtraRuntimeCreator(compute.type);
-        if (nullptr == creator) {
-            return nullptr;
-        }
-        auto newBn = creator->onCreate(compute);
-        if (nullptr == newBn) {
-            MNN_ERROR("Can't create Runtime: %s\n", EnumNameForwardType((ForwardType)compute.type));
-            return nullptr;
-        }
-        originRt.insert(std::make_pair(compute.type, std::shared_ptr<Runtime>(newBn)));
-    } else {
-        iter->second->onReset(compute.numThread, compute.user, false);
-    }
-    res->mInside->mRuntime.second = originRt[DEFAULT_BACKUP_RUNTIME_KEY];
-    res->mInside->mRuntime.first.insert(std::make_pair(compute.type, originRt[compute.type]));
-    res->mInside->mInfo = originRt[compute.type];
-    res->mInside->mNumberThread = compute.numThread;
+    auto rt = glo->_getOrCreateRuntime(type, config.backendConfig, numThread, false);
+    res->mInside->mRuntime.second = originRt.second;
+    res->mInside->mRuntime.first.insert(std::make_pair(type, rt));
+    res->mInside->mInfo = rt;
+    res->mInside->mNumberThread = numThread;
     if (nullptr != config.backendConfig) {
         res->mInside->mConfig = *config.backendConfig;
         res->mInside->mUserConfig = true;
     } else {
         res->mInside->mUserConfig = false;
     }
-    glo->_refreshRuntime();
     return res;
 }
 ExecutorAttr* Executor::getAttr() const {
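The public entry point is unchanged by this refactor; a sketch of creating a manager under MNN_FORWARD_AUTO (illustrative values, not from this commit):

```cpp
#include <MNN/expr/Executor.hpp>
#include <memory>

std::shared_ptr<MNN::Express::Executor::RuntimeManager> makeManager() {
    MNN::ScheduleConfig config;
    config.type = MNN_FORWARD_AUTO; // resolved via Schedule::getApprociateType
    config.numThread = 4;           // bumped to 16 internally if AUTO picks a GPU
    return std::shared_ptr<MNN::Express::Executor::RuntimeManager>(
        MNN::Express::Executor::RuntimeManager::createRuntimeManager(config));
}
```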
@@ -379,6 +379,9 @@ static Module* loadInternal(const std::vector<std::string>& inputs, const std::v
     if (net->extraInfo() && net->extraInfo()->version()) {
         info->version = net->extraInfo()->version()->str();
     }
+    if (net->bizCode()) {
+        info->bizCode = net->bizCode()->str();
+    }
     auto rtMgr = _rtMgr;
     Module::Config defaultConfig;
     if (nullptr == config) {
@@ -598,6 +598,7 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
     mSession->getInfo(Interpreter::FLOPS, &flops);
     glo->getDebugTools()->flops += flops;
 #endif
+
     return outputs;
 }
 
@@ -234,6 +234,8 @@ public:
     // size limit of kvcache in memory (for a single layer)
     // if the size of kvcache exceeds the limit, it will be moved to disk
     KVCACHE_SIZE_LIMIT = 8,
+    // Op encoder number for commit
+    OP_ENCODER_NUMBER_FOR_COMMIT = 9,
 };
 
 enum ExternalPathType {
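Assuming this enum is consumed through Interpreter::setSessionHint like the neighbouring hints (an assumption; the commit only shows the enum value), setting the new hint would look like:

```cpp
#include <MNN/Interpreter.hpp>

void tuneCommitGranularity(MNN::Interpreter* net) {
    // Illustrative value: commit after every 10 encoded ops.
    net->setSessionHint(MNN::Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT, 10);
}
```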
@@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
 #define STR(x) STR_IMP(x)
 #define MNN_VERSION_MAJOR 2
 #define MNN_VERSION_MINOR 9
-#define MNN_VERSION_PATCH 5
+#define MNN_VERSION_PATCH 6
 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
 #endif /* MNNDefine_h */
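A minimal compile-time guard against older headers, using only the macros defined above:

```cpp
#include <MNN/MNNDefine.h>
#include <cstdio>

int main() {
#if MNN_VERSION_MAJOR > 2 || (MNN_VERSION_MAJOR == 2 && (MNN_VERSION_MINOR > 9 || \
    (MNN_VERSION_MINOR == 9 && MNN_VERSION_PATCH >= 6)))
    std::printf("Built against MNN %s\n", MNN_VERSION); // MNN_VERSION is a string literal
#else
#error "MNN >= 2.9.6 required"
#endif
    return 0;
}
```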
@@ -138,12 +138,10 @@ public:
     };
     static bool getComputeInfo(EXPRP expr, Interpreter::SessionInfoCode code, void* ptr);
 private:
-    void _refreshRuntime();
+    std::shared_ptr<Runtime> _getOrCreateRuntime(MNNForwardType type, const BackendConfig* config, int numberThread, bool reset = true);
     Executor(std::shared_ptr<Runtime> backend, MNNForwardType type, int numberThread);
     void _makeCache(const std::vector<EXPRP>& outputs, bool forceCPU);
 
-    // TODO: Remove mRuntimes, only use mRuntimeInfo
-    std::map<MNNForwardType, std::shared_ptr<Runtime>> mRuntimes;
     RuntimeInfo mRuntimeInfo;
     std::shared_ptr<DebugTools> mDebug;
     std::map<std::string, std::shared_ptr<SubGraph>> mSubGraph;
@@ -53,7 +53,7 @@ public:
     MNNForwardType type = MNN_FORWARD_CPU;
     BackendConfig* config = nullptr;
 };
 
 struct Config {
     // Load module as dynamic, default static
     bool dynamic = false;
@@ -75,7 +75,7 @@ public:
     // Shared RuntimeManager
     static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, const std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Config* config = nullptr);
     static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Config* config = nullptr);
 
     static Module* extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph = {});
 
     static Module* clone(const Module* module, const bool shareParams = false);
@@ -93,6 +93,8 @@ public:
     std::vector<std::string> outputNames;
     // The MNNConvert's Version build the module
     std::string version;
+    // The bizCode of MNN model
+    std::string bizCode;
     };
     const Info* getInfo() const;
     class CloneContext {
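A sketch of reading the new field (hypothetical call site; the model path is a placeholder). `loadInternal` above fills `info->bizCode` from the flatbuffer, so it becomes visible through `getInfo()`:

```cpp
#include <MNN/expr/Module.hpp>
#include <MNN/MNNDefine.h>
#include <memory>

void printBizCode(const char* modelPath) {
    std::shared_ptr<MNN::Express::Module> net(
        MNN::Express::Module::load({}, {}, modelPath));
    if (net != nullptr && net->getInfo() != nullptr) {
        MNN_PRINT("bizCode: %s\n", net->getInfo()->bizCode.c_str());
    }
}
```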
@@ -158,8 +158,6 @@
 4896D37825FE2A6B00717702 /* MNNExpFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37025FE2A6A00717702 /* MNNExpFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
 4896D37925FE2A6B00717702 /* MNNPackedMatMulFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
 4896D37A25FE2A6B00717702 /* MNNPackedMatMulRemainFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
-4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
-4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
 4896D37E25FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
 4896D37F25FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
 489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A172550FDC800AD896A /* MetalReduction.hpp */; };
@@ -497,7 +495,6 @@
 92FF02F223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; };
 92FF02F423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; };
 92FF02F523AA0B5A00AC97F6 /* MNNInt8ScaleToFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */; };
-92FF02F623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */; };
 92FF02F723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; };
 92FF02F823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; };
 92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; };
@@ -542,7 +539,6 @@
 92FF033223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; };
 92FF033423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; };
 92FF033523AA0B5A00AC97F6 /* MNNInt8ScaleToFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */; };
-92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */; };
 92FF033723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; };
 92FF033823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; };
 92FF033923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; };
@@ -603,12 +599,10 @@
 92FF03A923AA0B5A00AC97F6 /* ConvolutionGroup.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */; };
 92FF03AA23AA0B5A00AC97F6 /* ConvolutionFloatFactory.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */; };
 92FF03AC23AA0B5A00AC97F6 /* ResizeFunction.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */; };
-92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */; };
 92FF03AE23AA0B5A00AC97F6 /* ConvolutionIntFactory.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */; };
 92FF03AF23AA0B5A00AC97F6 /* WinogradOptFunction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */; };
 92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */; };
 92FF03B123AA0B5A00AC97F6 /* ConvolutionFloatFactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */; };
-92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */; };
 92FF03B423AA0B5A00AC97F6 /* Convolution1x1Strassen.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */; };
 92FF03B523AA0B5A00AC97F6 /* ResizeFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */; };
 92FF03B623AA0B5A00AC97F6 /* StrassenMatmulComputor.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */; };
@@ -790,6 +784,8 @@
 CE072A262C91AF0700F190FD /* MNNC3ToYUVFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */; };
 CE072A272C91AF0700F190FD /* MNNC3ToC4Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */; };
 CE072A282C91AF0700F190FD /* MNNC3ToXYZFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */; };
+CE072A2A2CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */; };
+CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
 CE125CC82A52BF6B003698C9 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */; };
 CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */; };
 CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; };
@@ -1005,8 +1001,6 @@
 4896D37025FE2A6A00717702 /* MNNExpFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNExpFP16.S; path = ../../../arm82/asm/arm64/MNNExpFP16.S; sourceTree = "<group>"; };
 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulFP16.S; sourceTree = "<group>"; };
 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulRemainFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulRemainFP16.S; sourceTree = "<group>"; };
-4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23MulTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S; sourceTree = "<group>"; };
-4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23SourceTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S; sourceTree = "<group>"; };
 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = Arm82MNNPackForMatMul_A.S; path = ../../../arm82/asm/arm64/Arm82MNNPackForMatMul_A.S; sourceTree = "<group>"; };
 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvRunForLineDepthwiseFP16.S; path = ../../../arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S; sourceTree = "<group>"; };
 489D7A172550FDC800AD896A /* MetalReduction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalReduction.hpp; sourceTree = "<group>"; };
@@ -1353,7 +1347,6 @@
 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = "<group>"; };
 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = "<group>"; };
 92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNInt8ScaleToFloat.S; sourceTree = "<group>"; };
-92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForUnitDepthWise.S; sourceTree = "<group>"; };
 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = "<group>"; };
 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = "<group>"; };
 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = "<group>"; };
@@ -1398,7 +1391,6 @@
 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = "<group>"; };
 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = "<group>"; };
 92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNInt8ScaleToFloat.S; sourceTree = "<group>"; };
-92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForUnitDepthWise.S; sourceTree = "<group>"; };
 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = "<group>"; };
 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = "<group>"; };
 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = "<group>"; };
@ -1459,12 +1451,10 @@
|
||||||
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionGroup.hpp; sourceTree = "<group>"; };
|
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionGroup.hpp; sourceTree = "<group>"; };
|
||||||
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ConvolutionFloatFactory.h; sourceTree = "<group>"; };
|
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ConvolutionFloatFactory.h; sourceTree = "<group>"; };
|
||||||
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ResizeFunction.h; sourceTree = "<group>"; };
|
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ResizeFunction.h; sourceTree = "<group>"; };
|
||||||
92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionDepthwise3x3.cpp; sourceTree = "<group>"; };
|
|
||||||
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionIntFactory.hpp; sourceTree = "<group>"; };
|
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionIntFactory.hpp; sourceTree = "<group>"; };
|
||||||
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WinogradOptFunction.hpp; sourceTree = "<group>"; };
|
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WinogradOptFunction.hpp; sourceTree = "<group>"; };
|
||||||
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionGroup.cpp; sourceTree = "<group>"; };
|
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionGroup.cpp; sourceTree = "<group>"; };
|
||||||
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionFloatFactory.cpp; sourceTree = "<group>"; };
|
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionFloatFactory.cpp; sourceTree = "<group>"; };
|
||||||
92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionDepthwise3x3.hpp; sourceTree = "<group>"; };
|
|
||||||
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Convolution1x1Strassen.cpp; sourceTree = "<group>"; };
|
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Convolution1x1Strassen.cpp; sourceTree = "<group>"; };
|
||||||
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ResizeFunction.cpp; sourceTree = "<group>"; };
|
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ResizeFunction.cpp; sourceTree = "<group>"; };
|
||||||
92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = StrassenMatmulComputor.hpp; sourceTree = "<group>"; };
|
92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = StrassenMatmulComputor.hpp; sourceTree = "<group>"; };
|
||||||
|
@ -1647,6 +1637,8 @@
|
||||||
CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToYUVFast.S; path = arm/arm64/MNNC3ToYUVFast.S; sourceTree = "<group>"; };
|
CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToYUVFast.S; path = arm/arm64/MNNC3ToYUVFast.S; sourceTree = "<group>"; };
|
||||||
CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToC4Fast.S; path = arm/arm64/MNNC3ToC4Fast.S; sourceTree = "<group>"; };
|
CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToC4Fast.S; path = arm/arm64/MNNC3ToC4Fast.S; sourceTree = "<group>"; };
|
||||||
CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToXYZFast.S; path = arm/arm64/MNNC3ToXYZFast.S; sourceTree = "<group>"; };
|
CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToXYZFast.S; path = arm/arm64/MNNC3ToXYZFast.S; sourceTree = "<group>"; };
|
||||||
|
CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNDepthwiseConvFastKernel.S; sourceTree = "<group>"; };
|
||||||
|
CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNDepthwiseConvFastKernelFP16.S; path = ../../../arm82/asm/arm64/MNNDepthwiseConvFastKernelFP16.S; sourceTree = "<group>"; };
|
||||||
CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = "<group>"; };
|
CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = "<group>"; };
|
||||||
CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = "<group>"; };
|
CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = "<group>"; };
|
||||||
CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = "<group>"; };
|
CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = "<group>"; };
|
||||||
|
@ -2648,7 +2640,6 @@
|
||||||
92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */,
|
92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */,
|
||||||
92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */,
|
92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */,
|
||||||
92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */,
|
92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */,
|
||||||
92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */,
|
|
||||||
92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */,
|
92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */,
|
||||||
92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */,
|
92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */,
|
||||||
92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */,
|
92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */,
|
||||||
|
@ -2659,6 +2650,8 @@
|
||||||
92FF017C23AA0B4E00AC97F6 /* arm64 */ = {
|
92FF017C23AA0B4E00AC97F6 /* arm64 */ = {
|
||||||
isa = PBXGroup;
|
isa = PBXGroup;
|
||||||
children = (
|
children = (
|
||||||
|
CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */,
|
||||||
|
CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */,
|
||||||
95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */,
|
95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */,
|
||||||
95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */,
|
95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */,
|
||||||
4DDD8E0F2B1D70C1005065D1 /* MNNTranspose16Bit8x8.S */,
|
4DDD8E0F2B1D70C1005065D1 /* MNNTranspose16Bit8x8.S */,
|
||||||
|
@ -2688,8 +2681,6 @@
|
||||||
4D6D7FD02656891400F80814 /* MNNPackedSparseMatMulEpx4.S */,
|
4D6D7FD02656891400F80814 /* MNNPackedSparseMatMulEpx4.S */,
|
||||||
4D6D7FCE2656890C00F80814 /* MNNPackedSparseMatMulEpx1.S */,
|
4D6D7FCE2656890C00F80814 /* MNNPackedSparseMatMulEpx1.S */,
|
||||||
4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */,
|
4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */,
|
||||||
4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */,
|
|
||||||
4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */,
|
|
||||||
4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */,
|
4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */,
|
||||||
4896D37025FE2A6A00717702 /* MNNExpFP16.S */,
|
4896D37025FE2A6A00717702 /* MNNExpFP16.S */,
|
||||||
4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */,
|
4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */,
|
||||||
|
@ -2743,7 +2734,6 @@
|
||||||
92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */,
|
92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */,
|
||||||
92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */,
|
92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */,
|
||||||
92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */,
|
92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */,
|
||||||
92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */,
|
|
||||||
92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */,
|
92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */,
|
||||||
92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */,
|
92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */,
|
||||||
92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */,
|
92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */,
|
||||||
|
@ -2795,12 +2785,10 @@
|
||||||
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */,
|
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */,
|
||||||
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */,
|
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */,
|
||||||
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */,
|
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */,
|
||||||
92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */,
|
|
||||||
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */,
|
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */,
|
||||||
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */,
|
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */,
|
||||||
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */,
|
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */,
|
||||||
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */,
|
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */,
|
||||||
92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */,
|
|
||||||
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */,
|
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */,
|
||||||
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */,
|
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */,
|
||||||
92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */,
|
92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */,
|
||||||
|
@ -3036,7 +3024,6 @@
|
||||||
4D9A937526255BDA00F9B43C /* CoreMLCommonExecution.hpp in Headers */,
|
4D9A937526255BDA00F9B43C /* CoreMLCommonExecution.hpp in Headers */,
|
||||||
4DF87C522887D3F20003E2D4 /* CPUSvd.hpp in Headers */,
|
4DF87C522887D3F20003E2D4 /* CPUSvd.hpp in Headers */,
|
||||||
48747D4B245D9D24000B9709 /* RuntimeFactory.hpp in Headers */,
|
48747D4B245D9D24000B9709 /* RuntimeFactory.hpp in Headers */,
|
||||||
92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */,
|
|
||||||
CECF8C77299CAD9400D3875B /* log_builder.h in Headers */,
|
CECF8C77299CAD9400D3875B /* log_builder.h in Headers */,
|
||||||
4D9A937226255BDA00F9B43C /* CoreMLConvolution.hpp in Headers */,
|
4D9A937226255BDA00F9B43C /* CoreMLConvolution.hpp in Headers */,
|
||||||
92FF038B23AA0B5A00AC97F6 /* CPUUnravelIndex.hpp in Headers */,
|
92FF038B23AA0B5A00AC97F6 /* CPUUnravelIndex.hpp in Headers */,
|
||||||
|
@ -3394,14 +3381,12 @@
|
||||||
4A224A1627D0C56E000A9260 /* ConvolutionWinogradBridge.cpp in Sources */,
|
4A224A1627D0C56E000A9260 /* ConvolutionWinogradBridge.cpp in Sources */,
|
||||||
48747D6A245D9E33000B9709 /* GeometryStridedSlice.cpp in Sources */,
|
48747D6A245D9E33000B9709 /* GeometryStridedSlice.cpp in Sources */,
|
||||||
92FF04BE23AA0BFB00AC97F6 /* FileLoader.cpp in Sources */,
|
92FF04BE23AA0BFB00AC97F6 /* FileLoader.cpp in Sources */,
|
||||||
92FF02F623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */,
|
|
||||||
92FF042323AA0B7100AC97F6 /* ShapeScatterNd.cpp in Sources */,
|
92FF042323AA0B7100AC97F6 /* ShapeScatterNd.cpp in Sources */,
|
||||||
92FF045A23AA0B7100AC97F6 /* ShapeBinaryOp.cpp in Sources */,
|
92FF045A23AA0B7100AC97F6 /* ShapeBinaryOp.cpp in Sources */,
|
||||||
92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */,
|
92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */,
|
||||||
CE072A192C91AEE700F190FD /* MNNBGRToGRAY.S in Sources */,
|
CE072A192C91AEE700F190FD /* MNNBGRToGRAY.S in Sources */,
|
||||||
EBECA37B24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S in Sources */,
|
EBECA37B24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S in Sources */,
|
||||||
481C2DF525FE2CD6001ED6DF /* Arm82OptFunc.cpp in Sources */,
|
481C2DF525FE2CD6001ED6DF /* Arm82OptFunc.cpp in Sources */,
|
||||||
92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */,
|
|
||||||
4DF87C502887D3E40003E2D4 /* CPUSvd.cpp in Sources */,
|
4DF87C502887D3E40003E2D4 /* CPUSvd.cpp in Sources */,
|
||||||
92FF043523AA0B7100AC97F6 /* ShapeConvolution3D.cpp in Sources */,
|
92FF043523AA0B7100AC97F6 /* ShapeConvolution3D.cpp in Sources */,
|
||||||
92FF043923AA0B7100AC97F6 /* ShapeDequantize.cpp in Sources */,
|
92FF043923AA0B7100AC97F6 /* ShapeDequantize.cpp in Sources */,
|
||||||
|
@ -3483,6 +3468,7 @@
|
||||||
92FF043A23AA0B7100AC97F6 /* ShapePermute.cpp in Sources */,
|
92FF043A23AA0B7100AC97F6 /* ShapePermute.cpp in Sources */,
|
||||||
489D7A8E2550FDC900AD896A /* MetalPooling.mm in Sources */,
|
489D7A8E2550FDC900AD896A /* MetalPooling.mm in Sources */,
|
||||||
92FF030823AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */,
|
92FF030823AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */,
|
||||||
|
CE072A2A2CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S in Sources */,
|
||||||
4DDE2019263809920085AC8F /* CoreMLExecutorWrapper.mm in Sources */,
|
4DDE2019263809920085AC8F /* CoreMLExecutorWrapper.mm in Sources */,
|
||||||
EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */,
|
EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */,
|
||||||
4A224A1327D0C56E000A9260 /* ConvolutionWinogradImpl.cpp in Sources */,
|
4A224A1327D0C56E000A9260 /* ConvolutionWinogradImpl.cpp in Sources */,
|
||||||
|
@ -3592,7 +3578,6 @@
|
||||||
4819FB3A24C69E680050BD09 /* GeometryInnerProduct.cpp in Sources */,
|
4819FB3A24C69E680050BD09 /* GeometryInnerProduct.cpp in Sources */,
|
||||||
92FF037723AA0B5A00AC97F6 /* CPUConvolutionDepthwise.cpp in Sources */,
|
92FF037723AA0B5A00AC97F6 /* CPUConvolutionDepthwise.cpp in Sources */,
|
||||||
EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */,
|
EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */,
|
||||||
4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */,
|
|
||||||
92FF02DE23AA0B5A00AC97F6 /* MNNSamplerC4BilinearOpt.S in Sources */,
|
92FF02DE23AA0B5A00AC97F6 /* MNNSamplerC4BilinearOpt.S in Sources */,
|
||||||
48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */,
|
48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */,
|
||||||
92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */,
|
92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */,
|
||||||
|
@ -3711,6 +3696,7 @@
|
||||||
4D6D7FCB265688F600F80814 /* MNNPackedSparseMatMulEpx4.S in Sources */,
|
4D6D7FCB265688F600F80814 /* MNNPackedSparseMatMulEpx4.S in Sources */,
|
||||||
92FF042123AA0B7100AC97F6 /* ShapeDeconvolution.cpp in Sources */,
|
92FF042123AA0B7100AC97F6 /* ShapeDeconvolution.cpp in Sources */,
|
||||||
92FF027F23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.cpp in Sources */,
|
92FF027F23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.cpp in Sources */,
|
||||||
|
CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */,
|
||||||
EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */,
|
EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */,
|
||||||
92FF04A423AA0BFB00AC97F6 /* Interpreter.cpp in Sources */,
|
92FF04A423AA0BFB00AC97F6 /* Interpreter.cpp in Sources */,
|
||||||
CECF8C5C299CACFD00D3875B /* Log.cpp in Sources */,
|
CECF8C5C299CACFD00D3875B /* Log.cpp in Sources */,
|
||||||
|
@ -3771,7 +3757,6 @@
|
||||||
48887728215B639F0079B12E /* WingoradGenerater.cpp in Sources */,
|
48887728215B639F0079B12E /* WingoradGenerater.cpp in Sources */,
|
||||||
950B28F429F629A90002F454 /* CPUBinaryInt8.cpp in Sources */,
|
950B28F429F629A90002F454 /* CPUBinaryInt8.cpp in Sources */,
|
||||||
92FF045423AA0B7100AC97F6 /* ShapeRNNSequenceGRU.cpp in Sources */,
|
92FF045423AA0B7100AC97F6 /* ShapeRNNSequenceGRU.cpp in Sources */,
|
||||||
4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */,
|
|
||||||
EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */,
|
EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */,
|
||||||
CE072A1E2C91AEE700F190FD /* MNNRGBToBGR555.S in Sources */,
|
CE072A1E2C91AEE700F190FD /* MNNRGBToBGR555.S in Sources */,
|
||||||
48C84B87250F711700EE7666 /* WhileModule.cpp in Sources */,
|
48C84B87250F711700EE7666 /* WhileModule.cpp in Sources */,
|
||||||
|
@ -3800,7 +3785,6 @@
|
||||||
92FF041C23AA0B7100AC97F6 /* ShapeNonMaxSuppressionV2.cpp in Sources */,
|
92FF041C23AA0B7100AC97F6 /* ShapeNonMaxSuppressionV2.cpp in Sources */,
|
||||||
92FF02CE23AA0B5A00AC97F6 /* MNNPackC4.S in Sources */,
|
92FF02CE23AA0B5A00AC97F6 /* MNNPackC4.S in Sources */,
|
||||||
92FF037023AA0B5A00AC97F6 /* CPUPool.cpp in Sources */,
|
92FF037023AA0B5A00AC97F6 /* CPUPool.cpp in Sources */,
|
||||||
92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */,
|
|
||||||
92FF031723AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */,
|
92FF031723AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */,
|
||||||
4DD1793A2694076700B0098F /* MNNSoftmax.S in Sources */,
|
4DD1793A2694076700B0098F /* MNNSoftmax.S in Sources */,
|
||||||
CE072A1D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S in Sources */,
|
CE072A1D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S in Sources */,
|
||||||
|
|
|
@@ -13,7 +13,7 @@ def load_module_from_file(file_name, input_names, output_names, **kwargs):
    memory_mode = kwargs.get('memory_mode', _F.MemoryMode.Normal)
    power_mode = kwargs.get('power_mode', _F.PowerMode.Normal)
    precision_mode = kwargs.get('precision_mode', _F.PrecisionMode.Normal)
-    thread_num = kwargs.get('thread_num', 4)
+    thread_num = kwargs.get('thread_num', 1)

    module = _nn.load_module_from_file(runtime_manager, input_names, output_names, file_name, dynamic, shape_mutable, rearrange,
                                       backend, memory_mode, power_mode, precision_mode, thread_num)

@@ -59,4 +59,4 @@ class EmptyModule(_nn._Module):
        super(EmptyModule, self).__init__()
    def forward(self):
        return None
dummy = EmptyModule()
@@ -13,6 +13,8 @@ try:
except:
    mnn_logger = None

+def convert(args):
+    Tools.mnnconvert(args)

def parse_args():
    arg_dict = {}

@@ -28,13 +30,13 @@ def parse_args():
        if arg_value.startswith("--") or arg_value.startswith("-"):
            arg_value = True
        arg_dict[arg_name] = arg_value

    return arg_dict


def main():
    """ main funcion """
-    Tools.mnnconvert(sys.argv)
+    convert(sys.argv)

    arg_dict = parse_args()

@@ -52,7 +54,7 @@ def main():
        arg_dict.pop("MNNModel")
        log_dict["detail"] = {"args": arg_dict, "src_model_size": src_model_size, "dst_model_size": dst_model_size, "compress_rate": compress_rate}
        mnn_logger.put_log(log_dict, "convert")

    return 0

@@ -17,6 +17,7 @@ sys.argv = [sys.argv[0]] + unknown
IS_WINDOWS = (platform.system() == 'Windows')
IS_DARWIN = (platform.system() == 'Darwin')
IS_LINUX = (platform.system() == 'Linux')
+IS_ARM = ('arm' in platform.processor())
BUILD_DIR = 'pymnn_build' # avoid overwrite temporary product when build pymnn

USE_TRT = False

@@ -55,8 +56,8 @@ if len(sys.argv) > 1 and sys.argv[1] != None:
        USE_OPENMP = True
    if "llm" in sys.argv[1]:
        USE_LLM = True
-    if "arm82" in sys.argv[1]:
-        USE_ARM82 = True
+if IS_ARM: USE_ARM82 = True

print ("USE_INTERNAL:", USE_INTERNAL)
print ("USE_TRT:", USE_TRT)

@@ -69,7 +70,6 @@ print ("USE_RENDER:", USE_RENDER)
print ("USE_SSE:", USE_SSE)
print ("USE_OPENMP:", USE_OPENMP)
print ("USE_LLM:", USE_LLM)
-print ("USE_ARM82:", USE_ARM82)

def build_deps():
    """ build depency """

@@ -92,6 +92,9 @@ def build_deps():
    if USE_ARM82:
        extra_opts += ' -DMNN_ARM82=ON'
    extra_opts += ' -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON' if USE_OPENMP else ' -DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF'
+    if IS_DARWIN:
+        # Mac / iOS System use GCD instead of MNN's thread pool
+        extra_opts += ' -DMNN_USE_THREAD_POOL=OFF -DMNN_METAL=ON '

    if IS_WINDOWS:
        os.system('cmake -G "Ninja" ' + extra_opts +' -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TORCH=OFF\
@@ -1,3 +1,4 @@
+#include <sstream>
#include "llm/llm.hpp"

typedef struct {

@@ -38,8 +39,7 @@ static PyObject* PyMNNLLM_response(LLM *self, PyObject *args) {
    if (!PyArg_ParseTuple(args, "s|p", &query, &stream)) {
        Py_RETURN_NONE;
    }
-    MNN::Transformer::LlmStreamBuffer buffer(nullptr);
-    std::ostream null_os(&buffer);
+    std::ostringstream null_os;
    auto res = self->llm->response(query, stream ? &std::cout : &null_os);
    return string2Object(res);
}
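Editor's note on the hunk above: both variants point `response()` at a sink ostream when streaming is off. A minimal sketch of the two approaches, assuming nothing about MNN beyond what the diff shows (the `NullBuffer` type below is a hypothetical stand-in for `MNN::Transformer::LlmStreamBuffer` constructed with a null callback):

```cpp
#include <iostream>
#include <sstream>
#include <streambuf>

// Hypothetical stand-in for a callback-less LlmStreamBuffer: a streambuf that
// accepts every character and drops it, so writes neither print nor allocate.
struct NullBuffer : std::streambuf {
    int overflow(int c) override { return c; }
};

int main() {
    NullBuffer nullBuf;
    std::ostream nullOs(&nullBuf); // old style: output vanishes
    std::ostringstream bufOs;      // new style: output accumulates in memory
    nullOs << "discarded";
    bufOs << "kept";
    std::cout << bufOs.str() << "\n"; // prints "kept"
    return 0;
}
```

If this reading is right, the trade-off is that the `std::ostringstream` version buffers tokens it never prints, while the streambuf version discards them outright; the new code buys simplicity and drops the header dependency on the MNN stream type at the cost of one growing string per call.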
@@ -154,6 +154,7 @@ static PyObject* PyMNN_Module_get_info(PyMNN_Module *self, PyObject *args) {
    }
    auto res = PyDict_New();
    PyDict_SetItemString(res, "version", char2Object(info->version.c_str()));
+    PyDict_SetItemString(res, "bizCode", char2Object(info->bizCode.c_str()));
    {
        auto names = PyList_New(info->inputNames.size());
        for (int i=0; i<info->inputNames.size(); ++i) {

@@ -379,6 +380,7 @@ static PyObject* PyMNNNN_create_runtime_manager(PyObject *self, PyObject *args)
    }
    for (auto i = 0; i < PySequence_Size(dicts); ++i) {
        backendConfig[i].sharedContext = nullptr;
+        config[i].numThread = 1;
        config[i].backendConfig = &backendConfig[i];
        bool ret = getScheduleConfig(PySequence_GetItem(dicts, i), config[i]);
        if (!ret) {

@@ -392,7 +394,7 @@ static PyObject* PyMNNNN_create_runtime_manager(PyObject *self, PyObject *args)
    } else {
        m_ptr = Executor::RuntimeManager::createRuntimeManager(configs);
    }

    if (m_ptr == nullptr) {
        printf("config size:%d\n", configs.size());
        std::string mnn_errno = "create_runtime_manager failed ";
@@ -50,10 +50,10 @@ void MNNQuantSumFP16(float* sum, const float* dequant_scale, size_t thread, size
#endif
#if defined(__aarch64__)
void CountMinMaxValue_FP16(float* source, float* minVal, float* maxVal, size_t sizeQuad);
+void MNNDepthwiseConvFastKernelFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
+                                    size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
+                                    size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
#endif
-void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, size_t ow);

-void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit);

void MNNConvRunForLineDepthwiseFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                    size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep);
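Editor's note: compared with `MNNConvRunForLineDepthwiseFP16` just below, the declaration added above takes two extra pointers, `bias` and `parameters`. A scalar reference sketch of what such a fused depthwise line kernel computes — an assumption drawn from the signature and the asm changes later in this diff, not MNN's actual implementation; the real kernel works on packs of 8 FP16 lanes, this sketch uses plain floats with pack = 1:

```cpp
#include <algorithm>
#include <cstddef>

// Hypothetical scalar model of a depthwise "line" kernel with fused bias + clamp.
// Strides (src_w_setup, dilateX_step, ...) are element strides here; in MNN they
// are strides over packed FP16 vectors.
static void DepthwiseLineRef(float* dst, const float* src, const float* weight,
                             size_t width, size_t src_w_setup, size_t fw, size_t fh,
                             size_t dilateX_step, size_t dilateY_step, size_t height,
                             size_t srcHStep, size_t dstHStep,
                             float bias, float minV, float maxV) {
    for (size_t y = 0; y < height; ++y) {
        const float* srcY = src + y * srcHStep;
        float* dstY = dst + y * dstHStep;
        for (size_t x = 0; x < width; ++x) {
            const float* srcX = srcY + x * src_w_setup;
            float acc = bias; // accumulator starts at the bias value
            for (size_t ky = 0; ky < fh; ++ky) {
                for (size_t kx = 0; kx < fw; ++kx) {
                    acc += srcX[ky * dilateY_step + kx * dilateX_step] * weight[ky * fw + kx];
                }
            }
            dstY[x] = std::min(maxV, std::max(minV, acc)); // fused ReLU-style clamp
        }
    }
}
```

Folding bias and clamp into the kernel is what lets the caller drop the separate post-processing pass that the per-unit helpers removed below used to require.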
@@ -336,94 +336,6 @@ static void MNNAxByClampBroadcastC8FP16(float* CF, const float* AF, const float*
    }
}

-void ARM82MultiAndDestTransformCommon(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, int cacheLineSize, int ow, const float* bias, const float* parameters) {
-    constexpr int pack = 8;
-    int unit = ow / 2;
-    auto biasF = Vec::load((const float16_t*)bias);
-    auto minF = Vec(parameters[2]);
-    auto maxF = Vec(parameters[3]);
-    MNN_ASSERT(cacheLineSize >= 1);
-    for (int x = 0; x < unit; ++x) {
-        int offset = 4 * pack * x, i = 0;
-        Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
-        Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1);
-        Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
-        Vec m3 = Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3);
-        for (i = 1; i < cacheLineSize; ++i) {
-            m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
-            m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1);
-            m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
-            m3 = m3 + Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3);
-        }
-        auto o0 = m0 + m1 + m2 + biasF;
-        auto o1 = m1 - m2 + m3 + biasF;
-        o0 = Vec::min(maxF, o0);
-        o1 = Vec::min(maxF, o1);
-        o0 = Vec::max(minF, o0);
-        o1 = Vec::max(minF, o1);
-        Vec::save(dest + (2 * x + 0) * pack, o0);
-        Vec::save(dest + (2 * x + 1) * pack, o1);
-    }
-    if (unit * 2 < ow) {
-        int offset = 4 * pack * unit, i = 0;
-        Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
-        Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack);
-        Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
-        for (i = 1; i < cacheLineSize; ++i) {
-            m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
-            m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack);
-            m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
-        }
-        auto o0 = m0 + m1 + m2 + biasF;
-        o0 = Vec::min(maxF, o0);
-        o0 = Vec::max(minF, o0);
-        Vec::save(dest + 2 * unit * pack, o0);
-    }
-}
-// unit: winograd unit (output is w/2)
-void ARM82SourceTransformCommon(const FLOAT16 *source, FLOAT16 *dest, int unit, int iw, int pad, int su, int eu) {
-    constexpr int pack = 8; // float16x8
-    for (int x = 0; x < su; ++x) {
-        auto dstX = dest + 4 * pack * x;
-        auto sx = x * 2 - (int)pad;
-        auto ex = sx + 4;
-        auto clampSx = std::max(sx, 0);
-        auto clampEx = std::min(ex, (int)iw);
-        Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-        for (int i = clampSx; i < clampEx; ++i) {
-            v[i - sx] = Vec::load(source + pack * i);
-        }
-        auto m0 = v[0] - v[2];
-        auto m1 = v[1] + v[2];
-        auto m2 = v[2] - v[1];
-        auto m3 = v[3] - v[1];
-        Vec::save(dstX + pack * 0, m0);
-        Vec::save(dstX + pack * 1, m1);
-        Vec::save(dstX + pack * 2, m2);
-        Vec::save(dstX + pack * 3, m3);
-    }
-    MNNConvDwF23SourceTransUnitFP16(source + pack * (su * 2 - pad), dest + 4 * pack * su, eu - su);
-    for (int x = eu; x < unit; ++x) {
-        auto dstX = dest + 4 * pack * x;
-        auto sx = x * 2 - (int)pad;
-        auto ex = sx + 4;
-        auto clampSx = std::max(sx, 0);
-        auto clampEx = std::min(ex, (int)iw);
-        Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-        for (int i = clampSx; i < clampEx; ++i) {
-            v[i - sx] = Vec::load(source + pack * i);
-        }
-        auto m0 = v[0] - v[2];
-        auto m1 = v[1] + v[2];
-        auto m2 = v[2] - v[1];
-        auto m3 = v[3] - v[1];
-        Vec::save(dstX + pack * 0, m0);
-        Vec::save(dstX + pack * 1, m1);
-        Vec::save(dstX + pack * 2, m2);
-        Vec::save(dstX + pack * 3, m3);
-    }
-}
-
void ARM82StrassenMerge(FLOAT16* c11, FLOAT16* c12, FLOAT16* c21, FLOAT16* c22, FLOAT16* xAddr,
                        size_t cStride, size_t eSub, size_t hSub) {
    const int pack = 8;
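Editor's note: the two helpers deleted above were the depthwise Winograd F(2,3) transforms. For the record, per 8-lane FP16 vector they computed, with b the bias and the clamp bounds taken from parameters[2..3]:

```latex
\begin{aligned}
\text{source (per 4-tap window } d_0,\dots,d_3\text{):}\quad
 & m_0 = d_0 - d_2, \qquad m_1 = d_1 + d_2,\\
 & m_2 = d_2 - d_1, \qquad m_3 = d_3 - d_1,\\
\text{dest (two outputs per unit):}\quad
 & o_0 = \operatorname{clamp}(m_0 + m_1 + m_2 + b,\ \min,\ \max),\\
 & o_1 = \operatorname{clamp}(m_1 - m_2 + m_3 + b,\ \min,\ \max).
\end{aligned}
```

These helpers are removed together with their `.S` backends (deleted near the end of this diff); the FP16 depthwise path goes through the new `MNNDepthwiseConvFastKernelFP16` instead.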
@@ -516,24 +428,6 @@ void MNNPackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, size
    }
}

-static void MNNConvRunForUnitDepthWiseFP16(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                           size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
-    int fx, fy;
-    Vec dstValue(0.0f);
-    auto src_z = (const FLOAT16*)src;
-    auto weight_z = (const FLOAT16*)weight;
-    for (fy = 0; fy < fh; ++fy) {
-        auto src_y = src_z + fy * dilateY_step;
-        auto weight_y = weight_z + fy * weight_y_step;
-        for (fx = 0; fx < fw; ++fx) {
-            auto weight_x = weight_y + 8 * fx;
-            auto src_x = src_y + fx * dilateX_step;
-            dstValue = dstValue + Vec::load(src_x) * Vec::load(weight_x);
-        }
-    }
-    Vec::save((FLOAT16*)dst, dstValue);
-}
-
static void _MNNDeconvRunForUnitDepthWise(const FLOAT16* dst, FLOAT16* src, const FLOAT16* weight, size_t fw, size_t fh,
                                          size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
    int fx, fy;

@@ -706,12 +600,8 @@ bool Arm82Functions::init() {
    FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnit, MNNUnPackC8FP16);
    FUNC_PTR_ASSIGN(gInstance->MNNPackCUnitTranspose, MNNPackTransposeInt16C8);
    FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnitTranspose, MNNUnpackTransposeInt16C8);
-    FUNC_PTR_ASSIGN(gInstance->MNNConvRunForUnitDepthWise, MNNConvRunForUnitDepthWiseFP16);
    FUNC_PTR_ASSIGN(gInstance->MNNConvRunForLineDepthwise, MNNConvRunForLineDepthwiseFP16);
    FUNC_PTR_ASSIGN(gInstance->MNNAxByClampBroadcastUnit, MNNAxByClampBroadcastC8FP16);
-    FUNC_PTR_ASSIGN(gInstance->MNNConvDwF23MulTransUnit, MNNConvDwF23MulTransUnitFP16);
-    FUNC_PTR_ASSIGN(gInstance->MNNSourceTransformCommonF23, ARM82SourceTransformCommon);
-    FUNC_PTR_ASSIGN(gInstance->MNNMultiAndDestTransformCommon23, ARM82MultiAndDestTransformCommon);
    FUNC_PTR_ASSIGN(gInstance->MNNMatrixSub, MNNMatrixSubFP16);
    FUNC_PTR_ASSIGN(gInstance->MNNMatrixAdd, MNNMatrixAddFP16);
    FUNC_PTR_ASSIGN(gInstance->MNNStrassenMergeCFunction, ARM82StrassenMerge);

@@ -754,6 +644,7 @@ bool Arm82Functions::init() {
    FUNC_PTR_ASSIGN(gInstance->MNNCountMaxMinValue, ARM82CountMinMaxValue);
#endif
    FUNC_PTR_ASSIGN(gInstance->MNNSumByAxisLForMatmul_A, origin->MNNSumByAxisLForMatmul_A);
+    FUNC_PTR_ASSIGN(gInstance->MNNDepthwiseConvFastKernel, MNNDepthwiseConvFastKernelFP16);
#endif
    FUNC_PTR_ASSIGN(gInstance->MNNPackC4ForMatMul_A, Arm82MNNPackForMatMul_A);
    FUNC_PTR_ASSIGN(gInstance->MNNGetMatMulPackMode, Arm82MNNGetMatMulPackMode);
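Editor's note: `Arm82Functions::init()` reads as a dispatch-table patch — the FP16 backend apparently inherits a generic function table, then overwrites the entries that have FP16 fast paths, which is why this commit can delete three Winograd entries and add one fused depthwise entry without touching any call site. A hedged miniature of that pattern (`CoreFunctions` below is a two-field stand-in, not MNN's real struct, and the macro body only mirrors what `FUNC_PTR_ASSIGN` appears to do):

```cpp
// Hypothetical miniature of the function-pointer dispatch-table pattern.
using DepthwiseKernel = void (*)(float* dst, const float* src, const float* weight);

struct CoreFunctions {
    DepthwiseKernel MNNDepthwiseConvFastKernel = nullptr;
    DepthwiseKernel MNNConvRunForLineDepthwise = nullptr;
};

#define FUNC_PTR_ASSIGN(dst, src) dst = src // mirrors the macro's apparent effect

static void genericDepthwise(float*, const float*, const float*) { /* FP32 path */ }
static void fp16Depthwise(float*, const float*, const float*) { /* FP16 fast path */ }

CoreFunctions* initArm82(const CoreFunctions& origin) {
    static CoreFunctions gInstance;
    gInstance = origin; // inherit every generic implementation first
    FUNC_PTR_ASSIGN(gInstance.MNNDepthwiseConvFastKernel, fp16Depthwise); // then specialize
    return &gInstance;
}
```

Because callers only ever read the table, swapping a kernel stays a one-line change here.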
@@ -5,7 +5,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?")
    file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm32/*")
    add_library(MNN_Arm82 OBJECT ${MNN_ARM82_SRCS} ${MNN_ARM82_SRCS_ASM})
    target_compile_options(MNN_Arm82 PRIVATE -march=armv8.2-a+fp16 -mfpu=neon-fp-armv8 -mfloat-abi=softfp -DENABLE_ARMV82)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64")
    file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm64/*")
    if (MNN_LOW_MEMORY)
        file(GLOB MNN_ARM82_SRCS_ASM ${MNN_ARM82_SRCS_ASM} ${CMAKE_CURRENT_LIST_DIR}/asm/arm64/low_memory/*)
@@ -1,147 +0,0 @@
-//
-// MNNConvDwF23MulTransUnitFP16.S
-// MNN
-//
-// Created by MNN on 2019/4/4.
-// Copyright © 2018, Alibaba Group Holding Limited
-//
-#ifdef __arm__
-#ifndef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNConvDwF23MulTransUnitFP16
-//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weigth, FLOAT16 *dest, size_t ow);
-//Auto: r0:cacheLine, r1:weight, r2:dest, r3:ow
-push {r4-r11, lr}
-ldr r8, [sp, #36] // biasPtr
-ldr r9, [sp, #40] // postParameters
-ldr r10, [r9, #8] // minF
-ldr r11, [r9, #12] // maxF
-
-vpush {q4-q7}
-ldr r4, [r0, #0]
-ldr r5, [r0, #4]
-ldr r6, [r0, #8]
-
-vld1.16 {q4, q5}, [r1]!
-vld1.16 {q6, q7}, [r1]!
-vld1.16 {q8, q9}, [r1]!
-
-L2:
-cmp r3, #2
-blt L1
-
-LoopL2:
-mov r7, r1
-
-vld1.16 {q12, q13}, [r4]!
-vmul.f16 q0, q4, q12
-vld1.16 {q14, q15}, [r4]!
-vmul.f16 q1, q5, q13
-vld1.16 {q10, q11}, [r7]!
-vmul.f16 q2, q6, q14
-vld1.16 {q12, q13}, [r5]!
-vmul.f16 q3, q7, q15
-
-vmla.f16 q0, q8, q12
-vld1.16 {q14, q15}, [r5]!
-vmla.f16 q1, q9, q13
-vmla.f16 q2, q10, q14
-vmla.f16 q3, q11, q15
-
-vld1.16 {q10, q11}, [r7]!
-vld1.16 {q12, q13}, [r6]!
-vmla.f16 q0, q10, q12
-vmla.f16 q1, q11, q13
-vld1.16 {q10, q11}, [r7]!
-vadd.f16 q0, q1, q0
-vld1.16 {q14, q15}, [r6]!
-
-vmla.f16 q2, q10, q14
-vmla.f16 q3, q11, q15
-vadd.f16 q0, q0, q2
-
-vadd.f16 q3, q3, q1
-vsub.f16 q1, q3, q2
-
-vld1.32 {q10}, [r8]
-vdup.32 q11, r10
-vdup.32 q12, r11
-vcvt.f16.f32 d22, q11
-vcvt.f16.f32 d24, q12
-vmov.32 d23, d22
-vmov.32 d25, d24
-
-vadd.f16 q0, q10, q0
-vadd.f16 q1, q10, q1
-
-vmin.f16 q0, q12, q0
-vmin.f16 q1, q12, q1
-
-vmax.f16 q0, q11, q0
-vmax.f16 q1, q11, q1
-
-
-vst1.16 {q0, q1}, [r2]!
-
-sub r3, r3, #2
-cmp r3, #2
-bge LoopL2
-
-
-L1:
-cmp r3, #0
-beq End
-mov r7, r1
-mov r12, #32
-vld1.16 {q12, q13}, [r4]!
-vmul.f16 q0, q4, q12
-vld1.16 {q14}, [r4]!
-vmul.f16 q1, q5, q13
-vld1.16 {q10}, [r7], r12
-vmul.f16 q2, q6, q14
-vld1.16 {q12, q13}, [r5]!
-
-vmla.f16 q0, q8, q12
-vld1.16 {q14}, [r5]!
-vmla.f16 q1, q9, q13
-vmla.f16 q2, q10, q14
-
-vld1.16 {q10, q11}, [r7]!
-vld1.16 {q12, q13}, [r6]!
-vmla.f16 q0, q10, q12
-vmla.f16 q1, q11, q13
-vld1.16 {q10}, [r7]
-vld1.16 {q14}, [r6]!
-
-vmla.f16 q2, q10, q14
-
-vadd.f16 q0, q1, q0
-vadd.f16 q0, q0, q2
-
-vld1.32 {q10}, [r8]
-vdup.32 q11, r10
-vdup.32 q12, r11
-vcvt.f16.f32 d22, q11
-vcvt.f16.f32 d24, q12
-vmov.32 d23, d22
-vmov.32 d25, d24
-
-vadd.f16 q0, q10, q0
-
-vmin.f16 q0, q12, q0
-
-vmax.f16 q0, q11, q0
-
-vst1.16 {q0}, [r2]!
-End:
-
-vpop {q4-q7}
-pop {r4-r11, pc}
-
-#endif
-#endif
@@ -1,60 +0,0 @@
-//
-// MNNConvDwF23SourceTransUnitFP16.S
-// MNN
-//
-// Created by MNN on 2019/4/4.
-// Copyright © 2018, Alibaba Group Holding Limited
-//
-#ifdef __arm__
-#ifndef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNConvDwF23SourceTransUnitFP16
-// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit);
-
-//Auto:
-//r0: source, r1:dest, r2:unit
-
-push {lr}
-
-L1:
-cmp r2, #0
-beq End
-
-vld1.16 {q8, q9}, [r0]!
-vld1.16 {q10, q11}, [r0]!
-subs r2, r2, #1
-vsub.f16 q0, q8, q10
-vadd.f16 q1, q9, q10
-beq L1LoopEnd
-
-L1Loop:
-vsub.f16 q2, q10, q9
-vst1.16 {q0, q1}, [r1]!
-vsub.f16 q3, q11, q9
-vmov.i32 q8, q10
-vst1.16 {q2, q3}, [r1]!
-vmov.i32 q9, q11
-vld1.16 {q10, q11}, [r0]!
-vsub.f16 q0, q8, q10
-vadd.f16 q1, q9, q10
-
-subs r2, r2, #1
-bne L1Loop
-L1LoopEnd:
-vsub.f16 q2, q10, q9
-vsub.f16 q3, q11, q9
-
-vst1.16 {q0, q1}, [r1]!
-vst1.16 {q2, q3}, [r1]!
-
-
-End:
-
-pop {pc}
-#endif
-#endif
@@ -16,26 +16,35 @@

asm_function MNNConvRunForLineDepthwiseFP16
//void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup,
-// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep)
+// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep,
+// const float* bias, const float* parameters)

//Auto Load:
//r0:dst, r1:src, r2:weight, r3:width

-push {r4-r11, lr}
+push {r4-r8, r10, r11, lr}

//Load From Sp
-//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, r9: height, r10:srcHStep, r11:dstHStep
+//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, lr: height, r10:srcHStep, r11:dstHStep
-ldr r4, [sp, #36]
+ldr r4, [sp, #32]
-ldr r5, [sp, #40]
+ldr r5, [sp, #36]
-ldr r6, [sp, #44]
+ldr r6, [sp, #40]
-ldr r7, [sp, #48]
+ldr r7, [sp, #44]
-ldr r8, [sp, #52]
+ldr r8, [sp, #48]
-ldr r9, [sp, #56]
+ldr lr, [sp, #52]
-ldr r10, [sp, #60]
+ldr r10, [sp, #56]
-ldr r11, [sp, #64]
+ldr r11, [sp, #60]
+ldr r12, [sp, #64] // bias
+vld1.32 {q0}, [r12] // bias
+ldr r12, [sp, #68] // min,max
+vld1.32 {d2[0]}, [r12]!
+vld1.32 {d2[1]}, [r12]

vpush {q4-q7}
+vmov.f32 q5, q0 // bias
+vdup.f32 q4, d2[0] // min
+vdup.f32 q6, d2[1] // max

mov r12, #2 // sizeof(FLOAT16)
mul r4, r12, r4

@@ -49,7 +58,7 @@ mul r12, r5, r7
sub r8, r8, r12

LoopDY:
-push {r0, r1, r3, r9, r10, r11}
+push {r0, r1, r3, r10, r11, lr}

L8:
cmp r3, #7

@@ -59,18 +68,18 @@ mov r12, #8
mul r12, r4, r12

L8Loop:
-vmov.i32 q8, #0
+vmov.f32 q8, q5 // use bias to init
-vmov.i32 q9, #0
+vmov.f32 q9, q5
-vmov.i32 q10, #0
+vmov.f32 q10, q5
-vmov.i32 q11, #0
+vmov.f32 q11, q5
-vmov.i32 q12, #0
+vmov.f32 q12, q5
-vmov.i32 q13, #0
+vmov.f32 q13, q5
-vmov.i32 q14, #0
+vmov.f32 q14, q5
-vmov.i32 q15, #0
+vmov.f32 q15, q5

vmov.i32 d14[0], r1
vmov.i32 d14[1], r2
-mov r9, r6
+mov lr, r6
L8LoopH:
mov r10, r5
L8LoopW:

@@ -98,11 +107,27 @@ L8Loop:

bne L8LoopW
L8LoopWEnd:
-subs r9, r9, #1
+subs lr, lr, #1
add r1, r1, r8
bne L8LoopH

sub r3, r3, #8
+vmax.f32 q8, q8, q4
+vmax.f32 q9, q9, q4
+vmax.f32 q10, q10, q4
+vmax.f32 q11, q11, q4
+vmax.f32 q12, q12, q4
+vmax.f32 q13, q13, q4
+vmax.f32 q14, q14, q4
+vmax.f32 q15, q15, q4
+vmin.f32 q8, q8, q6
+vmin.f32 q9, q9, q6
+vmin.f32 q10, q10, q6
+vmin.f32 q11, q11, q6
+vmin.f32 q12, q12, q6
+vmin.f32 q13, q13, q6
+vmin.f32 q14, q14, q6
+vmin.f32 q15, q15, q6
vst1.16 {q8, q9}, [r0]!
vmov.i32 r1, d14[0]
vmov.i32 r2, d14[1]

@@ -121,14 +146,14 @@ mov r12, #4
mul r12, r4, r12

L4Loop:
-vmov.i32 q8, #0
+vmov.f32 q8, q5
-vmov.i32 q9, #0
+vmov.f32 q9, q5
-vmov.i32 q10, #0
+vmov.f32 q10, q5
-vmov.i32 q11, #0
+vmov.f32 q11, q5

-vmov.i32 d8[0], r1
+vmov.i32 d14[0], r1
-vmov.i32 d9[0], r2
+vmov.i32 d14[1], r2
-mov r9, r6
+mov lr, r6
L4LoopH:
mov r10, r5
L4LoopW:

@@ -147,14 +172,22 @@ L4Loop:
add r1, r1, r7

bne L4LoopW
-subs r9, r9, #1
+subs lr, lr, #1
add r1, r1, r8
bne L4LoopH

+vmax.f32 q8, q8, q4
+vmax.f32 q9, q9, q4
+vmax.f32 q10, q10, q4
+vmax.f32 q11, q11, q4
+vmin.f32 q8, q8, q6
+vmin.f32 q9, q9, q6
+vmin.f32 q10, q10, q6
+vmin.f32 q11, q11, q6
sub r3, r3, #4
vst1.16 {q8, q9}, [r0]!
-vmov.i32 r1, d8[0]
+vmov.i32 r1, d14[0]
-vmov.i32 r2, d9[0]
+vmov.i32 r2, d14[1]
vst1.16 {q10, q11}, [r0]!
add r1, r1, r12
cmp r3, #4

@@ -168,8 +201,8 @@ cmp r3, #0
beq End

L1Loop:
-vmov.i32 q0, #0
+vmov.f32 q0, q5
-mov r9, r6
+mov lr, r6
mov r11, r1
mov r12, r2
L1LoopH:

@@ -180,10 +213,12 @@ L1Loop:
vmla.f16 q0, q1, q2
subs r10, r10, #1
bne L1LoopW
-subs r9, r9, #1
+subs lr, lr, #1
add r1, r1, r8
bne L1LoopH

+vmax.f32 q0, q0, q4
+vmin.f32 q0, q0, q6
subs r3, r3, #1
vst1.16 {q0}, [r0]!
mov r2, r12

@@ -193,16 +228,15 @@ L1Loop:

End:

-pop {r0, r1, r3, r9, r10, r11}
+pop {r0, r1, r3, r10, r11, lr}
add r0, r0, r11
-subs r9, r9, #1
+subs lr, lr, #1
add r1, r1, r10
bne LoopDY


vpop {q4-q7}
-pop {r4-r11, pc}
+pop {r4-r8, r10, r11, pc}


#endif
#endif
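Editor's note on the arm32 hunks above: three coordinated changes ride together — r9 is retired in favor of lr so the register file can host the new bias/min/max vectors (q5/q4/q6), every accumulator now starts from the bias vector instead of zero, and a vmax/vmin pair clamps each result before the store. The scalar shape of that rewrite, as a sketch rather than MNN code:

```cpp
#include <algorithm>

// Before: zero-init, with bias and clamp applied in a separate later pass.
// After (sketched): bias folded into the accumulator, clamp fused before the store.
float fusedTap(const float* src, const float* w, int taps,
               float bias, float minV, float maxV) {
    float acc = bias;                       // was: float acc = 0.f;
    for (int k = 0; k < taps; ++k) {
        acc += src[k] * w[k];               // the vmla.f16 loop body
    }
    return std::min(maxV, std::max(minV, acc)); // the new vmax/vmin pair
}
```

Note also the stack-offset shuffle: dropping r9 from the prologue `push` shrinks the saved area by 4 bytes, which is why every `ldr rN, [sp, #...]` offset moves down by 4 and the two new arguments land at #64 and #68.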
@@ -1,122 +0,0 @@
-//
-// MNNConvDwF23MulTransUnitFP16.S
-// MNN
-//
-// Created by MNN on 2019/4/4.
-// Copyright © 2018, Alibaba Group Holding Limited
-//
-#ifdef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNConvDwF23MulTransUnitFP16
-//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weigth, FLOAT16 *dest, size_t ow);
-//Auto: x0:cacheLine, x1:weight, x2:dest, x3:ow, x4: bias, x5: parameters
-
-stp d10, d11, [sp, #-32]!
-stp d8, d9, [sp, #16]
-
-ld1 {v8.8h}, [x4] // bias
-ldr w9, [x5, #8]
-ldr w10, [x5, #12]
-dup v9.4s, w9 // min
-dup v10.4s, w10 // max
-fcvtn v9.4h, v9.4s
-fcvtn v10.4h, v10.4s
-dup v9.8h, v9.h[0]
-dup v10.8h, v10.h[0]
-
-ldr x4, [x0, #0]
-ldr x5, [x0, #8]
-ldr x6, [x0, #16]
-
-ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64
-ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], #64
-ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x1]
-
-L2:
-cmp x3, #2
-blt L1
-
-LoopL2:
-
-ld1 {v20.8h, v21.8h}, [x4], #32
-fmul v0.8h, v4.8h, v20.8h
-ld1 {v22.8h, v23.8h}, [x4], #32
-fmul v1.8h, v5.8h, v21.8h
-fmul v2.8h, v6.8h, v22.8h
-ld1 {v20.8h, v21.8h}, [x5], #32
-fmul v3.8h, v7.8h, v23.8h
-
-fmla v0.8h, v16.8h, v20.8h
-ld1 {v22.8h, v23.8h}, [x5], #32
-fmla v1.8h, v17.8h, v21.8h
-fmla v2.8h, v18.8h, v22.8h
-fmla v3.8h, v19.8h, v23.8h
-
-ld1 {v20.8h, v21.8h}, [x6], #32
-fmla v0.8h, v28.8h, v20.8h
-fmla v1.8h, v29.8h, v21.8h
-fadd v0.8h, v1.8h, v0.8h
-ld1 {v22.8h, v23.8h}, [x6], #32
-
-fmla v2.8h, v30.8h, v22.8h
-fmla v3.8h, v31.8h, v23.8h
-fadd v0.8h, v0.8h, v2.8h
-
-fadd v3.8h, v3.8h, v1.8h
-fsub v1.8h, v3.8h, v2.8h
-
-fadd v0.8h, v0.8h, v8.8h
-fadd v1.8h, v1.8h, v8.8h
-
-fmin v0.8h, v0.8h, v10.8h
-fmin v1.8h, v1.8h, v10.8h
-
-fmax v0.8h, v0.8h, v9.8h
-fmax v1.8h, v1.8h, v9.8h
-
-st1 {v0.8h, v1.8h}, [x2], #32
-
-sub x3, x3, #2
-cmp x3, #2
-bge LoopL2
-
-
-L1:
-cmp x3, #0
-beq End
-ld1 {v20.8h, v21.8h, v22.8h}, [x4]
-fmul v0.8h, v4.8h, v20.8h
-fmul v1.8h, v5.8h, v21.8h
-fmul v2.8h, v6.8h, v22.8h
-ld1 {v20.8h, v21.8h, v22.8h}, [x5]
-
-fmla v0.8h, v16.8h, v20.8h
-fmla v1.8h, v17.8h, v21.8h
-fmla v2.8h, v18.8h, v22.8h
-
-ld1 {v20.8h, v21.8h, v22.8h}, [x6]
-fmla v0.8h, v28.8h, v20.8h
-fmla v1.8h, v29.8h, v21.8h
-fadd v0.8h, v1.8h, v0.8h
-
-fmla v2.8h, v30.8h, v22.8h
-fadd v0.8h, v0.8h, v2.8h
-
-fadd v0.8h, v0.8h, v8.8h
-
-fmin v0.8h, v0.8h, v10.8h
-
-fmax v0.8h, v0.8h, v9.8h
-st1 {v0.8h}, [x2]
-End:
-
-ldp d8, d9, [sp, #16]
-ldp d10, d11, [sp], #32
-
-ret
-#endif
@@ -1,56 +0,0 @@
-//
-// MNNConvDwF23SourceTransUnitFP16.S
-// MNN
-//
-// Created by MNN on 2019/4/4.
-// Copyright © 2018, Alibaba Group Holding Limited
-//
-#ifdef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNConvDwF23SourceTransUnitFP16
-// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit);
-
-//Auto:
-//x0: source, x1:dest, x2:unit
-
-L1:
-cmp x2, #0
-beq End
-
-ld1 {v16.8h, v17.8h}, [x0], #32
-ld1 {v18.8h, v19.8h}, [x0], #32
-subs x2, x2, #1
-fsub v0.8h, v16.8h, v18.8h
-fadd v1.8h, v17.8h, v18.8h
-beq L1LoopEnd
-
-L1Loop:
-fsub v2.8h, v18.8h, v17.8h
-st1 {v0.8h, v1.8h}, [x1], #32
-fsub v3.8h, v19.8h, v17.8h
-mov v16.16b, v18.16b
-st1 {v2.8h, v3.8h}, [x1], #32
-mov v17.16b, v19.16b
-ld1 {v18.8h, v19.8h}, [x0], #32
-fsub v0.8h, v16.8h, v18.8h
-fadd v1.8h, v17.8h, v18.8h
-
-subs x2, x2, #1
-bne L1Loop
-L1LoopEnd:
-fsub v2.8h, v18.8h, v17.8h
-fsub v3.8h, v19.8h, v17.8h
-
-st1 {v0.8h, v1.8h}, [x1], #32
-st1 {v2.8h, v3.8h}, [x1], #32
-
-
-End:
-ret
-
-#endif
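
Taken together, the two deletions above retire the FP16 Winograd F(2,3) depthwise path. For orientation, the math those kernels carried out can be sketched in a few lines of scalar C++ (illustrative names, not MNN API; `m[i]` stands for the per-column weight-times-source accumulations the asm builds in v0-v3):

#include <algorithm>

// Source transform of Winograd F(2,3): d = B^T * s, with
// B^T = [[1,0,-1,0],[0,1,1,0],[0,-1,1,0],[0,-1,0,1]],
// matching the fsub/fadd pattern in MNNConvDwF23SourceTransUnitFP16.
static void convDwF23SourceTrans(const float s[4], float d[4]) {
    d[0] = s[0] - s[2];
    d[1] = s[1] + s[2];
    d[2] = s[2] - s[1];
    d[3] = s[3] - s[1];
}

// Output transform: o = A^T * m with A^T = [[1,1,1,0],[0,1,-1,1]],
// plus the bias add and min/max clamp MNNConvDwF23MulTransUnitFP16 fused in.
static void convDwF23MulTrans(const float m[4], float bias, float minV, float maxV, float o[2]) {
    o[0] = std::min(std::max(m[0] + m[1] + m[2] + bias, minV), maxV);
    o[1] = std::min(std::max(m[1] - m[2] + m[3] + bias, minV), maxV);
}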
@@ -15,17 +15,24 @@
 asm_function MNNConvRunForLineDepthwiseFP16
 //void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup,
-// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep)
+// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep,
+// const float* bias, float* parameters)
 
 //Auto Load:
 //x0:dst, x1:src, x2:weight, x3:width, x4:src_w_setup, x5:fw, x6:fh, x7:dilate_x_step
 
 //Load From sp:
-//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep
+//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13:parameters
 ldr x8, [sp, #0]
 ldr x15, [sp, #8]
 ldr x10, [sp, #16]
 ldr x11, [sp, #24]
+ldr x12, [sp, #32]
+ldr x13, [sp, #40]
 
+stp d8, d9, [sp, #(-16 * 3)]!
+stp d10, d11, [sp, #(16 * 2)]
+stp x19, x20, [sp, #(16 * 1)]
 
 mov x9, #2 // sizeof(FLOAT16)
 mul x4, x9, x4
@@ -34,15 +41,30 @@ mul x8, x9, x8
 mul x10, x9, x10
 mul x11, x9, x11
 
+ld1 {v8.8h}, [x12] // bias
+ld1r {v10.8h}, [x13], #2 // min
+ld1r {v11.8h}, [x13]
 
 //dilate_y_step -> dilate_y_step - fw*dilate_x_step
 mul x9, x5, x7
 sub x8, x8, x9
 
-.macro zero_vec x0, x1, x2, x3
-movi \x0\().8h, #0
-movi \x1\().8h, #0
-movi \x2\().8h, #0
-movi \x3\().8h, #0
+.macro assign_bias x0, x1, x2, x3
+mov \x0\().16b, v8.16b
+mov \x1\().16b, v8.16b
+mov \x2\().16b, v8.16b
+mov \x3\().16b, v8.16b
+.endm
 
+.macro compare_min_max x0, x1, x2, x3, xmin, xmax
+fmax \x0\().8h, \x0\().8h, \xmin\().8h
+fmax \x1\().8h, \x1\().8h, \xmin\().8h
+fmax \x2\().8h, \x2\().8h, \xmin\().8h
+fmax \x3\().8h, \x3\().8h, \xmin\().8h
+fmin \x0\().8h, \x0\().8h, \xmax\().8h
+fmin \x1\().8h, \x1\().8h, \xmax\().8h
+fmin \x2\().8h, \x2\().8h, \xmax\().8h
+fmin \x3\().8h, \x3\().8h, \xmax\().8h
 .endm
 
 LoopDY:
@@ -56,16 +78,16 @@ L16:
 cmp x3, #16
 blt L8
 
-mov x12, #16
-mul x12, x4, x12
+mov x19, #16
+mul x19, x4, x19
 
 L16Loop:
-zero_vec v16, v17, v18, v19
-zero_vec v20, v21, v22, v23
-zero_vec v24, v25, v26, v27
-zero_vec v28, v29, v30, v31
+assign_bias v16, v17, v18, v19
+assign_bias v20, v21, v22, v23
+assign_bias v24, v25, v26, v27
+assign_bias v28, v29, v30, v31
 
-mov x13, x1
+mov x20, x1
 mov x14, x2
 mov x9, x6
 L16LoopH:
@@ -106,7 +128,7 @@ L16Loop:
 ld1 {v3.8h}, [x1], x4
 fmla v30.8h, v7.8h, v2.8h
 fmla v31.8h, v7.8h, v3.8h
-sub x1, x1, x12
+sub x1, x1, x19
 add x1, x1, x7
 
 bne L16LoopW
@@ -115,8 +137,12 @@ L16Loop:
 bne L16LoopH
 
 sub x3, x3, #16
+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
+compare_min_max v24, v25, v26, v27, v10, v11
+compare_min_max v28, v29, v30, v31, v10, v11
 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
-add x1, x13, x12
+add x1, x20, x19
 cmp x3, #16
 mov x2, x14
 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
@@ -129,14 +155,14 @@ L8:
 cmp x3, #7
 ble L4
 
-mov x12, #8
-mul x12, x4, x12
+mov x19, #8
+mul x19, x4, x19
 
 L8Loop:
-zero_vec v16, v17, v18, v19
-zero_vec v20, v21, v22, v23
+assign_bias v16, v17, v18, v19
+assign_bias v20, v21, v22, v23
 
-mov x13, x1
+mov x20, x1
 mov x14, x2
 mov x9, x6
 L8LoopH:
@@ -161,7 +187,7 @@ L8Loop:
 ld1 {v1.8h}, [x1], x4
 fmla v23.8h, v1.8h, v3.8h
 
-sub x1, x1, x12
+sub x1, x1, x19
 add x1, x1, x7
 
 bne L8LoopW
@@ -169,9 +195,12 @@ L8Loop:
 add x1, x1, x8
 bne L8LoopH
 
+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
 
 sub x3, x3, #8
 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
-add x1, x13, x12
+add x1, x20, x19
 mov x2, x14
 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
 
@@ -180,13 +209,13 @@ L4:
 cmp x3, #4
 ble L1
 
-mov x12, #4
-mul x12, x4, x12
+mov x19, #4
+mul x19, x4, x19
 
 L4Loop:
-zero_vec v16, v17, v18, v19
+assign_bias v16, v17, v18, v19
 
-mov x13, x1
+mov x20, x1
 mov x14, x2
 mov x9, x6
 L4LoopH:
@@ -203,7 +232,7 @@ L4Loop:
 ld1 {v1.8h}, [x1], x4
 fmla v19.8h, v1.8h, v3.8h
 
-sub x1, x1, x12
+sub x1, x1, x19
 add x1, x1, x7
 
 bne L4LoopW
@@ -211,9 +240,10 @@ L4Loop:
 add x1, x1, x8
 bne L4LoopH
 
+compare_min_max v16, v17, v18, v19, v10, v11
 sub x3, x3, #4
 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
-add x1, x13, x12
+add x1, x20, x19
 mov x2, x14
 
 L1:
@@ -221,10 +251,10 @@ cmp x3, #0
 beq End
 
 L1Loop:
-movi v0.8h, #0
+mov v0.16b, v8.16b
 mov x9, x6
 mov x11, x1
-mov x12, x2
+mov x19, x2
 L1LoopH:
 mov x10, x5
 L1LoopW:
@@ -238,8 +268,10 @@ L1Loop:
 bne L1LoopH
 
 subs x3, x3, #1
+fmax v0.8h, v0.8h, v10.8h
+fmin v0.8h, v0.8h, v11.8h
 st1 {v0.8h}, [x0], #16
-mov x2, x12
+mov x2, x19
 add x1, x11, x4
 bne L1Loop
 
@@ -257,7 +289,9 @@ add x0, x0, x11
 add x1, x1, x10
 bne LoopDY
 
+ldp x19, x20, [sp, #(16 * 1)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d8, d9, [sp], #(16 * 3)
 ret
 
 #endif
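
The net effect of the hunks above: MNNConvRunForLineDepthwiseFP16 now takes `bias` and `parameters` as two extra trailing arguments, seeds its accumulators with the bias instead of zero (assign_bias), and clamps results before each store (compare_min_max). A scalar sketch of the fused tail, assuming `parameters` holds {min, max} as two consecutive FP16 values, which is how the two ld1r loads from x13 read it:

#include <algorithm>

// Hypothetical scalar equivalent of the fused post-op; acc already contains
// the bias because assign_bias copies v8 (bias) into every accumulator.
static float fusedClamp(float acc, float minV, float maxV) {
    return std::min(std::max(acc, minV), maxV); // fmax then fmin, per lane
}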
@@ -0,0 +1,290 @@
+//
+// MNNDepthwiseConvFastKernelFP16.S
+// MNN
+//
+// Created by MNN on 2024/09/18.
+// Copyright © 2018, Alibaba Group Holding Limited
+//
+
+
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNDepthwiseConvFastKernelFP16
+
+// void MNNDepthwiseConvFastKernelFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
+// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
+// size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
+//Auto Load:
+//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_step=pack*1, x5:fw, x6:fh, x7:dilate_x_step
+
+//Load From sp:
+//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13: minmax
+ldr x8, [sp, #0]
+ldr x15, [sp, #8]
+ldr x10, [sp, #16]
+ldr x11, [sp, #24]
+ldr x12, [sp, #32]
+ldr x13, [sp, #40]
+
+stp d14, d15, [sp, #(-16 * 9)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8, d9, [sp, #(16 * 3)]
+stp x21, x22, [sp, #(16 * 4)]
+stp x19, x20, [sp, #(16 * 5)]
+stp x27, x28, [sp, #(16 * 6)]
+stp x25, x26, [sp, #(16 * 7)]
+stp x23, x24, [sp, #(16 * 8)]
+
+lsl x4, x4, #1 // src_w_step*sizeof(float)
+lsl x7, x7, #1 // dilate_x_step*sizeof(float)
+lsl x8, x8, #1 // dilate_y_step*sizeof(float)
+lsl x23, x10, #1 // srcHStep*sizeof(float)
+lsl x24, x11, #1 // dstHStep*sizeof(float)
+mov x20, x12 // bias
+mov x26, x13 // min
+add x27, x13, #2 // max
+
+//dilate_y_step -> dilate_y_step - fw*dilate_x_step
+mul x9, x5, x7
+sub x8, x8, x9
+mov x25, x3 // width
+.macro assign_bias x0, x1, x2, x3, bv
+mov \x0\().16b, \bv\().16b
+mov \x1\().16b, \bv\().16b
+mov \x2\().16b, \bv\().16b
+mov \x3\().16b, \bv\().16b
+.endm
+
+.macro compare_min_max x0, x1, x2, x3, xmin, xmax
+fmax \x0\().8h, \x0\().8h, \xmin\().8h
+fmax \x1\().8h, \x1\().8h, \xmin\().8h
+fmax \x2\().8h, \x2\().8h, \xmin\().8h
+fmax \x3\().8h, \x3\().8h, \xmin\().8h
+fmin \x0\().8h, \x0\().8h, \xmax\().8h
+fmin \x1\().8h, \x1\().8h, \xmax\().8h
+fmin \x2\().8h, \x2\().8h, \xmax\().8h
+fmin \x3\().8h, \x3\().8h, \xmax\().8h
+.endm
+
+LoopDY:
+//mov x23, x10
+//mov x24, x11
+mov x21, x0
+mov x22, x1
+
+L16:
+cmp x3, #16
+blt L8
+
+mov x12, #-176
+mov x19, #256
+
+L16Loop:
+ld1 {v8.8h}, [x20] // load bias
+assign_bias v16, v17, v18, v19, v8
+assign_bias v20, v21, v22, v23, v8
+assign_bias v24, v25, v26, v27, v8
+assign_bias v28, v29, v30, v31, v8
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L16LoopH:
+mov x10, x5
+L16LoopW:
+ld1 {v8.8h}, [x2], #16
+ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
+ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64
+ld1 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #64
+subs x10, x10, #1
+fmla v16.8h, v8.8h, v0.8h
+fmla v17.8h, v8.8h, v1.8h
+fmla v18.8h, v8.8h, v2.8h
+fmla v19.8h, v8.8h, v3.8h
+
+fmla v20.8h, v8.8h, v4.8h
+fmla v21.8h, v8.8h, v5.8h
+fmla v22.8h, v8.8h, v6.8h
+fmla v23.8h, v8.8h, v7.8h
+
+ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x12
+
+fmla v24.8h, v8.8h, v9.8h
+fmla v25.8h, v8.8h, v10.8h
+fmla v26.8h, v8.8h, v11.8h
+fmla v27.8h, v8.8h, v12.8h
+
+fmla v28.8h, v8.8h, v0.8h
+fmla v29.8h, v8.8h, v1.8h
+fmla v30.8h, v8.8h, v2.8h
+fmla v31.8h, v8.8h, v3.8h
+
+bne L16LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L16LoopH
+ld1r {v10.8h}, [x26] // min
+ld1r {v11.8h}, [x27] // max
+sub x3, x3, #16
+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
+compare_min_max v24, v25, v26, v27, v10, v11
+compare_min_max v28, v29, v30, v31, v10, v11
+st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
+add x1, x13, x19 // 16 * pack * sizeof(float)
+cmp x3, #16
+mov x2, x14
+st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
+st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64
+st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64
+bge L16Loop
+
+
+L8:
+ld1r {v10.8h}, [x26] // min
+ld1r {v11.8h}, [x27] // max
+ld1 {v24.8h}, [x20] // load bias
+cmp x3, #7
+ble L4
+
+mov x12, #-48
+mov x19, #128
+
+L8Loop:
+assign_bias v16, v17, v18, v19, v24
+assign_bias v20, v21, v22, v23, v24
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L8LoopH:
+mov x10, x5
+L8LoopW:
+ld1 {v8.8h}, [x2], #16
+ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
+ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], x12
+subs x10, x10, #1
+fmla v16.8h, v8.8h, v0.8h
+fmla v17.8h, v8.8h, v1.8h
+fmla v18.8h, v8.8h, v2.8h
+fmla v19.8h, v8.8h, v3.8h
+
+fmla v20.8h, v8.8h, v4.8h
+fmla v21.8h, v8.8h, v5.8h
+fmla v22.8h, v8.8h, v6.8h
+fmla v23.8h, v8.8h, v7.8h
+
+bne L8LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L8LoopH
+
+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
+sub x3, x3, #8
+st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
+add x1, x13, x19 // 8 * pack * sizeof(float)
+mov x2, x14
+st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
+
+
+L4:
+cmp x3, #4
+ble L1
+
+mov x12, #16
+mov x19, #64
+
+L4Loop:
+assign_bias v16, v17, v18, v19, v24
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L4LoopH:
+mov x10, x5
+L4LoopW:
+ld1 {v8.8h}, [x2], #16
+ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x12
+subs x10, x10, #1
+fmla v16.8h, v8.8h, v0.8h
+fmla v17.8h, v8.8h, v1.8h
+fmla v18.8h, v8.8h, v2.8h
+fmla v19.8h, v8.8h, v3.8h
+
+bne L4LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L4LoopH
+
+compare_min_max v16, v17, v18, v19, v10, v11
+sub x3, x3, #4
+st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
+add x1, x13, x19
+mov x2, x14
+
+L1:
+cmp x3, #0
+beq End
+
+mov x19, #16
+
+L1Loop:
+ld1 {v16.8h}, [x20] // assign bias
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L1LoopH:
+mov x10, x5
+L1LoopW:
+ld1 {v8.8h}, [x2], #16
+ld1 {v0.8h}, [x1], #16
+subs x10, x10, #1
+fmla v16.8h, v8.8h, v0.8h
+
+bne L1LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L1LoopH
+
+subs x3, x3, #1
+fmax v16.8h, v16.8h, v10.8h
+fmin v16.8h, v16.8h, v11.8h
+st1 {v16.8h}, [x0], #16
+add x1, x13, x4
+mov x2, x14
+bne L1Loop
+
+
+End:
+
+//mov x10, x23
+//mov x11, x24
+//mov x0, x21
+//mov x1, x22
+mov x3, x25
+
+subs x15, x15, #1
+add x0, x21, x24
+add x1, x22, x23
+bne LoopDY
+
+ldp x23, x24, [sp, #(16 * 8)]
+ldp x25, x26, [sp, #(16 * 7)]
+ldp x27, x28, [sp, #(16 * 6)]
+ldp x19, x20, [sp, #(16 * 5)]
+ldp x21, x22, [sp, #(16 * 4)]
+ldp d8, d9, [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 9)
+ret
+
+#endif
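
The new file above is a stride-1, dilation-1 fast path: it keeps sixteen accumulators (v16-v31) live per width tile and reloads one weight vector per filter tap. A rough scalar reference of the loop nest it vectorizes (a sketch under assumed semantics, not the MNN API; one "element" below stands for a pack of 8 FP16 lanes):

#include <algorithm>

// Reference for the tiled kernel: dst[ox] = clamp(bias + sum over (fy, fx) of
// weight[fy][fx] * src[fy][ox + fx]), with stride 1 and dilation 1.
static void depthwiseFastRef(float* dst, const float* src, const float* weight,
                             int width, int fw, int fh, int srcRowStride,
                             float bias, float minV, float maxV) {
    for (int ox = 0; ox < width; ++ox) {
        float acc = bias;                                   // assign_bias
        for (int fy = 0; fy < fh; ++fy) {
            for (int fx = 0; fx < fw; ++fx) {
                acc += weight[fy * fw + fx] * src[fy * srcRowStride + ox + fx];
            }
        }
        dst[ox] = std::min(std::max(acc, minV), maxV);      // compare_min_max
    }
}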
@@ -108,14 +108,12 @@ stp x23, x24, [sp, #(16 * 8)]
 ldr x25, [x6, #40] // xKernelSum
 ldr x26, [x6, #48] // weightQuantBias
 ldr x23, [x6, #56] // fp32minmax
-ldr x27, [x6, #64] // blockNum
 
 //add x24, x23, #4
 
 mov x21, #16 // sizeof(float16_t) * PACK
-mul x27, x27, x3
 Start:
-lsl x15, x27, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
+lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
 mov x22, #48 // src_steps
 ldr x27, [x6, #80] // extra scale
 TILE_12:
@@ -109,12 +109,10 @@ stp x23, x24, [sp, #(16 * 8)]
 ldr x25, [x6, #40] // xKernelSum
 ldr x26, [x6, #48] // weightQuantBias
 ldr x23, [x6, #56] // fp32minmax
-ldr x27, [x6, #64] // blockNum
 
 mov x21, #16 // sizeof(float16_t) * PACK
-mul x27, x27, x3
 Start:
-lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
+lsl x15, x3, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
 mov x22, #48 // src_steps
 ldr x27, [x6, #80] // extra scale
 TILE_12:
@@ -150,15 +150,13 @@ stp x27, x28, [sp, #(16 * 8)]
 // ldr w23, [x6, #24]
 ldr x27, [x6, #40] // srcKernelSum
 ldr x28, [x6, #48] // weightQuanBias
-ldr x23, [x6, #64] // blockNum
 ldr x14, [x6, #56] // fp32minmax
 
-mul x23, x23, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
 mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80
 mov x21, #16 // sizeof(float16_t) * UNIT
 
 Start:
-lsl x15, x23, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int8_t) = src_depth_quad * 64 = src_depth_quad << 6
+lsl x15, x3, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int8_t) = src_depth_quad * 64 = src_depth_quad << 6
 ldr x23, [x6, #80] // extra scale
 TILE_10:
 cmp x7, #10
@@ -130,15 +130,13 @@ stp x27, x28, [sp, #(16 * 8)]
 // ldr w23, [x6, #24]
 ldr x27, [x6, #40] // srcKernelSum
 ldr x28, [x6, #48] // weightQuanBias
-ldr x23, [x6, #64] // blockNum
 ldr x14, [x6, #56] // fp32minmax
 
-mul x23, x23, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
 mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80
 mov x21, #16 // sizeof(float16_t) * UNIT
 
 Start:
-lsl x15, x23, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int4_t) = src_depth_quad * 8 * 8 * 0.5 = src_depth_quad << 5
+lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int4_t) = src_depth_quad * 8 * 8 * 0.5 = src_depth_quad << 5
 ldr x23, [x6, #80] // extra scale
 TILE_10:
 cmp x7, #10
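
All four GEMM hunks above make the same change: x3 now already holds the per-call src_depth_quad, so the kernels stop folding blockNum into the weight stride themselves. Illustratively (assumed intent, matching the comments in the asm; the shift widths come straight from the UNIT/SRC_UNIT sizes named there):

#include <cstddef>

// srcDepthQuad is what x3 carries after the change; blockNum no longer appears.
static size_t weightStrideInt8(size_t srcDepthQuad) { return srcDepthQuad << 6; } // 8 * 8 * sizeof(int8_t)
static size_t weightStrideInt4(size_t srcDepthQuad) { return srcDepthQuad << 5; } // half of the int8 stride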
@@ -42,9 +42,11 @@ ENDIF()
 
 # ARM82 Assemblies
 IF(MNN_ARM82)
-    target_compile_options(MNNCPU PRIVATE -DENABLE_ARMV82)
-    include(${CMAKE_CURRENT_LIST_DIR}/../arm82/CMakeLists.txt)
-    list(APPEND MNN_TARGETS MNN_Arm82)
-    list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_Arm82>)
+    IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64")
+        target_compile_options(MNNCPU PRIVATE -DENABLE_ARMV82)
+        include(${CMAKE_CURRENT_LIST_DIR}/../arm82/CMakeLists.txt)
+        list(APPEND MNN_TARGETS MNN_Arm82)
+        list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_Arm82>)
+    ENDIF()
 ENDIF()
 
@@ -48,7 +48,7 @@ ErrorCode CastWrapExecution::onExecute(const std::vector<Tensor*>& inputs, const
     CPUCastCreator::cast(inputs[0], outputs[0], cpuBackend, convertType);
     return NO_ERROR;
 }
-void CPURuntime::computeDivideSizes(int size, int* dst) const {
+void CPUBackend::computeDivideSizes(int size, int* dst) const {
     if (mGroupWithComputeRate.size() <= 1) {
         // Avg divide
         int length = UP_DIV(size, mThreadNumber);
@@ -132,40 +132,6 @@ void CPURuntime::_bindCPUCore() const {
 #endif
 }
 
-void CPURuntime::_resetGroupCompute() const {
-    if (mPastDecreaseHint == hint().cpuDecreaseRate) {
-        return;
-    }
-    mGroupWithComputeRate.clear();
-    if (mThreadNumber <= 1 || mPower == BackendConfig::Power_Low) {
-        return;
-    }
-    mPastDecreaseHint = hint().cpuDecreaseRate;
-    auto cpuInfo = MNNGetCPUInfo();
-    if (cpuInfo->groups.size() < 2) {
-        return;
-    }
-    float decreaseRate = (float)(hint().cpuDecreaseRate) / 100.0f;
-    int validCpuSize = (int)(cpuInfo->groups[cpuInfo->groups.size()-1].ids.size());
-    int groupIndex = (int)cpuInfo->groups.size()-2;
-    float maxFreq = (float)cpuInfo->groups[cpuInfo->groups.size()-1].maxFreq;
-    validCpuSize = ALIMIN(validCpuSize, mThreadNumber);
-    float totalComputeRate = 1.0f * validCpuSize;
-    mGroupWithComputeRate.emplace_back(std::make_pair(totalComputeRate, validCpuSize));
-    float currentRate = 1.0f;
-    while (validCpuSize < mThreadNumber && groupIndex >= 0) {
-        auto& group = cpuInfo->groups[groupIndex];
-        int selectSize = ALIMIN(mThreadNumber - validCpuSize, (int)group.ids.size());
-        validCpuSize += group.ids.size();
-        currentRate *= decreaseRate;
-        totalComputeRate += currentRate * selectSize;
-        mGroupWithComputeRate.emplace_back(std::make_pair(currentRate * selectSize, selectSize));
-    }
-    for (auto& g : mGroupWithComputeRate) {
-        g.first = g.first / totalComputeRate;
-    }
-}
-
 void CPURuntime::_resetThreadPool() {
     mThreadNumber = std::max(1, mThreadNumber);
     mThreadNumber = std::min(mThreadNumber, MAX_THREAD_NUMBER);
@@ -179,7 +145,6 @@ void CPURuntime::_resetThreadPool() {
     }
     mThreadNumber = ALIMIN(ThreadPool::init(systemThreadNumber), mThreadNumber);
 }
-    mGroupWithComputeRate.clear();
    if (mThreadNumber > 1) {
        mTaskIndex = ThreadPool::acquireWorkIndex();
        if (-1 == mTaskIndex) {
@@ -204,8 +169,6 @@ void CPURuntime::onReset(int numberThread, const BackendConfig* config, bool ful
     }
     mThreadNumber = numberThread;
     _resetThreadPool();
-    // Mask Group Compute reset
-    mPastDecreaseHint = -1;
 }
 
 CPURuntime::CPURuntime(const Backend::Info& info) {
@@ -280,7 +243,6 @@ Backend* CPURuntime::onCreate(const BackendConfig* config, Backend* origin) cons
         auto cpuBn = static_cast<CPUBackend*>(origin);
         mSharedDmaInfo = cpuBn->mDmaInfo;
     }
-    _resetGroupCompute();
     if (nullptr != config) {
         precision = config->precision;
         flags = config->flags;
@@ -403,6 +365,41 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
 #endif
     mMemory = memory;
     mRuntime = const_cast<CPURuntime*>(runtime);
+    mThreadNumber = mRuntime->mThreadNumber;
+    // Compute Group Rate
+    do {
+        if (mThreadNumber <= 1 || mRuntime->mPower == BackendConfig::Power_Low) {
+            break;
+        }
+        auto rate = mRuntime->hint().cpuDecreaseRate;
+        if (rate >= 100 || rate <= 0) {
+            break;
+        }
+        auto cpuInfo = MNNGetCPUInfo();
+        if (cpuInfo->groups.size() < 2) {
+            break;
+        }
+        mGroupWithComputeRate.clear();
+        float decreaseRate = (float)(rate) / 100.0f;
+        int validCpuSize = (int)(cpuInfo->groups[cpuInfo->groups.size()-1].ids.size());
+        int groupIndex = (int)cpuInfo->groups.size()-2;
+        float maxFreq = (float)cpuInfo->groups[cpuInfo->groups.size()-1].maxFreq;
+        validCpuSize = ALIMIN(validCpuSize, mThreadNumber);
+        float totalComputeRate = 1.0f * validCpuSize;
+        mGroupWithComputeRate.emplace_back(std::make_pair(totalComputeRate, validCpuSize));
+        float currentRate = 1.0f;
+        while (validCpuSize < mThreadNumber && groupIndex >= 0) {
+            auto& group = cpuInfo->groups[groupIndex];
+            int selectSize = ALIMIN(mThreadNumber - validCpuSize, (int)group.ids.size());
+            validCpuSize += group.ids.size();
+            currentRate *= decreaseRate;
+            totalComputeRate += currentRate * selectSize;
+            mGroupWithComputeRate.emplace_back(std::make_pair(currentRate * selectSize, selectSize));
+        }
+        for (auto& g : mGroupWithComputeRate) {
+            g.first = g.first / totalComputeRate;
+        }
+    } while (false);
     auto dynamicAlloc = mRuntime->mSharedDmaInfo;
     if (nullptr == dynamicAlloc.get()) {
         mDmaInfo.reset(new CPURuntime::DynamicAllocator);
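
The block that moved from CPURuntime into the CPUBackend constructor weights each CPU cluster geometrically: the fastest cluster contributes rate 1.0 per core, every slower cluster contributes the previous rate times cpuDecreaseRate/100, and the shares are normalized at the end. A standalone sketch of the same computation (hypothetical helper; groupSizes is ordered fastest cluster first here, mirroring the walk from cpuInfo->groups.back() downwards):

#include <algorithm>
#include <utility>
#include <vector>

// Mirrors the mGroupWithComputeRate construction: returns (share, threads)
// pairs whose shares sum to 1.
static std::vector<std::pair<float, int>> splitByDecreaseRate(
        const std::vector<int>& groupSizes, int threadNumber, int decreasePercent) {
    std::vector<std::pair<float, int>> result;
    float rate = 1.0f;
    float decay = (float)decreasePercent / 100.0f;
    float total = 0.0f;
    int used = 0;
    for (int size : groupSizes) {
        int select = std::min(threadNumber - used, size);
        if (select <= 0) {
            break;
        }
        result.emplace_back(rate * select, select);
        total += rate * select;
        used += size;   // the original loop also advances by the whole group's size
        rate *= decay;
    }
    for (auto& g : result) {
        g.first /= total;
    }
    return result;
}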
@@ -40,9 +40,6 @@ public:
     void onConcurrencyEnd() const;
     virtual bool onCheckInfo(Backend::Info& info) const override;
 
-    // dividedSize's length should be larger than threadNumber
-    void computeDivideSizes(int size, int* dst) const;
-
 #ifdef MNN_USE_THREAD_POOL
     inline bool multiThreadValid() const {
         return mThreadOpen;
@@ -60,9 +57,6 @@ private:
     mutable int mTaskIndex = -1;
     mutable bool mThreadOpen = false;
 #endif
-    void _resetGroupCompute() const;
-    mutable std::vector<std::pair<float, int>> mGroupWithComputeRate;
-    mutable int mPastDecreaseHint = -1;
     BackendConfig::MemoryMode mMemory;
     BackendConfig::PowerMode mPower;
     BackendConfig::PrecisionMode mPrecision;
@@ -108,6 +102,8 @@ public:
     // Return sizeDivide, scheduleNumber aligned memory
     std::pair<int, int> multiThreadDivide(int size) const;
     virtual bool onSelectDynamicAllocator(int index, int maxIndex) override;
+    // dividedSize's length should be larger than threadNumber
+    void computeDivideSizes(int size, int* dst) const;
 
 public:
     virtual MemObj* onAcquire(const Tensor* nativeTensor, StorageType storageType) override;
@@ -145,7 +141,7 @@ public:
     static bool addCreator(OpType t, Creator* c);
 
     inline int threadNumber() const {
-        return mRuntime->mThreadNumber;
+        return mThreadNumber;
     }
 #ifdef MNN_USE_THREAD_POOL
     inline bool threadOpen() const {
@@ -182,6 +178,9 @@ protected:
     CoreFunctions* mCoreFunctions;
     CoreInt8Functions* mInt8CoreFunctions;
 private:
+    int mThreadNumber;
+    std::vector<std::pair<float, int>> mGroupWithComputeRate;
 
     std::shared_ptr<CPURuntime::DynamicAllocator> mDmaInfo;
     std::shared_ptr<EagerBufferAllocator> mStaticAllocator;
     CPURuntime* mRuntime;
@@ -14,7 +14,6 @@
 #include "core/TensorUtils.hpp"
 #include "backend/cpu/compute/CommonOptFunction.h"
 #include "backend/cpu/compute/ConvOpt.h"
-#include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp"
 
 namespace MNN {
 CPUConvolutionDepthwise::FloatExecution::FloatExecution(const Convolution2DCommon* common, Backend* b,
@@ -129,8 +128,7 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
     auto core = static_cast<CPUBackend*>(backend())->functions();
     int bytes = core->bytes;
     int unit = core->pack;
-    auto unitFunc = core->MNNConvRunForUnitDepthWise;
-    auto lineFunc = core->MNNConvRunForLineDepthwise;
+    auto kernelFunc = core->MNNConvRunForLineDepthwise;
     auto postFunc = core->MNNAxByClampBroadcastUnit;
     auto inputTensor = inputs[0];
     auto outputTensor = outputs[0];
@@ -169,72 +167,60 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
     int weight_z_step = kernel_height * kernel_width * unit;
     int dilateY_step = dilateY * src_width * unit;
     int dilateX_step = dilateX * unit;
-    // Compute Mid Rect
-    int l = 0, t = 0, r = dst_width, b = dst_height;
-    for (; l * strideX - padX < 0 && l < dst_width; l++) {
-        // do nothing
-    }
-    for (; t * strideY - padY < 0 && t < dst_height; t++) {
-        // do nothing
-    }
-    for (; (r - 1) * strideX - padX + (kernel_width - 1) * dilateX >= src_width && r > l; r--) {
-        // do nothing
-    }
-    for (; (b - 1) * strideY - padY + (kernel_height - 1) * dilateY >= src_height && b > t; b--) {
-        // do nothing
-    }
-
-    auto postData = getPostParameters();
     auto batch = inputs[0]->batch();
     int total = batch * dst_depth_quad;
    int numberThread = ((CPUBackend*)backend())->threadNumber();
-    auto rt = static_cast<const CPURuntime*>(backend()->getRuntime());
-    auto runBasic = [=](uint8_t* dst_z, const uint8_t* src_z, const uint8_t* weight_dz, int L, int T, int R, int B) {
-        for (int dy = T; dy < B; ++dy) {
-            auto dst_y = dst_z + dy * dst_y_step * bytes;
-            int srcStartY = dy * strideY - padY;
-            const auto src_dy = src_z + srcStartY * src_y_step * bytes;
-            int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY)));
-            int efy = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY));
-            for (int dx = L; dx < R; ++dx) {
-                auto dst_x = dst_y + unit * dx * bytes;
-                int srcStartX = dx * strideX - padX;
-                const auto src_dx = src_dy + srcStartX * unit * bytes;
-                int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX)));
-                int efx = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX));
-                unitFunc((float*)dst_x, (const float*)(src_dx + (sfx * dilateX + sfy * dilateY * src_width) * unit * bytes),
-                         (const float*)(weight_dz + unit * (kernel_width * sfy + sfx) * bytes), efx - sfx, efy - sfy,
-                         unit * kernel_width, dilateX_step, dilateY_step);
-            }
-        }
-    };
     std::vector<int> divides(numberThread+1);
     divides[0] = 0;
-    rt->computeDivideSizes(total, divides.data()+1);
-    mExecutor = [=](const uint8_t* srcOrigin, uint8_t* dstOrigin, int tId) {
+    static_cast<CPUBackend *>(backend())->computeDivideSizes(total, divides.data()+1);
+    mNumber = numberThread;
+    auto postData = getPostParameters();
+    if (static_cast<CPUBackend*>(backend())->functions()->bytes < 4) {
+        static_cast<CPUBackend*>(backend())->functions()->MNNFp32ToLowp(postData.data() + 2, (int16_t*)(postData.data() + 2), 2);
+    }
+    mFastKernelApply = (dilateX == 1 && dilateY == 1 && strideX == 1 && strideY == 1 && core->MNNDepthwiseConvFastKernel);
+    if (mFastKernelApply ) { // Only support ARM kernel
+        kernelFunc = core->MNNDepthwiseConvFastKernel;
+    }
+    auto pads = ConvolutionCommon::convolutionPadFull(inputs[0], outputs[0], mCommon);
+    int paddedWidth = std::get<0>(pads) + std::get<2>(pads) + src_width;
+    int paddedHeight = std::get<1>(pads) + std::get<3>(pads) + src_height;
+    mInputPad.reset(Tensor::createDevice<float>({mNumber, paddedWidth * paddedHeight * unit}));
+    bool succ = backend()->onAcquireBuffer(mInputPad.get(), Backend::DYNAMIC);
+    if (!succ) {
+        return OUT_OF_MEMORY;
+    }
+    if (paddedWidth != src_width) {
+        dilateY_step = dilateY * paddedWidth * unit;
+        src_y_step = paddedWidth * unit;
+    }
+    mExecutor = [=](const uint8_t* inputPtr, uint8_t* outputPtr, int tId) {
+        const auto inputPadPtr = mInputPad->host<uint8_t>() + mInputPad->stride(0) * tId * bytes;
+        ::memset(inputPadPtr, 0, mInputPad->stride(0) * bytes);
         auto biasP = inputs[2]->host<uint8_t>();
         auto weightP = inputs[1]->host<uint8_t>();
         for (int index = divides[tId]; index < divides[tId+1]; ++index) {
 
             int dz = index / batch;
-            auto dst_z = dstOrigin + dst_z_step * index * bytes;
-            const auto src_z = srcOrigin + src_z_step * index * bytes;
+            auto dstOrigin = outputPtr + dst_z_step * index * bytes;
+            const auto srcOrigin = inputPtr + src_z_step * index * bytes;
             auto bias_z = biasP + unit * dz * bytes;
             const auto weight_dz = weightP + dz * weight_z_step * bytes;
-            runBasic(dst_z, src_z, weight_dz, 0, 0, dst_width, t);
-            runBasic(dst_z, src_z, weight_dz, 0, b, dst_width, dst_height);
-            runBasic(dst_z, src_z, weight_dz, 0, t, l, b);
-            runBasic(dst_z, src_z, weight_dz, r, t, dst_width, b);
-            if (r > l && b > t) {
-                lineFunc((float*)(dst_z + (t * dst_y_step + l * unit) * bytes),
-                         (const float*)(src_z + ((t * strideY - padY) * src_y_step + (l * strideX - padX) * unit) * bytes),
-                         (const float*)weight_dz, r - l, strideX * unit, kernel_width, kernel_height, dilateX_step,
-                         dilateY_step, b - t, src_y_step * strideY, dst_y_step);
-            }
-            postFunc((float*)dst_z, (float*)dst_z, (const float*)bias_z, dst_width * dst_height, 0, 0, 1, postData.data());
+            auto srcPtr = srcOrigin;
+            // Pad inputs
+            for (int y = 0; y < src_height; ++y) {
+                auto src = srcOrigin + y * src_width * unit * bytes;
+                auto dst = inputPadPtr + ((y + padY) * paddedWidth + padX) * unit * bytes;
+                ::memcpy(dst, src, src_width * unit * bytes);
+            }
+            // Compute
+            kernelFunc((float*)dstOrigin, (const float*)(inputPadPtr), (const float*)weight_dz, dst_width, strideX * unit, kernel_width, kernel_height, dilateX_step, dilateY_step, dst_height, src_y_step * strideY, dst_y_step, (const float*)bias_z, postData.data() + 2);
         }
     };
-    mNumber = numberThread;
+    backend()->onReleaseBuffer(mInputPad.get(), Backend::DYNAMIC);
 
     return NO_ERROR;
 }
 
@@ -281,11 +267,6 @@ public:
         if (inputs.empty()) {
            return new CPUConvolutionDepthwise::FloatExecution(conv2d->common(), backend, originWeight, originWeightSize, originBias, originBiasSize);
         }
-        auto core = static_cast<CPUBackend*>(backend)->functions();
-        if (conv->dilateX() == 1 && conv->dilateY() == 1 && conv->strideX() == 1 && conv->strideY() == 1 &&
-            conv->kernelX() == 3 && conv->kernelY() == 3 && outputs[0]->width() >= 2 && outputs[0]->height() >= 2 && core->MNNMultiAndDestTransformCommon23 != nullptr) {
-            return new ConvolutionDepthwise3x3(conv, backend, originWeight, originWeightSize, originBias, originBiasSize);
-        }
        return new CPUConvolutionDepthwise::FloatExecution(conv2d->common(), backend, originWeight, originWeightSize, originBias, originBiasSize);
     }
 };
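
Design-wise, the rewritten onResize trades the old five-way split (four runBasic border strips plus lineFunc on the interior, then a separate postFunc pass) for a zero-padded scratch plane per thread: copy the valid rows into the pad, then let one fused kernel cover borders and interior alike, with bias and clamp applied inline. A minimal compilable sketch of the per-slice flow, under assumed fp32 layout (names are illustrative; MNN's real buffer handling via mInputPad is elided):

#include <cstring>
#include <vector>

// Pad-then-run scheme for one channel-quad slice (unit = pack elements).
static void padAndRun(float* dst, const float* src, int srcW, int srcH,
                      int padX, int padY, int paddedW, int paddedH, int unit,
                      void (*fusedKernel)(float* dst, const float* paddedSrc)) {
    std::vector<float> padded((size_t)paddedW * paddedH * unit, 0.0f); // zero borders once
    for (int y = 0; y < srcH; ++y) {
        std::memcpy(padded.data() + (((size_t)(y + padY) * paddedW + padX) * unit),
                    src + (size_t)y * srcW * unit,
                    sizeof(float) * srcW * unit);
    }
    fusedKernel(dst, padded.data()); // conv + bias + min/max in one pass, no border cases
}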
@@ -26,7 +26,12 @@ public:
 
 private:
     std::function<void(const uint8_t *, uint8_t *, int)> mExecutor;
+    std::function<void(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
+                       size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
+                       size_t srcHStep, size_t dstHStep)> mFastKernel;
     int mNumber = 1;
+    std::shared_ptr<Tensor> mInputPad;
+    bool mFastKernelApply = false;
 };
 class MultiInputFloatExecution : public BasicFloatExecution {
 public:
@@ -142,7 +142,7 @@ ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector<Tensor*>& inputs, con
 
     int size_ = mMutableResource.mBiasInt32->length(0);
     if (core->ConvDepthwise3x3LineInt8_ARM82) {
-        if (kernel_width == 3 && kernel_height == 3 && strideX == 1 && strideY == 1 && dilateX == 1 && dilateY == 1 && gcore->MNNMultiAndDestTransformCommon23 != nullptr && dst_width >= 2 && dst_height >= 2) {
+        if (kernel_width == 3 && kernel_height == 3 && strideX == 1 && strideY == 1 && dilateX == 1 && dilateY == 1 && dst_width >= 2 && dst_height >= 2) {
             mUse3x3Kernel = true;
             mThreadFunction = core->ConvDepthwise3x3LineInt8_ARM82;
             UNIT = 4;
@@ -247,7 +247,7 @@ public:
 
         if (core->ConvDepthwise3x3LineInt8_ARM82) {
             if (common->kernelX() == 3 && common->kernelY() == 3 && common->strideX() == 1 && common->strideY() == 1 && common->dilateX() == 1
-                && common->dilateY() == 1 && gcore->MNNMultiAndDestTransformCommon23 != nullptr && outputs[0]->width() >= 2 && outputs[0]->height() >= 2) {
+                && common->dilateY() == 1 && outputs[0]->width() >= 2 && outputs[0]->height() >= 2) {
                 use3x3kernel = true;
                 UNIT = 4;
             }
@@ -98,8 +98,8 @@ ErrorCode CPUGridSample::onExecute(const std::vector<Tensor *> &inputs, const st
     auto outW = outputTensor->buffer().dim[4].extent;
     auto threadCount = static_cast<CPUBackend*>(backend())->threadNumber();
     auto tileCount = outD;
-    auto inOffset = batches * inH * inW * core->pack;
-    auto outOffset = batches * outH * outW * core->pack;
+    auto inOffset = batches * inD * inH * inW * core->pack;
+    auto outOffset = batches * outD * outH * outW * core->pack;
     auto cordPtr = mTempCordBuffer->host<uint8_t>();
     for (auto b = 0; b < batches; ++b) {
         auto _inputPtr = inputPtr + b * inD * inH * inW * core->pack * core->bytes;
@@ -109,10 +109,9 @@ ErrorCode CPUGridSample::onExecute(const std::vector<Tensor *> &inputs, const st
         // Compute cord
         MNN_CONCURRENCY_BEGIN(tId, threadCount) {
             for (int index=tId; index < tileCount; index += threadCount) {
-                auto c = index / outD;
-                auto d = index % outD;
-                auto inputC = _inputPtr + c * inD * inW * inH * batches * core->pack * core->bytes;
-                auto outputC = _outputPtr + c * outD * outW * outH * batches * core->pack * core->bytes;
+                auto d = index;
+                auto inputC = _inputPtr;
+                auto outputC = _outputPtr;
                 auto cordD = cordPtr + d * outH * outW * 3 * core->bytes;
                 auto outputD = outputC + d * outH * outW * core->pack * core->bytes;
                 for (int h = 0; h < outH; h++) {
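
The grid-sample hunk above fixes 5-D (packed NC/4DHW4) addressing: the tile loop now walks output depth only, with the channel dimension advanced by the per-quad inOffset/outOffset strides, which in turn gain the depth factor they were missing. Written out under that assumed layout: one channel-quad slice spans batches * D * H * W * pack elements, and the flat offset of element (d, h, w) inside a slice is ((d * height + h) * width + w) * pack, which is also why the MNNGridSampleComputeOffset3D hunk further below replaces the hard-coded 4 with PACK.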
@@ -1373,6 +1373,9 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
         }
         group.ids = _readNumber((const char*)buffer.get(), buffer.size());
     }
+    if (group.ids.empty()) {
+        continue;
+    }
     std::string minfreq = policyName + "/cpuinfo_min_freq";
     {
         MNN::AutoStorage<uint8_t> buffer;
@@ -1439,6 +1442,11 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
     _getInfoApple(cpuinfo_isa);
 #endif
 
+#if defined(__aarch64__) && defined(_WIN32)
+    cpuinfo_isa->fp16arith = true;
+    cpuinfo_isa->dot = true;
+#endif
 
     MNN_PRINT("The device supports: i8sdot:%d, fp16:%d, i8mm: %d, sve2: %d\n", cpuinfo_isa->dot, cpuinfo_isa->fp16arith, cpuinfo_isa->i8mm, cpuinfo_isa->sve2);
     return;
 }
@@ -138,7 +138,7 @@ static int MNNGridSampleComputeOffset3D(int d, int h, int w, int depth, int heig
         h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h);
         w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w);
     }
-    return ((d * height + h) * width + w) * 4;
+    return ((d * height + h) * width + w) * PACK;
 }
 
 static void MNNGridSampleInterp3D(FLOAT* outputPtr, const FLOAT* inputPtr, const FLOAT* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
@@ -30,7 +30,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?")
     if (MNN_SUPPORT_BF16)
         target_compile_options(MNNARM32 PRIVATE -DMNN_SUPPORT_BF16)
     endif()
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64")
     message(STATUS "Enabling AArch64 Assemblies")
     add_library(MNNARM64 OBJECT ${MNN_AArch64_SRC} ${MNN_NEON_SRC})
     target_include_directories(MNNARM64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/)
@@ -42,11 +42,6 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64")
         target_compile_options(MNNARM64 PRIVATE -DMNN_SUPPORT_BF16)
     endif()
 
-    if(MNN_ARM82)
-        message(STATUS "Enable INT8 SDOT")
-        target_compile_options(MNNARM64 PRIVATE -DENABLE_ARMV82)
-    endif()
-
 else()
 # Building fat binary requires multiple separate builds and lipo-by-hand under CMake's design
 endif()
@@ -34,9 +34,6 @@ void NEON_MNNPackedMatMul_BF16(float* C, const float* A, const float* B, const s
                                const float* postParameters, const float* bias, const float* k, const float* b);
 void NEON_MNNPackedMatMulRemain_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter,
                                      const float* postParameters, const float* bias, const float* k, const float* b);
-
-void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                          size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
 void NEON_MNNConvRunForLineDepthwise_BF16(float* dst, const float* src, const float* weight, size_t width,
                                           size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step,
                                           size_t height, size_t srcHStep, size_t dstHStep);
@@ -34,8 +34,16 @@ ldr r8, [sp, #48]
 ldr lr, [sp, #52]
 ldr r10, [sp, #56]
 ldr r11, [sp, #60]
+ldr r12, [sp, #64] // bias
+vld1.32 {q0}, [r12] // bias
+ldr r12, [sp, #68] // min,max
+vld1.32 {d2[0]}, [r12]!
+vld1.32 {d2[1]}, [r12]
 
 vpush {q4-q7}
+vmov.f32 q5, q0 // bias
+vdup.f32 q4, d2[0] // min
+vdup.f32 q6, d2[1] // max
 
 mov r12, #4
 mul r4, r12, r4
@@ -59,14 +67,14 @@ mov r12, #8
 mul r12, r4, r12
 
 L8Loop:
-vmov.i32 q8, #0
-vmov.i32 q9, #0
-vmov.i32 q10, #0
-vmov.i32 q11, #0
-vmov.i32 q12, #0
-vmov.i32 q13, #0
-vmov.i32 q14, #0
-vmov.i32 q15, #0
+vmov.f32 q8, q5 // use bias to init
+vmov.f32 q9, q5
+vmov.f32 q10, q5
+vmov.f32 q11, q5
+vmov.f32 q12, q5
+vmov.f32 q13, q5
+vmov.f32 q14, q5
+vmov.f32 q15, q5
 
 vmov.i32 d14[0], r1
 vmov.i32 d14[1], r2
@@ -103,6 +111,22 @@ L8Loop:
 bne L8LoopH
 
 sub r3, r3, #8
+vmax.f32 q8, q8, q4
+vmax.f32 q9, q9, q4
+vmax.f32 q10, q10, q4
+vmax.f32 q11, q11, q4
+vmax.f32 q12, q12, q4
+vmax.f32 q13, q13, q4
+vmax.f32 q14, q14, q4
+vmax.f32 q15, q15, q4
+vmin.f32 q8, q8, q6
+vmin.f32 q9, q9, q6
+vmin.f32 q10, q10, q6
+vmin.f32 q11, q11, q6
+vmin.f32 q12, q12, q6
+vmin.f32 q13, q13, q6
+vmin.f32 q14, q14, q6
+vmin.f32 q15, q15, q6
 vst1.32 {q8, q9}, [r0]!
 vmov.i32 r1, d14[0]
 vmov.i32 r2, d14[1]
@@ -121,13 +145,13 @@ mov r12, #4
 mul r12, r4, r12
 
 L4Loop:
-vmov.i32 q8, #0
-vmov.i32 q9, #0
-vmov.i32 q10, #0
-vmov.i32 q11, #0
+vmov.f32 q8, q5
+vmov.f32 q9, q5
+vmov.f32 q10, q5
+vmov.f32 q11, q5
 
-vmov.i32 d8[0], r1
-vmov.i32 d9[0], r2
+vmov.i32 d14[0], r1
+vmov.i32 d14[1], r2
 mov lr, r6
 L4LoopH:
 mov r10, r5
@@ -151,10 +175,18 @@ L4Loop:
 add r1, r1, r8
 bne L4LoopH
 
+vmax.f32 q8, q8, q4
+vmax.f32 q9, q9, q4
+vmax.f32 q10, q10, q4
+vmax.f32 q11, q11, q4
+vmin.f32 q8, q8, q6
+vmin.f32 q9, q9, q6
+vmin.f32 q10, q10, q6
+vmin.f32 q11, q11, q6
 sub r3, r3, #4
 vst1.32 {q8, q9}, [r0]!
-vmov.i32 r1, d8[0]
-vmov.i32 r2, d9[0]
+vmov.i32 r1, d14[0]
+vmov.i32 r2, d14[1]
 vst1.32 {q10, q11}, [r0]!
 add r1, r1, r12
 cmp r3, #4
@@ -168,7 +200,7 @@ cmp r3, #0
 beq End
 
 L1Loop:
-vmov.i32 q0, #0
+vmov.f32 q0, q5
 mov lr, r6
 mov r11, r1
 mov r12, r2
@@ -184,6 +216,8 @@ L1Loop:
 add r1, r1, r8
 bne L1LoopH
 
+vmax.f32 q0, q0, q4
+vmin.f32 q0, q0, q6
 subs r3, r3, #1
 vst1.32 {q0}, [r0]!
 mov r2, r12
@@ -203,6 +237,5 @@ bne LoopDY
 vpop {q4-q7}
 pop {r4-r8, r10, r11, pc}
 
-
 #endif
 #endif
@@ -1,74 +0,0 @@
-//
-// MNNConvRunForUnitDepthWise.S
-// MNN
-//
-// Created by MNN on 2019/02/04.
-// Copyright © 2018, Alibaba Group Holding Limited
-//
-
-#ifdef __arm__
-#ifndef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNConvRunForUnitDepthWise
-//void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
-
-//Auto: r0:dst, r1:src, r2:weight, r3:fw
-
-push {r4-r8, lr}
-
-//Load from sp:
-//r5:fh, r6:weight_y_step, r7:dilate_x_step, r8:dilate_y_step
-mov r4, r3
-ldr r5, [sp, #24]
-ldr r6, [sp, #28]
-ldr r7, [sp, #32]
-ldr r8, [sp, #36]
-
-cmp r4, #0
-vmov.i32 q0, #0
-beq UnitEnd
-cmp r5, #0
-beq UnitEnd
-
-mov lr, #4
-mul r6, lr, r6
-mul r7, lr, r7
-mul r8, lr, r8
-
-//dilate_y_step -> dilate_y_step - dilate_x_step*fw
-mul lr, r4, r7
-sub r8, r8, lr
-
-//weight_y_step -> weight_y_step - 4*sizeof(float)*fw
-mov lr, #16
-mul lr, r4, lr
-sub r6, r6, lr
-
-
-UnitLoopH:
-mov lr, r4
-UnitLoopW:
-vld1.32 {q1}, [r1], r7
-vld1.32 {q2}, [r2]!
-vmla.f32 q0, q1, q2
-subs lr, lr, #1
-bne UnitLoopW
-subs r5, r5, #1
-add r1, r1, r8
-add r2, r2, r6
-bne UnitLoopH
-
-
-UnitEnd:
-
-vst1.32 {q0}, [r0]
-
-pop {r4-r8, pc}
-
-#endif
-#endif
@@ -0,0 +1,221 @@
+//
+// MNNDepthwiseConvFastKernel.S
+// MNN
+//
+// Created by MNN on 2019/02/04.
+// Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifdef __arm__
+#ifndef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNDepthwiseConvFastKernel
+//void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
+//                                size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep)
+
+
+//Auto Load:
+//r0:dst, r1:src, r2:weight, r3:width
+
+push {r4-r8, r10, r11, lr}
+
+//Load From Sp
+//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, lr: height, r10:srcHStep, r11:dstHStep
+ldr r4, [sp, #32]
+ldr r5, [sp, #36]
+ldr r6, [sp, #40]
+ldr r7, [sp, #44]
+ldr r8, [sp, #48]
+ldr lr, [sp, #52]
+ldr r10, [sp, #56]
+ldr r11, [sp, #60]
+ldr r12, [sp, #64] // bias
+vld1.32 {q0}, [r12] // bias
+ldr r12, [sp, #68] // min,max
+vld1.32 {d2[0]}, [r12]!
+vld1.32 {d2[1]}, [r12]
+
+vpush {q4-q7}
+vmov.f32 q5, q0 // bias
+vdup.f32 q4, d2[0] // min
+vdup.f32 q6, d2[1] // max
+
+mov r12, #4
+mul r4, r12, r4
+mul r7, r12, r7
+mul r8, r12, r8
+mul r10, r12, r10
+mul r11, r12, r11
+
+//dilate_y_step -> dilate_y_step - fw*dilate_x_step
+mul r12, r5, r7
+sub r8, r8, r12
+
+LoopDY:
+push {r0, r1, r3, r10, r11, lr}
+
+L8:
+cmp r3, #7
+ble L4
+
+L8Loop:
+vmov.f32 q8, q5 // use bias to init
+vmov.f32 q9, q5
+vmov.f32 q10, q5
+vmov.f32 q11, q5
+vmov.f32 q12, q5
+vmov.f32 q13, q5
+vmov.f32 q14, q5
+vmov.f32 q15, q5
+
+mov r12, r1
+mov r4, r2
+mov lr, r6
+L8LoopH:
+mov r10, r5
+L8LoopW:
+vld1.32 {q7}, [r2]!
+vld1.32 {q0, q1}, [r1]!
+vld1.32 {q2, q3}, [r1]!
+subs r10, r10, #1
+vmla.f32 q8, q0, q7
+vmla.f32 q9, q1, q7
+vmla.f32 q10, q2, q7
+vmla.f32 q11, q3, q7
+vld1.32 {q0, q1}, [r1]!
+vld1.32 {q2, q3}, [r1]
+vmla.f32 q12, q0, q7
+vmla.f32 q13, q1, q7
+vmla.f32 q14, q2, q7
+vmla.f32 q15, q3, q7
+sub r1, r1, #80
+
+bne L8LoopW
+L8LoopWEnd:
+subs lr, lr, #1
+add r1, r1, r8
+bne L8LoopH
+
+sub r3, r3, #8
+vmax.f32 q8, q8, q4
+vmax.f32 q9, q9, q4
+vmax.f32 q10, q10, q4
+vmax.f32 q11, q11, q4
+vmax.f32 q12, q12, q4
+vmax.f32 q13, q13, q4
+vmax.f32 q14, q14, q4
+vmax.f32 q15, q15, q4
+vmin.f32 q8, q8, q6
+vmin.f32 q9, q9, q6
+vmin.f32 q10, q10, q6
+vmin.f32 q11, q11, q6
+vmin.f32 q12, q12, q6
+vmin.f32 q13, q13, q6
+vmin.f32 q14, q14, q6
+vmin.f32 q15, q15, q6
+vst1.32 {q8, q9}, [r0]!
+mov r1, r12
+mov r2, r4
+vst1.32 {q10, q11}, [r0]!
+vst1.32 {q12, q13}, [r0]!
+vst1.32 {q14, q15}, [r0]!
+add r1, r1, #128
+cmp r3, #8
+bge L8Loop
+
+L4:
+cmp r3, #3
+ble L1
+
+L4Loop:
+vmov.f32 q8, q5
+vmov.f32 q9, q5
+vmov.f32 q10, q5
+vmov.f32 q11, q5
+
+mov r12, r1
+mov r4, r2
+mov lr, r6
+L4LoopH:
+mov r10, r5
+L4LoopW:
+vld1.32 {q12}, [r2]!
+vld1.32 {q0, q1}, [r1]!
+vld1.32 {q2, q3}, [r1]
+sub r1, r1, #16
+subs r10, r10, #1
+vmla.f32 q8, q12, q0
+vmla.f32 q9, q12, q1
+vmla.f32 q10, q12, q2
+vmla.f32 q11, q12, q3
+
+bne L4LoopW
+subs lr, lr, #1
+add r1, r1, r8
+bne L4LoopH
+
+vmax.f32 q8, q8, q4
+vmax.f32 q9, q9, q4
+vmax.f32 q10, q10, q4
+vmax.f32 q11, q11, q4
+vmin.f32 q8, q8, q6
+vmin.f32 q9, q9, q6
+vmin.f32 q10, q10, q6
+vmin.f32 q11, q11, q6
+sub r3, r3, #4
+vst1.32 {q8, q9}, [r0]!
+mov r1, r12
+mov r2, r4
+vst1.32 {q10, q11}, [r0]!
+add r1, r1, #64
+cmp r3, #4
+bge L4Loop
+
+L1:
+cmp r3, #0
+beq End
+L1Loop:
+vmov.f32 q0, q5
+mov lr, r6
+mov r11, r1
+mov r12, r2
+L1LoopH:
+mov r10, r5
+L1LoopW:
+vld1.32 {q1}, [r1]!
+vld1.32 {q2}, [r2]!
+vmla.f32 q0, q1, q2
+subs r10, r10, #1
+bne L1LoopW
+subs lr, lr, #1
+add r1, r1, r8
+bne L1LoopH
+
+vmax.f32 q0, q0, q4
+vmin.f32 q0, q0, q6
+subs r3, r3, #1
+vst1.32 {q0}, [r0]!
+mov r2, r12
+add r1, r11, #16
+bne L1Loop
+
+
+End:
+
+pop {r0, r1, r3, r10, r11, lr}
+add r0, r0, r11
+subs lr, lr, #1
+add r1, r1, r10
+bne LoopDY
+
+vpop {q4-q7}
+pop {r4-r8, r10, r11, pc}
+
+
+#endif
+#endif
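For reference, the loop structure of the new kernel in plain C++ — a sketch under the assumptions that the pack is 4 floats, all strides here are in float elements (the assembly scales them to bytes up front), and the last pointer holds {min, max} as loaded from sp+#68; this is not the tuned kernel itself:

    // Plain-C++ reference for the loop nest of MNNDepthwiseConvFastKernel
    // (a sketch, not the optimized kernel; function name here is illustrative).
    #include <algorithm>
    #include <cstddef>

    void DepthwiseConvFastRef(float* dst, const float* src, const float* weight,
                              size_t width, size_t src_w_setup, size_t fw, size_t fh,
                              size_t dilateX_step, size_t dilateY_step, size_t height,
                              size_t srcHStep, size_t dstHStep,
                              const float* bias, const float* minmax) {
        for (size_t dy = 0; dy < height; ++dy) {
            for (size_t dx = 0; dx < width; ++dx) {
                // Start from bias, as the q5 copies do in the assembly.
                float acc[4] = {bias[0], bias[1], bias[2], bias[3]};
                const float* srcX = src + dy * srcHStep + dx * src_w_setup;
                const float* w = weight;
                for (size_t ky = 0; ky < fh; ++ky) {
                    for (size_t kx = 0; kx < fw; ++kx) {
                        const float* s = srcX + ky * dilateY_step + kx * dilateX_step;
                        for (int i = 0; i < 4; ++i) {
                            acc[i] += s[i] * w[i]; // vmla.f32
                        }
                        w += 4;
                    }
                }
                for (int i = 0; i < 4; ++i) {
                    dst[dy * dstHStep + dx * 4 + i] =
                        std::min(std::max(acc[i], minmax[0]), minmax[1]);
                }
            }
        }
    }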
@@ -65,9 +65,7 @@ ldr r12, [r6, #8] // int8 max
 str r12, [sp, #16]
 ldr r12, [r6, #12] // int8 min
 str r12, [sp, #20]
-ldr r12, [r6, #40] // blockNum
-mul r12, r12, r3 // src_depth_quad=src_depth_quad*blockNum
-lsl r12, r12, #6 // weight_stride = src_depth_quad*LP*HP
+lsl r12, r3, #6 // weight_stride = src_depth_quad*LP*HP
 str r12, [sp, #24]
 ldr r12, [r6, #48] // extraScale
 str r12, [sp, #28]

@@ -65,9 +65,7 @@ ldr r12, [r6, #32] // weightBias
 str r12, [sp, #8]
 ldr r12, [r6, #36] // f32minmax
 str r12, [sp, #12]
-ldr r12, [r6, #40] // blockNum
-mul r12, r12, r3 // src_depth_quad=src_depth_quad*blockNum
-lsl r12, r12, #5 // weight_stride = src_depth_quad*LP*HP
+lsl r12, r3, #5 // weight_stride = src_depth_quad*LP*HP
 str r12, [sp, #16]
 ldr r12, [r6, #48] // extraScale
 str r12, [sp, #20]
@@ -82,12 +80,14 @@ L2LoopDz:
 subs r12, r3, #1
 // first four output
 vld1.8 {q2}, [r1]!
-vld1.8 {q4}, [r2]! // weight, d8,d9,d10,d11
+vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
 // int4->int8
-vmov.i8 q5, #15
-vand.i8 q5, q5, q4
+vmov.i8 q6, #15
+vmov.i8 q7, #15
+vand.i8 q6, q6, q4
+vand.i8 q7, q7, q5
 vshr.u8 q4, q4, #4
-vzip.8 q4, q5
+vshr.u8 q5, q5, #4

 vmull.s8 q0, d4, d8
 vmull.s8 q1, d4, d10

@@ -95,12 +95,6 @@ L2LoopDz:
 vmlal.s8 q1, d5, d11
 vpaddl.s16 q8, q0
 vpaddl.s16 q9, q1
-vld1.8 {q6}, [r2]! // weight,d12,d13,d14,d15
-// int4->int8
-vmov.i8 q7, #15
-vand.i8 q7, q7, q6
-vshr.u8 q6, q6, #4
-vzip.8 q6, q7

 vmull.s8 q0, d4, d12
 vmull.s8 q1, d4, d14

@@ -129,22 +123,18 @@ L2LoopDz:
 L2LoopSz:
 // first four output
 vld1.8 {q2}, [r1]!
-vld1.8 {q4}, [r2]!
+vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
 // int4->int8
-vmov.i8 q5, #15
-vand.i8 q5, q5, q4
+vmov.i8 q6, #15
+vmov.i8 q7, #15
+vand.i8 q6, q6, q4
+vand.i8 q7, q7, q5
 vshr.u8 q4, q4, #4
-vzip.8 q4, q5
+vshr.u8 q5, q5, #4
 vmull.s8 q0, d4, d8
 vmull.s8 q1, d4, d10
 vmlal.s8 q0, d5, d9
 vmlal.s8 q1, d5, d11
-vld1.8 {q6}, [r2]!
-// int4->int8
-vmov.i8 q7, #15
-vand.i8 q7, q7, q6
-vshr.u8 q6, q6, #4
-vzip.8 q6, q7
 vpadal.s16 q8, q0
 vpadal.s16 q9, q1

@@ -269,12 +259,14 @@ L1LoopDz:
 subs r12, r3, #1
 // first four output
 vld1.8 {q2}, [r1]!
-vld1.8 {q4}, [r2]!
+vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
 // int4->int8
-vmov.i8 q5, #15
-vand.i8 q5, q5, q4
+vmov.i8 q6, #15
+vmov.i8 q7, #15
+vand.i8 q6, q6, q4
+vand.i8 q7, q7, q5
 vshr.u8 q4, q4, #4
-vzip.8 q4, q5
+vshr.u8 q5, q5, #4

 vmull.s8 q0, d4, d8
 vmull.s8 q1, d4, d10

@@ -282,12 +274,6 @@ L1LoopDz:
 vmlal.s8 q1, d5, d11
 vpaddl.s16 q8, q0
 vpaddl.s16 q9, q1
-vld1.8 {q6}, [r2]!
-// int4->int8
-vmov.i8 q7, #15
-vand.i8 q7, q7, q6
-vshr.u8 q6, q6, #4
-vzip.8 q6, q7

 vmull.s8 q0, d4, d12
 vmull.s8 q1, d4, d14

@@ -302,22 +288,18 @@ L1LoopDz:
 L1LoopSz:
 // first four output
 vld1.8 {q2}, [r1]!
-vld1.8 {q4}, [r2]!
+vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
 // int4->int8
-vmov.i8 q5, #15
-vand.i8 q5, q5, q4
+vmov.i8 q6, #15
+vmov.i8 q7, #15
+vand.i8 q6, q6, q4
+vand.i8 q7, q7, q5
 vshr.u8 q4, q4, #4
-vzip.8 q4, q5
+vshr.u8 q5, q5, #4
 vmull.s8 q0, d4, d8
 vmull.s8 q1, d4, d10
 vmlal.s8 q0, d5, d9
 vmlal.s8 q1, d5, d11
-vld1.8 {q6}, [r2]!
-// int4->int8
-vmov.i8 q7, #15
-vand.i8 q7, q7, q6
-vshr.u8 q6, q6, #4
-vzip.8 q6, q7
 vpadal.s16 q8, q0
 vpadal.s16 q9, q1
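The int4-to-int8 unpack above now loads two weight vectors at once and splits each into low and high nibbles with a mask and a shift, instead of the old single-load-plus-vzip sequence; this presumes the packed weight order already matches what the vmull/vmlal pairs expect. A scalar sketch of the unpack, with a hypothetical helper name:

    // Each int8 lane holds two int4 weights: the low nibble is isolated with
    // a 0x0F mask (vand.i8 with #15), the high nibble with a shift (vshr.u8 #4).
    #include <cstddef>
    #include <cstdint>

    static void unpackInt4(const uint8_t* packed, uint8_t* lo, uint8_t* hi, size_t n) {
        for (size_t i = 0; i < n; ++i) {
            lo[i] = packed[i] & 0x0F; // vand.i8 q6/q7, #15
            hi[i] = packed[i] >> 4;   // vshr.u8 q4/q5, #4
        }
    }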
@@ -26,6 +26,12 @@ ldr x8, [sp, #0]
 ldr x15, [sp, #8]
 ldr x10, [sp, #16]
 ldr x11, [sp, #24]
+ldr x12, [sp, #32]
+ldr x13, [sp, #40]
+
+stp d8, d9, [sp, #(-16 * 3)]!
+stp d10, d11, [sp, #(16 * 2)]
+stp x19, x20, [sp, #(16 * 1)]

 mov x9, #4
 mul x4, x9, x4

@@ -34,10 +40,32 @@ mul x8, x9, x8
 mul x10, x9, x10
 mul x11, x9, x11

+ld1 {v8.4s}, [x12] // bias
+ld1r {v10.4s}, [x13], #4 // min
+ld1r {v11.4s}, [x13]
+
 //dilate_y_step -> dilate_y_step - fw*dilate_x_step
 mul x9, x5, x7
 sub x8, x8, x9

+.macro assign_bias x0, x1, x2, x3
+mov \x0\().16b, v8.16b
+mov \x1\().16b, v8.16b
+mov \x2\().16b, v8.16b
+mov \x3\().16b, v8.16b
+.endm
+
+.macro compare_min_max x0, x1, x2, x3, xmin, xmax
+fmax \x0\().4s, \x0\().4s, \xmin\().4s
+fmax \x1\().4s, \x1\().4s, \xmin\().4s
+fmax \x2\().4s, \x2\().4s, \xmin\().4s
+fmax \x3\().4s, \x3\().4s, \xmin\().4s
+fmin \x0\().4s, \x0\().4s, \xmax\().4s
+fmin \x1\().4s, \x1\().4s, \xmax\().4s
+fmin \x2\().4s, \x2\().4s, \xmax\().4s
+fmin \x3\().4s, \x3\().4s, \xmax\().4s
+.endm
+
 LoopDY:
 mov v4.d[0], x10
 mov v4.d[1], x11

@@ -53,22 +81,10 @@ mov x12, #16
 mul x12, x4, x12

 L16Loop:
-movi v16.4s, #0
-movi v17.4s, #0
-movi v18.4s, #0
-movi v19.4s, #0
-movi v20.4s, #0
-movi v21.4s, #0
-movi v22.4s, #0
-movi v23.4s, #0
-movi v24.4s, #0
-movi v25.4s, #0
-movi v26.4s, #0
-movi v27.4s, #0
-movi v28.4s, #0
-movi v29.4s, #0
-movi v30.4s, #0
-movi v31.4s, #0
+assign_bias v16, v17, v18, v19
+assign_bias v20, v21, v22, v23
+assign_bias v24, v25, v26, v27
+assign_bias v28, v29, v30, v31

 mov x13, x1
 mov x14, x2

@@ -120,6 +136,10 @@ L16Loop:
 bne L16LoopH

 sub x3, x3, #16
+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
+compare_min_max v24, v25, v26, v27, v10, v11
+compare_min_max v28, v29, v30, v31, v10, v11
 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
 add x1, x13, x12
 cmp x3, #16

@@ -138,14 +158,8 @@ mov x12, #8
 mul x12, x4, x12

 L8Loop:
-movi v16.4s, #0
-movi v17.4s, #0
-movi v18.4s, #0
-movi v19.4s, #0
-movi v20.4s, #0
-movi v21.4s, #0
-movi v22.4s, #0
-movi v23.4s, #0
+assign_bias v16, v17, v18, v19
+assign_bias v20, v21, v22, v23

 mov x13, x1
 mov x14, x2

@@ -180,6 +194,8 @@ L8Loop:
 add x1, x1, x8
 bne L8LoopH

+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
 sub x3, x3, #8
 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
 add x1, x13, x12

@@ -195,10 +211,7 @@ mov x12, #4
 mul x12, x4, x12

 L4Loop:
-movi v16.4s, #0
-movi v17.4s, #0
-movi v18.4s, #0
-movi v19.4s, #0
+assign_bias v16, v17, v18, v19

 mov x13, x1
 mov x14, x2

@@ -225,6 +238,7 @@ L4Loop:
 add x1, x1, x8
 bne L4LoopH

+compare_min_max v16, v17, v18, v19, v10, v11
 sub x3, x3, #4
 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
 add x1, x13, x12

@@ -235,7 +249,7 @@ cmp x3, #0
 beq End

 L1Loop:
-movi v0.4s, #0
+mov v0.16b, v8.16b
 mov x9, x6
 mov x11, x1
 mov x12, x2

@@ -252,6 +266,8 @@ L1Loop:
 bne L1LoopH

 subs x3, x3, #1
+fmax v0.4s, v0.4s, v10.4s
+fmin v0.4s, v0.4s, v11.4s
 st1 {v0.4s}, [x0], #16
 mov x2, x12
 add x1, x11, x4

@@ -271,7 +287,9 @@ add x0, x0, x11
 add x1, x1, x10
 bne LoopDY

+ldp x19, x20, [sp, #(16 * 1)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d8, d9, [sp], #(16 * 3)
 ret
 //MNNConvRunForLineDepthwise End
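The two assembler macros introduced above replace the sixteen movi zero-fills with bias broadcasts and batch the post-clamp over four vector registers per invocation. In scalar form, one compare_min_max call amounts to the following sketch (a hypothetical helper, four registers of four lanes each):

    // One compare_min_max invocation: clamp four 4-float accumulators
    // between the broadcast min (v10) and max (v11), fmax then fmin.
    #include <algorithm>

    static inline void compareMinMax4(float (&acc)[4][4], float minV, float maxV) {
        for (int r = 0; r < 4; ++r) {        // four v-registers per macro call
            for (int i = 0; i < 4; ++i) {    // four lanes each
                acc[r][i] = std::min(std::max(acc[r][i], minV), maxV);
            }
        }
    }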
@@ -1,63 +0,0 @@
-//
-// MNNConvRunForUnitDepthWise.S
-// MNN
-//
-// Created by MNN on 2019/02/04.
-// Copyright © 2018, Alibaba Group Holding Limited
-//
-
-#ifdef __aarch64__
-
-#include "MNNAsmGlobal.h"
-
-.text
-.align 5
-
-asm_function MNNConvRunForUnitDepthWise
-//void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
-
-//Auto: x0:dst, x1:src, x2:weight, x3:fw
-//x4:fh, x5:weight_y_step, x6:dilate_x_step, x7:dilate_y_step
-
-cmp x3, #0
-movi v0.4s, #0
-beq UnitEnd
-cmp x4, #0
-beq UnitEnd
-
-mov x9, #4
-mul x5, x9, x5
-mul x6, x9, x6
-mul x7, x9, x7
-
-//dilate_y_step -> dilate_y_step - dilate_x_step*fw
-mul x9, x3, x6
-sub x7, x7, x9
-
-//weight_y_step -> weight_y_step - 4*sizeof(float)*fw
-mov x9, #16
-mul x9, x3, x9
-sub x5, x5, x9
-
-
-UnitLoopH:
-mov x9, x3
-UnitLoopW:
-ld1 {v1.4s}, [x1], x6
-ld1 {v2.4s}, [x2], #16
-fmla v0.4s, v1.4s, v2.4s
-subs x9, x9, #1
-bne UnitLoopW
-subs x4, x4, #1
-add x1, x1, x7
-add x2, x2, x5
-bne UnitLoopH
-
-
-UnitEnd:
-
-st1 {v0.4s}, [x0]
-
-ret
-
-#endif
@@ -0,0 +1,292 @@
+//
+// MNNDepthwiseConvFastKernel.S
+// MNN
+//
+// Created by MNN on 2024/09/18.
+// Copyright © 2018, Alibaba Group Holding Limited
+//
+
+
+#ifdef __aarch64__
+
+#include "MNNAsmGlobal.h"
+
+.text
+.align 5
+
+asm_function MNNDepthwiseConvFastKernel
+
+// void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
+//                                 size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
+//                                 size_t srcHStep, size_t dstHStep);
+//Auto Load:
+//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_step=pack*1, x5:fw, x6:fh, x7:dilate_x_step
+
+//Load From sp:
+//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13: minmax
+ldr x8, [sp, #0]
+ldr x15, [sp, #8]
+ldr x10, [sp, #16]
+ldr x11, [sp, #24]
+ldr x12, [sp, #32]
+ldr x13, [sp, #40]
+
+stp d14, d15, [sp, #(-16 * 9)]!
+stp d12, d13, [sp, #(16 * 1)]
+stp d10, d11, [sp, #(16 * 2)]
+stp d8, d9, [sp, #(16 * 3)]
+stp x21, x22, [sp, #(16 * 4)]
+stp x19, x20, [sp, #(16 * 5)]
+stp x27, x28, [sp, #(16 * 6)]
+stp x25, x26, [sp, #(16 * 7)]
+stp x23, x24, [sp, #(16 * 8)]
+
+lsl x4, x4, #2 // src_w_step*sizeof(float)
+lsl x7, x7, #2 // dilate_x_step*sizeof(float)
+lsl x8, x8, #2 // dilate_y_step*sizeof(float)
+lsl x23, x10, #2 // srcHStep*sizeof(float)
+lsl x24, x11, #2 // dstHStep*sizeof(float)
+mov x20, x12 // bias
+mov x26, x13 // min
+add x27, x13, #4 // max
+
+//dilate_y_step -> dilate_y_step - fw*dilate_x_step
+mul x9, x5, x7
+sub x8, x8, x9
+mov x25, x3 // width
+.macro assign_bias x0, x1, x2, x3, bv
+mov \x0\().16b, \bv\().16b
+mov \x1\().16b, \bv\().16b
+mov \x2\().16b, \bv\().16b
+mov \x3\().16b, \bv\().16b
+.endm
+
+.macro compare_min_max x0, x1, x2, x3, xmin, xmax
+fmax \x0\().4s, \x0\().4s, \xmin\().4s
+fmax \x1\().4s, \x1\().4s, \xmin\().4s
+fmax \x2\().4s, \x2\().4s, \xmin\().4s
+fmax \x3\().4s, \x3\().4s, \xmin\().4s
+fmin \x0\().4s, \x0\().4s, \xmax\().4s
+fmin \x1\().4s, \x1\().4s, \xmax\().4s
+fmin \x2\().4s, \x2\().4s, \xmax\().4s
+fmin \x3\().4s, \x3\().4s, \xmax\().4s
+.endm

+LoopDY:
+//mov x23, x10
+//mov x24, x11
+mov x21, x0
+mov x22, x1
+
+L16:
+cmp x3, #16
+blt L8
+
+mov x12, #-176
+mov x19, #256
+
+L16Loop:
+ld1 {v8.4s}, [x20] // load bias
+assign_bias v16, v17, v18, v19, v8
+assign_bias v20, v21, v22, v23, v8
+assign_bias v24, v25, v26, v27, v8
+assign_bias v28, v29, v30, v31, v8
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L16LoopH:
+mov x10, x5
+L16LoopW:
+ld1 {v8.4s}, [x2], #16
+ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
+ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64
+ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [x1], #64
+subs x10, x10, #1
+fmla v16.4s, v8.4s, v0.4s
+fmla v17.4s, v8.4s, v1.4s
+fmla v18.4s, v8.4s, v2.4s
+fmla v19.4s, v8.4s, v3.4s
+
+fmla v20.4s, v8.4s, v4.4s
+fmla v21.4s, v8.4s, v5.4s
+fmla v22.4s, v8.4s, v6.4s
+fmla v23.4s, v8.4s, v7.4s
+
+ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], x12
+
+fmla v24.4s, v8.4s, v9.4s
+fmla v25.4s, v8.4s, v10.4s
+fmla v26.4s, v8.4s, v11.4s
+fmla v27.4s, v8.4s, v12.4s
+
+fmla v28.4s, v8.4s, v0.4s
+fmla v29.4s, v8.4s, v1.4s
+fmla v30.4s, v8.4s, v2.4s
+fmla v31.4s, v8.4s, v3.4s
+
+bne L16LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L16LoopH
+ld1r {v10.4s}, [x26] // min
+ld1r {v11.4s}, [x27] // max
+sub x3, x3, #16
+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
+compare_min_max v24, v25, v26, v27, v10, v11
+compare_min_max v28, v29, v30, v31, v10, v11
+st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
+add x1, x13, x19 // 16 * pack * sizeof(float)
+cmp x3, #16
+mov x2, x14
+st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
+st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64
+st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64
+bge L16Loop
+
+
+L8:
+ld1r {v10.4s}, [x26] // min
+ld1r {v11.4s}, [x27] // max
+ld1 {v24.4s}, [x20] // load bias
+cmp x3, #7
+ble L4
+
+mov x12, #-48
+mov x19, #128
+
+
+L8Loop:
+assign_bias v16, v17, v18, v19, v24
+assign_bias v20, v21, v22, v23, v24
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L8LoopH:
+mov x10, x5
+L8LoopW:
+ld1 {v8.4s}, [x2], #16
+ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
+ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], x12
+subs x10, x10, #1
+fmla v16.4s, v8.4s, v0.4s
+fmla v17.4s, v8.4s, v1.4s
+fmla v18.4s, v8.4s, v2.4s
+fmla v19.4s, v8.4s, v3.4s
+
+fmla v20.4s, v8.4s, v4.4s
+fmla v21.4s, v8.4s, v5.4s
+fmla v22.4s, v8.4s, v6.4s
+fmla v23.4s, v8.4s, v7.4s
+
+bne L8LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L8LoopH
+
+compare_min_max v16, v17, v18, v19, v10, v11
+compare_min_max v20, v21, v22, v23, v10, v11
+sub x3, x3, #8
+st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
+add x1, x13, x19 // 8 * pack * sizeof(float)
+mov x2, x14
+st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
+
+
+L4:
+cmp x3, #4
+ble L1
+
+mov x12, #16
+mov x19, #64
+
+L4Loop:
+assign_bias v16, v17, v18, v19, v24
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L4LoopH:
+mov x10, x5
+L4LoopW:
+ld1 {v8.4s}, [x2], #16
+ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], x12
+subs x10, x10, #1
+fmla v16.4s, v8.4s, v0.4s
+fmla v17.4s, v8.4s, v1.4s
+fmla v18.4s, v8.4s, v2.4s
+fmla v19.4s, v8.4s, v3.4s
+
+bne L4LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L4LoopH
+
+compare_min_max v16, v17, v18, v19, v10, v11
+sub x3, x3, #4
+st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
+add x1, x13, x19
+mov x2, x14

+L1:
+cmp x3, #0
+beq End
+
+mov x19, #16
+
+L1Loop:
+ld1 {v16.4s}, [x20] // assign bias
+
+mov x13, x1
+mov x14, x2
+mov x9, x6
+L1LoopH:
+mov x10, x5
+L1LoopW:
+ld1 {v8.4s}, [x2], #16
+ld1 {v0.4s}, [x1], #16
+subs x10, x10, #1
+fmla v16.4s, v8.4s, v0.4s
+
+bne L1LoopW
+subs x9, x9, #1
+add x1, x1, x8
+bne L1LoopH
+
+subs x3, x3, #1
+fmax v16.4s, v16.4s, v10.4s
+fmin v16.4s, v16.4s, v11.4s
+st1 {v16.4s}, [x0], #16
+add x1, x13, x4
+mov x2, x14
+bne L1Loop
+
+
+End:
+
+//mov x10, x23
+//mov x11, x24
+//mov x0, x21
+//mov x1, x22
+mov x3, x25
+
+subs x15, x15, #1
+add x0, x21, x24
+add x1, x22, x23
+bne LoopDY
+
+ldp x23, x24, [sp, #(16 * 8)]
+ldp x25, x26, [sp, #(16 * 7)]
+ldp x27, x28, [sp, #(16 * 6)]
+ldp x19, x20, [sp, #(16 * 5)]
+ldp x21, x22, [sp, #(16 * 4)]
+ldp d8, d9, [sp, #(16 * 3)]
+ldp d10, d11, [sp, #(16 * 2)]
+ldp d12, d13, [sp, #(16 * 1)]
+ldp d14, d15, [sp], #(16 * 9)
+ret
+//MNNConvRunForLineDepthwise End
+
+#endif
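The new AArch64 kernel tiles the output width in tiers of 16, 8, 4 and finally 1 output vector, reloading bias and min/max once per tier. Roughly, the control flow is the following sketch (tier thresholds are simplified here; the arm32 and aarch64 versions differ slightly in their boundary comparisons):

    // Control-flow sketch of the width tiling: which tier of the kernel
    // consumes how many output vectors (hypothetical helper).
    #include <cstddef>
    #include <cstdio>

    static void tilePlan(size_t width) {
        size_t x = 0;
        while (width - x >= 16) { x += 16; std::printf("L16 tile\n"); } // v16..v31
        if (width - x >= 8)     { x += 8;  std::printf("L8 tile\n");  } // v16..v23
        if (width - x >= 4)     { x += 4;  std::printf("L4 tile\n");  } // v16..v19
        while (x < width)       { x += 1;  std::printf("L1 tile\n");  } // v16
    }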
@@ -118,8 +118,7 @@ stp x23, x24, [sp, #(16 * 6)]
 ldr x19, [x15, #56] // fp32 min max
 ldr x21, [x15, #64] // blockNum
 ldr x23, [x15, #80] // extraScale
-mul x21, x21, x3 // blockNum * src_depth_quad_perblock
-lsl x21, x21, #6 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int8_t)
+lsl x21, x3, #6 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int8_t)
 add x20, x19, #4

 Start:

@@ -125,9 +125,7 @@ stp x27, x28, [sp, #(16 * 6)]
 stp x25, x26, [sp, #(16 * 7)]
 stp x23, x24, [sp, #(16 * 8)]

-ldr x27, [x6, #64] // blockNum
-mul x27, x27, x3 // blockNum * src_depth_quad_perblock
-lsl x15, x27, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
+lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT

 ldr w28, [x6, #24] // useInt8
 ldr x25, [x6, #40] // xKernelSum

@@ -138,9 +138,7 @@ ldr w23, [x6, #24]
 ldr x27, [x6, #40] // srcKernelSum
 ldr x28, [x6, #48] // weightQuanBias

-ldr x22, [x6, #64] // blockNum
-mul x22, x22, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
-lsl x15, x22, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 = src_depth_quad << 6
+lsl x15, x3, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 = src_depth_quad << 6

 ldr x10, [x6, #80] // extra scale
 mov x21, #4 // sizeof(int8_t) * pack

@@ -55,8 +55,7 @@ mov x9, x6 // blockNum

 cbnz x12, TILE10_BLOCK_NUM
 ld1 {v5.4s, v6.4s}, [x2], #32
-ld1 {v7.d}[0], [x2]
-sub x2, x2, #32
+ld1 {v7.d}[0], [x2], #8

 TILE10_BLOCK_NUM:
 cbz x9, TILE10_END

@@ -315,4 +314,4 @@ ldp d10, d11, [sp, #(16 * 2)]
 ldp d12, d13, [sp, #(16 * 1)]
 ldp d14, d15, [sp], #(16 * 4)
 ret
 #endif

@@ -113,10 +113,8 @@ stp x21, x22, [sp, #(16 * 5)]
 stp x23, x24, [sp, #(16 * 6)]

 ldr x19, [x15, #56] // fp32 min max
-ldr x21, [x15, #64] // blockNum
 ldr x23, [x15, #80] // extraScale
-mul x21, x21, x3 // blockNum * src_depth_quad_perblock
-lsl x21, x21, #5 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int4_t)
+lsl x21, x3, #5 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int4_t)
 add x20, x19, #4

 Start:

@@ -124,9 +124,7 @@ stp x27, x28, [sp, #(16 * 6)]
 stp x25, x26, [sp, #(16 * 7)]
 stp x23, x24, [sp, #(16 * 8)]

-ldr x27, [x6, #64] // blockNum
-mul x27, x27, x3 // blockNum * src_depth_quad_perblock
-lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
+lsl x15, x3, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)

 ldr x25, [x6, #40] // xKernelSum
 ldr x26, [x6, #48] // weightQuantBias

@@ -116,9 +116,7 @@ stp x27, x28, [sp, #(16 * 8)]
 ldr x27, [x6, #40] // srcKernelSum
 ldr x28, [x6, #48] // weightQuanBias

-ldr x22, [x6, #64] // blockNum
-mul x22, x22, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
-lsl x15, x22, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 * (sizeof(int4)) = src_depth_quad << 4
+lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 * (sizeof(int4)) = src_depth_quad << 4

 mov x21, #16 // sizeof(float) * pack
 ldr x14, [x6, #56] // float32 maxmin ptr
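The hunks above drop the blockNum multiply when computing the weight stride: the stride is now derived from src_depth_quad alone, which assumes the reordered weights already lay blocks out contiguously (see the block-aware reorderWeight change further down). The arithmetic, as the inline comments state it, in a short sketch:

    // Weight-stride arithmetic after the change (a sketch). For the int8
    // path, SRC_UNIT * UNIT = 64 bytes per src_depth_quad step, hence a
    // shift by 6; the int4 path stores two weights per byte, hence one
    // shift less.
    #include <cstddef>

    static size_t weightStrideInt8(size_t src_depth_quad) {
        return src_depth_quad << 6; // src_depth_quad * SRC_UNIT * UNIT * sizeof(int8_t)
    }
    static size_t weightStrideInt4(size_t src_depth_quad) {
        return src_depth_quad << 5; // half the bytes: two int4 per int8
    }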
@@ -3028,203 +3028,6 @@ void MNNSigmoidLowp(float* dst, const float* src, size_t dataSize) {
 #endif
 #endif
 }

-void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameters) {
-    int unit = ow / 2;
-    MNN_ASSERT(cacheLineSize >= 1);
-    auto biasF = Vec4::load(bias);
-    auto minF = Vec4(parameters[2]);
-    auto maxF = Vec4(parameters[3]);
-    for (int x = 0; x < unit; ++x) {
-        auto offset = 4 * 4 * x;
-        int i = 0;
-        Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
-        Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
-        Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
-        Vec4 m3 = Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3);
-
-        for (i = 1; i < cacheLineSize; ++i) {
-            m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
-            m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
-            m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
-            m3 = m3 + Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3);
-        }
-
-        auto o0 = m0 + m1 + m2 + biasF;
-        auto o1 = m1 - m2 + m3 + biasF;
-        o0 = Vec4::min(maxF, o0);
-        o1 = Vec4::min(maxF, o1);
-        o0 = Vec4::max(minF, o0);
-        o1 = Vec4::max(minF, o1);
-        Vec4::save(dest + 8 * x + 0 * 4, o0);
-        Vec4::save(dest + 8 * x + 1 * 4, o1);
-    }
-    if (unit * 2 < ow) {
-        auto offset = 4 * 4 * unit;
-        int i = 0;
-        Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
-        Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
-        Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
-
-        for (i = 1; i < cacheLineSize; ++i) {
-            m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
-            m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
-            m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
-        }
-        auto o0 = m0 + m1 + m2 + biasF;
-        o0 = Vec4::min(maxF, o0);
-        o0 = Vec4::max(minF, o0);
-        Vec4::save(dest + 8 * unit + 0 * 4, o0);
-    }
-}
-extern "C" {
-void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit);
-}
-
-void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu) {
-    for (int x = 0; x < su; ++x) {
-        auto dstX = dest + 4 * 4 * x;
-        auto sx = x * 2 - (int)pad;
-        auto ex = sx + 4;
-
-        auto clampSx = std::max(sx, 0);
-        auto clampEx = std::min(ex, (int)iw);
-
-        Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-        for (int i = clampSx; i < clampEx; ++i) {
-            v[i - sx] = Vec4::load(source + 4 * i);
-        }
-        auto m0 = v[0] - v[2];
-        auto m1 = v[1] + v[2];
-        auto m2 = v[2] - v[1];
-        auto m3 = v[3] - v[1];
-
-        Vec4::save(dstX + 4 * 0, m0);
-        Vec4::save(dstX + 4 * 1, m1);
-        Vec4::save(dstX + 4 * 2, m2);
-        Vec4::save(dstX + 4 * 3, m3);
-    }
-    MNNConvDwF23SourceTransUnit(source + 4 * (su * 2 - pad), dest + 4 * 4 * su, eu - su);
-
-    for (int x = eu; x < unit; ++x) {
-        auto dstX = dest + 4 * 4 * x;
-        auto sx = x * 2 - (int)pad;
-        auto ex = sx + 4;
-
-        auto clampSx = std::max(sx, 0);
-        auto clampEx = std::min(ex, (int)iw);
-
-        Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
-        for (int i = clampSx; i < clampEx; ++i) {
-            v[i - sx] = Vec4::load(source + 4 * i);
-        }
-        auto m0 = v[0] - v[2];
-        auto m1 = v[1] + v[2];
-        auto m2 = v[2] - v[1];
-        auto m3 = v[3] - v[1];
-
-        Vec4::save(dstX + 4 * 0, m0);
-        Vec4::save(dstX + 4 * 1, m1);
-        Vec4::save(dstX + 4 * 2, m2);
-        Vec4::save(dstX + 4 * 3, m3);
-    }
-}
-
-#ifndef MNN_USE_NEON
-void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* parameters) {
-    int unit = ow / 2;
-    auto w00 = Vec4::load(weigth + 0 * 16 + 4 * 0);
-    auto w01 = Vec4::load(weigth + 0 * 16 + 4 * 1);
-    auto w02 = Vec4::load(weigth + 0 * 16 + 4 * 2);
-    auto w03 = Vec4::load(weigth + 0 * 16 + 4 * 3);
-    auto w10 = Vec4::load(weigth + 1 * 16 + 4 * 0);
-    auto w11 = Vec4::load(weigth + 1 * 16 + 4 * 1);
-    auto w12 = Vec4::load(weigth + 1 * 16 + 4 * 2);
-    auto w13 = Vec4::load(weigth + 1 * 16 + 4 * 3);
-    auto w20 = Vec4::load(weigth + 2 * 16 + 4 * 0);
-    auto w21 = Vec4::load(weigth + 2 * 16 + 4 * 1);
-    auto w22 = Vec4::load(weigth + 2 * 16 + 4 * 2);
-    auto w23 = Vec4::load(weigth + 2 * 16 + 4 * 3);
-    auto biasF = Vec4::load(bias);
-    auto minF = Vec4(parameters[2]);
-    auto maxF = Vec4(parameters[3]);
-    for (int x = 0; x < unit; ++x) {
-        auto offset = 4 * 4 * x;
-        int i = 0;
-        Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0);
-        Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1);
-        Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2);
-        Vec4 m3 = w03 * Vec4::load(cacheLine[0] + offset + 4 * 3);
-
-        m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0);
-        m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1);
-        m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2);
-        m3 = m3 + w13 * Vec4::load(cacheLine[1] + offset + 4 * 3);
-
-        m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0);
-        m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1);
-        m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2);
-        m3 = m3 + w23 * Vec4::load(cacheLine[2] + offset + 4 * 3);
-
-        auto o0 = m0 + m1 + m2 + biasF;
-        auto o1 = m1 - m2 + m3 + biasF;
-        o0 = Vec4::min(maxF, o0);
-        o1 = Vec4::min(maxF, o1);
-        o0 = Vec4::max(minF, o0);
-        o1 = Vec4::max(minF, o1);
-        Vec4::save(dest + 8 * x + 0 * 4, o0);
-        Vec4::save(dest + 8 * x + 1 * 4, o1);
-    }
-    if (unit * 2 < ow) {
-        auto offset = 4 * 4 * unit;
-        Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0);
-        Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1);
-        Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2);
-
-        m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0);
-        m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1);
-        m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2);
-
-        m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0);
-        m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1);
-        m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2);
-        auto o0 = m0 + m1 + m2 + biasF;
-        o0 = Vec4::min(maxF, o0);
-        o0 = Vec4::max(minF, o0);
-        Vec4::save(dest + 8 * unit + 0 * 4, o0);
-    }
-}
-void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit) {
-    if (unit <= 0) {
-        return;
-    }
-    Vec4 v0 = Vec4::load(source + 4 * 0);
-    Vec4 v1 = Vec4::load(source + 4 * 1);
-    Vec4 v2;
-    Vec4 v3;
-    source += 8;
-
-    for (int x = 0; x < unit; ++x) {
-        v2 = Vec4::load(source + 0 * 4);
-        v3 = Vec4::load(source + 1 * 4);
-        auto m0 = v0 - v2;
-        auto m1 = v1 + v2;
-        auto m2 = v2 - v1;
-        auto m3 = v3 - v1;
-
-        Vec4::save(dest + 4 * 0, m0);
-        Vec4::save(dest + 4 * 1, m1);
-        Vec4::save(dest + 4 * 2, m2);
-        Vec4::save(dest + 4 * 3, m3);
-
-        source += 8;
-        dest += 16;
-
-        v0 = v2;
-        v1 = v3;
-    }
-}
-#endif

 static void _MNNAdjustOptimalSparseKernel(int& sparseBlockOC, MNN::CoreFunctions::MNNPackedSparseMatMul& packedSparseMatMul) {
     if(sparseBlockOC == 4) {
         packedSparseMatMul = MNNPackedSparseMatMulEpx4;

@@ -3365,10 +3168,6 @@ void MNNCoreFunctionInit() {

     gCoreFunction->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnit;
     gCoreFunction->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwise;
-    gCoreFunction->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWise;
-    gCoreFunction->MNNSourceTransformCommonF23 = MNNSourceTransformCommonF23;
-    gCoreFunction->MNNConvDwF23MulTransUnit = MNNConvDwF23MulTransUnit;
-    gCoreFunction->MNNMultiAndDestTransformCommon23 = MNNMultiAndDestTransformCommon23;
     gCoreFunction->MNNMatrixAdd = MNNMatrixAdd;
     gCoreFunction->MNNMatrixSub = MNNMatrixSub;
     gCoreFunction->MNNStrassenMergeCFunction = MNNStrassenMergeCFunction;

@@ -3390,6 +3189,9 @@ void MNNCoreFunctionInit() {
     gCoreFunction->chooseWinoDestUnrollTransform = WinogradFunction::chooseWinoDestUnrollTransform;
     gCoreFunction->MNNDeconvRunForLineDepthwise = MNNDeconvRunForLineDepthwise;
     gCoreFunction->MNNDeconvRunForUnitDepthWise = MNNDeconvRunForUnitDepthWise;
+#ifdef MNN_USE_NEON
+    gCoreFunction->MNNDepthwiseConvFastKernel = MNNDepthwiseConvFastKernel;
+#endif
     gCoreFunction->MNNSelectBinaryFunctionForFloat = CPUBinary::selectForFloat;
     gCoreFunction->MNNSelectUnaryFunctionForFloat = CPUUnary::selectForFloat;
     gCoreFunction->MNNSelectUnaryFunctionForInt8 = CPUUnary::selectForInt8;

@@ -3514,4 +3316,4 @@ void MNNPackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth
         areaOffset,
     };
     MNNPackInt8C2(dst, src, area, depth, offset);
 }
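With MNNDepthwiseConvFastKernel registered only under MNN_USE_NEON and defaulting to nullptr (see the header change below), callers can dispatch on its presence. A minimal sketch of that pattern; the local struct here just mirrors the two function pointers for illustration and is not MNN's actual CoreFunctions type:

    // Prefer the registered fast kernel; fall back to the generic line kernel.
    #include <cstddef>

    using DwKernel = void (*)(float*, const float*, const float*, size_t, size_t,
                              size_t, size_t, size_t, size_t, size_t,
                              size_t, size_t, const float*, const float*);

    struct DwCoreSketch {
        DwKernel MNNConvRunForLineDepthwise = nullptr;
        DwKernel MNNDepthwiseConvFastKernel = nullptr; // set only under MNN_USE_NEON
    };

    static DwKernel pickDepthwiseKernel(const DwCoreSketch& core) {
        return core.MNNDepthwiseConvFastKernel ? core.MNNDepthwiseConvFastKernel
                                               : core.MNNConvRunForLineDepthwise;
    }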
@@ -170,9 +170,6 @@ struct MatMulParam {
 void MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId);

 void MNNCopyC4Int16WithStride(const float* sourceF, float* destF, size_t srcStride, size_t dstStride, size_t count);
-void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu);
-void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* postParameter);
-void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow);
 void MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count);

 struct SumByAxisParams {

@@ -267,15 +264,10 @@ struct CoreFunctions {
     void(*MNNUnpackCUnitTranspose)(float* dst, const float* src, size_t area, size_t depth, int* areaOffset);

     // NC4HW4's compute function
-    void(*MNNConvRunForUnitDepthWise)(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                      size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
     void(*MNNConvRunForLineDepthwise)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                       size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                      size_t srcHStep, size_t dstHStep);
+                                      size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
     void(*MNNAxByClampBroadcastUnit)(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters);
-    void(*MNNMultiAndDestTransformCommon23)(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* post);
-    void(*MNNSourceTransformCommonF23)(const float *source, float *dest, int unit, int iw, int pad, int su, int eu);
-    void(*MNNConvDwF23MulTransUnit)(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* post);
     void(*MNNMatrixAdd)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
                         size_t bStride, size_t height);
     void(*MNNMatrixSub)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,

@@ -309,6 +301,9 @@ struct CoreFunctions {
                                       size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
     void(*MNNDeconvRunForLineDepthwise)(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup,
                                         size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step);
+    void(*MNNDepthwiseConvFastKernel)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
+                                      size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
+                                      size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) = nullptr;
     void(*MNNReluWithSlopeChannel)(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad);
     void(*MNNPoolingAvg)(const void* channelInput, int inputWidth, int inputHeight, void *channelOutput,
                          int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth,
@@ -44,10 +44,14 @@ ErrorCode ConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, co
     return NO_ERROR;
 }

-void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack) {
+void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack, int blockNum) {
     auto weightDst = weight->host<uint8_t>();
     memset(weightDst, 0, weight->size());
-    if (SRC_UNIT > pack) {
+    int kernelCountUnit = weight->shape()[1];
+    int blockL = kernelCountUnit / blockNum;
+    int strideOutside = ROUND_UP(oc, UNIT) * SRC_UNIT * blockL;
+    int strideInside = weight->stride(0) / blockNum;
+    if (SRC_UNIT > pack) { // shape = {blockNum, UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack) / blockNum, UNIT, SRC_UNIT};
         auto icDivU = UP_DIV(ic, pack);
         for (int k = 0; k < kernelCount; ++k) {
             const auto srcK = weightSrc + k;

@@ -58,31 +62,37 @@ void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightS
                 const int ySubOutSide = yIndex / (SRC_UNIT / pack);
                 const int ySubInSide = yIndex % (SRC_UNIT / pack);

-                auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * pack + yInSide;
+                int blockId = ySubOutSide / blockL;
+                int blockInsideId = ySubOutSide % blockL;
+
+                auto dstY = weightDst + blockId * strideOutside + blockInsideId * weight->stride(1) + ySubInSide * pack + yInSide;
                 const auto srcY = srcK + y * kernelCount;
                 for (int x = 0; x < oc; ++x) {
                     const int xOutSide = x / UNIT;
                     const int xInSide = x % UNIT;
-                    const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT;
+                    const int dstIndex = xOutSide * strideInside + xInSide * SRC_UNIT;
                     const int srcIndex = x * kernelCount * ic;
                     dstY[dstIndex] = srcY[srcIndex];
                 }
             }
         }
-    } else {
+    } else { // shape = {blockNum, UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount / blockNum, UNIT, SRC_UNIT};
         for (int k = 0; k < kernelCount; ++k) {
             auto icDivU = UP_DIV(ic, SRC_UNIT);
             const auto srcK = weightSrc + k;
             for (int y = 0; y < ic; ++y) {
                 const int yOutSide = y / SRC_UNIT;
                 const int yInSide = y % SRC_UNIT;

-                auto dstY = weightDst + (yOutSide + k * icDivU) * weight->stride(1) + yInSide;
+                int blockId = (yOutSide + k * icDivU) / blockL;
+                int blockInsideId = (yOutSide + k * icDivU) % blockL;
+
+                auto dstY = weightDst + blockId * strideOutside + blockInsideId * weight->stride(1) + yInSide;
                 const auto srcY = srcK + y * kernelCount;
                 for (int x = 0; x < oc; ++x) {
                     const int xOutSide = x / UNIT;
                     const int xInSide = x % UNIT;
-                    const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT;
+                    const int dstIndex = xOutSide * strideInside + xInSide * SRC_UNIT;
                     const int srcIndex = x * kernelCount * ic;
                     dstY[dstIndex] = srcY[srcIndex];
                 }
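The block-aware reorderWeight above splits the packed L dimension into an outer block index and an inner offset before computing the destination address. The index arithmetic, extracted into a sketch that reuses the names from the diff:

    // Destination offset for one packed weight row (a sketch; stride1 stands
    // for weight->stride(1), and blockL is the per-block L extent).
    #include <cstddef>

    static size_t dstOffset(int ySubOutSide, int blockL, size_t strideOutside,
                            size_t stride1, int ySubInSide, int pack, int yInSide) {
        int blockId = ySubOutSide / blockL;       // which quantization block
        int blockInsideId = ySubOutSide % blockL; // position inside the block
        return blockId * strideOutside + blockInsideId * stride1
               + ySubInSide * pack + yInSide;
    }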
@@ -93,7 +103,8 @@ void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightS
 
 static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
                                  const std::shared_ptr<Tensor>& weightOrigin,
-                                 std::shared_ptr<Tensor>& weight) {
+                                 std::shared_ptr<Tensor>& weight, int blockNum) {
+    MNN_ASSERT(blockNum > 0);
     auto core = static_cast<CPUBackend*>(bn)->int8Functions();
     auto gcore = static_cast<CPUBackend*>(bn)->functions();
     int UNIT, SRC_UNIT, DST_XUNIT;
@@ -119,11 +130,11 @@ static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
         MNN_ERROR("Memory not enough");
         return false;
     }
-    ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host<uint8_t>(), SRC_UNIT, UNIT, ic, oc, kernelCount, pack);
+    ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host<uint8_t>(), SRC_UNIT, UNIT, ic, oc, kernelCount, pack, blockNum);
     return true;
 }
 
-static void GetResourceInt8(std::shared_ptr<CPUConvolution::ResourceInt8> resource, std::shared_ptr<ConvolutionCommon::Int8Common> quantCommon, const Convolution2D* conv2d, Backend* backend) {
+static void GetResourceInt8(std::shared_ptr<CPUConvolution::ResourceInt8> resource, std::shared_ptr<ConvolutionCommon::Int8Common> quantCommon, const Convolution2D* conv2d, Backend* backend, int32_t* blocknumPtr) {
     // common parameters
     int outputCount = conv2d->common()->outputCount();
     auto core = static_cast<CPUBackend*>(backend)->functions();
@@ -135,6 +146,7 @@ static void GetResourceInt8(std::shared_ptr<CPUConvolution::ResourceInt8> resour
         dequantCnt /= 2;
     }
     int blockNum = dequantCnt / outputCount;
+    blocknumPtr[0] = blockNum;
     int scaleSize = blockNum * ocUp4; // pack size.
     int blockSize = LSize / blockNum;
     int originOffset = 0;
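For orientation, blockNum falls out of simple counting: dequantCnt is the number of per-block dequant entries stored with the weights (halved first when scales and biases are interleaved, which is assumed in this sketch), so dividing by outputCount yields blocks per output channel. A hedged arithmetic check with made-up sizes:

    #include <cassert>

    // Sketch of the blockNum derivation above; oc and the entry count are
    // hypothetical example values, not taken from a real model.
    int main() {
        int outputCount = 64;         // oc
        int dequantCnt  = 64 * 4 * 2; // oc * blockNum * 2 (scale + bias)
        bool interleaved = true;      // assumed: scale and bias stored together
        if (interleaved) {
            dequantCnt /= 2;          // keep only the scale count
        }
        int blockNum = dequantCnt / outputCount;
        assert(blockNum == 4);        // 4 quantization blocks per channel
        return 0;
    }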
@@ -244,7 +256,9 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
     auto gcore = static_cast<CPUBackend*>(backend)->functions();
     mResourceInt8.reset(new CPUConvolution::ResourceInt8);
     mResourceInt8->mDynamicQuant = true;
-    GetResourceInt8(mResourceInt8, quanCommon, convOp, backend);
+    int blockNum = 1;
+    GetResourceInt8(mResourceInt8, quanCommon, convOp, backend, &blockNum);
+    mBlockNum = blockNum;
     // dynamic quant
     int UNIT, SRC_UNIT, DST_XUNIT;
     core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
@@ -285,10 +299,15 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
             // Pack two int4-weight to one int8-weight.
             int cnt = lP * hP / 4;
             int L = lU * lP;
+            int blockL = lU / blockNum;
+            int stride0 = (lP * hP) * hU * blockL;
+            int stride1 = (lP * hP) * blockL;
             for (int i = 0; i < hU; ++i) {
                 for (int j = 0; j < lU; ++j) {
+                    int blockId = j / blockL;
+                    int blockkInsideId = j % blockL;
                     for (int k = 0; k < cnt; ++k) {
-                        int dstIndx0 = (i * lU * lP * hP + j * lP * hP) / 2 + (2 * k);
+                        int dstIndx0 = (blockId * stride0 + i * stride1 + blockkInsideId * lP * hP) / 2 + (2 * k);
 
                         int hpId0 = (2 * k + 1) / lP;
                         int lpId0 = (2 * k) % lP;
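The division by 2 in dstIndx0 reflects that two int4 weights share one byte. A small sketch of the pack/unpack convention; the high-nibble-first order is an assumption for illustration, not read off this diff:

    #include <cstdint>
    #include <cassert>

    // Sketch: pack two 4-bit weights into one byte and recover them.
    // High-nibble-first order is assumed here for illustration only.
    int main() {
        uint8_t w0 = 0x9, w1 = 0x5;          // two int4 values
        uint8_t packed = (w0 << 4) | (w1 & 0xf);
        assert(((packed >> 4) & 0xf) == w0); // unpack high nibble
        assert((packed & 0xf) == w1);        // unpack low nibble
        return 0;
    }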
|
@ -322,7 +341,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
|
||||||
tmpWeight[2 * i + 1] = s1;
|
tmpWeight[2 * i + 1] = s1;
|
||||||
}
|
}
|
||||||
std::shared_ptr<Tensor> srcWeight(Tensor::create<uint8_t>({weightLength * 2}, (void*)tmpWeight.data()));
|
std::shared_ptr<Tensor> srcWeight(Tensor::create<uint8_t>({weightLength * 2}, (void*)tmpWeight.data()));
|
||||||
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8);
|
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8, blockNum);
|
||||||
if(!mValid) {
|
if(!mValid) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -349,7 +368,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
|
||||||
mResourceInt8->mWeightInt8 = weightLow;
|
mResourceInt8->mWeightInt8 = weightLow;
|
||||||
} else {
|
} else {
|
||||||
std::shared_ptr<Tensor> srcWeight(Tensor::create<uint8_t>({weightLength}, (void*)quanCommon->weight.get()));
|
std::shared_ptr<Tensor> srcWeight(Tensor::create<uint8_t>({weightLength}, (void*)quanCommon->weight.get()));
|
||||||
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8);
|
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8, blockNum);
|
||||||
if(!mValid) {
|
if(!mValid) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -429,7 +448,7 @@ static void _computeAlphaScale(Backend* backend, const Convolution2D* conv2d, st
|
||||||
DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr<ResourceInt8> res) : ConvInt8TiledExecutor(backend, op, res) {
|
DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr<ResourceInt8> res) : ConvInt8TiledExecutor(backend, op, res) {
|
||||||
std::shared_ptr<Tensor> weightOrigin = mResourceInt8->mWeightInt8;
|
std::shared_ptr<Tensor> weightOrigin = mResourceInt8->mWeightInt8;
|
||||||
auto convOp = op->main_as_Convolution2D();
|
auto convOp = op->main_as_Convolution2D();
|
||||||
mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8);
|
mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8, mBlockNum);
|
||||||
if(!mValid) {
|
if(!mValid) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -559,7 +578,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
|
||||||
|
|
||||||
mDivides.resize(threads+1);
|
mDivides.resize(threads+1);
|
||||||
mDivides[0] = 0;
|
mDivides[0] = 0;
|
||||||
static_cast<const CPURuntime*>(backend()->getRuntime())->computeDivideSizes(totalWork, mDivides.data() + 1);
|
static_cast<CPUBackend *>(backend())->computeDivideSizes(totalWork, mDivides.data() + 1);
|
||||||
for (int i = 0; i < mDivides.size(); ++i) {
|
for (int i = 0; i < mDivides.size(); ++i) {
|
||||||
mDivides[i] *= part;
|
mDivides[i] *= part;
|
||||||
}
|
}
|
||||||
|
@ -572,7 +591,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
|
||||||
mThreadNums = ALIMIN(threads, mTileCount);
|
mThreadNums = ALIMIN(threads, mTileCount);
|
||||||
mDivides.resize(threads+1);
|
mDivides.resize(threads+1);
|
||||||
mDivides[0] = 0;
|
mDivides[0] = 0;
|
||||||
static_cast<const CPURuntime*>(backend()->getRuntime())->computeDivideSizes(mTileCount, mDivides.data() + 1);
|
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTileCount, mDivides.data() + 1);
|
||||||
}
|
}
|
||||||
int ocUp4 = ROUND_UP(outC, gcore->pack);
|
int ocUp4 = ROUND_UP(outC, gcore->pack);
|
||||||
// int alphaSize = mResource->mDequantize.mScaleBias->size() / (sizeof(float) * 2);
|
// int alphaSize = mResource->mDequantize.mScaleBias->size() / (sizeof(float) * 2);
|
||||||
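Both hunks above swap the thread partitioning over to CPUBackend::computeDivideSizes. As the callers use it, the contract is to fill divides[1..threads] with cumulative end offsets over the total work, with divides[0] preset to 0 by the caller. A rough stand-in under that assumption (the real implementation may weight cores unevenly):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Rough stand-in for the divide-sizes contract assumed by the callers
    // above: divides[0] = 0, divides[i] = end offset of thread i-1's slice.
    // An even split is used here for illustration only.
    static void computeDivideSizesSketch(int totalWork, int threads, std::vector<int>& divides) {
        divides.assign(threads + 1, 0);
        for (int i = 1; i <= threads; ++i) {
            divides[i] = (int)((int64_t)totalWork * i / threads);
        }
    }

    int main() {
        std::vector<int> divides;
        computeDivideSizesSketch(100, 3, divides);
        for (int i = 0; i + 1 < (int)divides.size(); ++i) {
            std::printf("thread %d: [%d, %d)\n", i, divides[i], divides[i + 1]);
        }
        return 0;
    }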
|
@ -663,6 +682,9 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
|
||||||
|
|
||||||
auto inputDataPtr = input->host<int8_t>();
|
auto inputDataPtr = input->host<int8_t>();
|
||||||
auto im2colPtr = mTempIm2ColBuffer->host<int8_t>();
|
auto im2colPtr = mTempIm2ColBuffer->host<int8_t>();
|
||||||
|
if (SRC_UNIT > PackUnit) {
|
||||||
|
memset(im2colPtr, 0, mTempIm2ColBuffer->size());
|
||||||
|
}
|
||||||
const auto weightDataPtr = mResourceInt8->mWeightInt8->host<int8_t>();
|
const auto weightDataPtr = mResourceInt8->mWeightInt8->host<int8_t>();
|
||||||
auto srcKernelSumPtr = mTempSrcSum.data();
|
auto srcKernelSumPtr = mTempSrcSum.data();
|
||||||
auto weightDequantBias = mResourceInt8->mOriginScale->host<uint8_t>() + alphaSize * 4;
|
auto weightDequantBias = mResourceInt8->mOriginScale->host<uint8_t>() + alphaSize * 4;
|
||||||
|
@ -736,7 +758,6 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
|
||||||
dequantscale = range / 255.0f;
|
dequantscale = range / 255.0f;
|
||||||
zeropoint = roundf(-minVal * 255.f / range) - 128.0f;
|
zeropoint = roundf(-minVal * 255.f / range) - 128.0f;
|
||||||
}
|
}
|
||||||
std::vector<float>qsVec(PackUnit, quantscale);
|
|
||||||
auto sizeDiv = UP_DIV(inputsize, PackUnit);
|
auto sizeDiv = UP_DIV(inputsize, PackUnit);
|
||||||
int inputPlane = input->batch() * mIm2ColParamter.iw * mIm2ColParamter.ih;
|
int inputPlane = input->batch() * mIm2ColParamter.iw * mIm2ColParamter.ih;
|
||||||
if (gcore->bytes == 2 && gcore->pack == 8 && inputPlane > 1) { // C8->C4
|
if (gcore->bytes == 2 && gcore->pack == 8 && inputPlane > 1) { // C8->C4
|
||||||
|
@ -867,7 +888,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
|
||||||
const auto biasFloatTid = reinterpret_cast<float*>(biasPtr + ocIndex * 4);
|
const auto biasFloatTid = reinterpret_cast<float*>(biasPtr + ocIndex * 4);
|
||||||
const auto scaleFloatTid = reinterpret_cast<float*>(scalePtr + ocIndex * 4);
|
const auto scaleFloatTid = reinterpret_cast<float*>(scalePtr + ocIndex * 4);
|
||||||
const auto weightDequanBiasTid = reinterpret_cast<float*>(weightDequantBias + ocIndex * 4);
|
const auto weightDequanBiasTid = reinterpret_cast<float*>(weightDequantBias + ocIndex * 4);
|
||||||
const auto weightPtrTid = weightDataPtr + static_cast<int32_t>(ocIndex * kernelCountUnitDouble * SRC_UNIT * weightBytes);
|
const auto weightPtrTid = weightDataPtr + static_cast<int32_t>(ocIndex * blockL * SRC_UNIT * weightBytes);
|
||||||
if (mBlockNum == 1) {
|
if (mBlockNum == 1) {
|
||||||
quanParam.biasFloat = biasFloatTid;
|
quanParam.biasFloat = biasFloatTid;
|
||||||
quanParam.scale = scaleFloatTid;
|
quanParam.scale = scaleFloatTid;
|
||||||
|
@ -941,7 +962,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
|
||||||
quanParam.weightQuanBias = weightDequanBiasTid + k * ocUp4;
|
quanParam.weightQuanBias = weightDequanBiasTid + k * ocUp4;
|
||||||
quanParam.scale = (float*)(scaleFloatTid + k * ocUp4);
|
quanParam.scale = (float*)(scaleFloatTid + k * ocUp4);
|
||||||
|
|
||||||
mGemmKernel(outputInTilePtr, colAddrTemp + k * blockL * src_step_Y, weightPtrTid + k * blockL * weight_step_Y, blockL, dstZStep * dstBytes, ocDivThread, &quanParam, step);
|
mGemmKernel(outputInTilePtr, colAddrTemp + k * blockL * src_step_Y, weightPtrTid + k * blockL * weight_step_Y * UP_DIV(output->channel(), UNIT__), blockL, dstZStep * dstBytes, ocDivThread, &quanParam, step);
|
||||||
}
|
}
|
||||||
ptrX += (step * mBlockNum);
|
ptrX += (step * mBlockNum);
|
||||||
realDstCount-=step;
|
realDstCount-=step;
|
||||||
|
|
|
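With block quantization the GEMM runs once per block k: the im2col source advances by k * blockL * src_step_Y and the weight pointer by the per-block stride, where the extra UP_DIV(output->channel(), UNIT__) factor matches a layout that stores every oc tile of one block contiguously. A schematic of just the loop structure; the names are placeholders, not MNN API:

    #include <cstdint>

    // Schematic of the per-block GEMM dispatch wired up above. gemmBlock()
    // and all strides are placeholders; only the pointer arithmetic matters:
    // source advances by whole blocks of blockL tiles, weights additionally
    // skip all oc tiles belonging to one block.
    static void gemmBlock(const int8_t*, const int8_t*) { /* accumulate one block */ }

    void runBlocks(int blockNum, int blockL, int srcStepY, int weightStepY, int ocTiles,
                   const int8_t* src, const int8_t* weight) {
        for (int k = 0; k < blockNum; ++k) {
            const int8_t* srcK    = src + k * blockL * srcStepY;
            const int8_t* weightK = weight + k * blockL * weightStepY * ocTiles;
            gemmBlock(srcK, weightK); // block k's partial sums, scaled by its dequant params
        }
    }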
@@ -24,7 +24,7 @@ public:
     virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
     virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
     virtual void getPackParameter(int* Unit, int* SrcUnit, int* DestUnit, const CoreInt8Functions* core) = 0;
-    static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack);
+    static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack, int blockNum = 1);
 
 protected:
     ConvolutionCommon::Im2ColParameter mIm2ColParamter;
@@ -74,7 +74,7 @@ private:
     std::vector<int32_t> mDivides;
 
     int mThreadNums;
-    int mBlockNum;
+    int mBlockNum = 1;
     int mOcPerThread;
     bool mSplitByOc;
     bool mUseBatchQuan;
@@ -39,14 +39,17 @@ void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size
 
 void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                 size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                size_t srcHStep, size_t dstHStep) {
+                                size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
     int dx, fx, fy;
+    auto biasValue = Vec4::load(bias);
+    auto minF = Vec4(parameters[0]);
+    auto maxF = Vec4(parameters[1]);
     for (int y = 0; y < height; ++y) {
         auto srcY = src + y * srcHStep;
         auto dstY = dst + y * dstHStep;
         for (dx = 0; dx < width; ++dx) {
             float* dst_x = dstY + dx * 4;
-            Vec4 dstValue(0.0f);
+            auto dstValue = biasValue;
             const float* src_z = srcY + src_w_setup * dx;
             const float* weight_z = weight;
             for (fy = 0; fy < fh; ++fy) {
@@ -58,29 +61,13 @@ void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weigh
                     dstValue = dstValue + Vec4::load(src_x) * Vec4::load(weight_x);
                 }
             }
+            dstValue = Vec4::min(dstValue, maxF);
+            dstValue = Vec4::max(dstValue, minF);
             Vec4::save(dst_x, dstValue);
         }
     }
 }
 
-void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
-    int fx, fy;
-    Vec4 dstValue(0.0f);
-    const float* src_z    = src;
-    const float* weight_z = weight;
-    for (fy = 0; fy < fh; ++fy) {
-        const float* src_y    = src_z + fy * dilateY_step;
-        const float* weight_y = weight_z + fy * weight_y_step;
-        for (fx = 0; fx < fw; ++fx) {
-            const float* weight_x = weight_y + 4 * fx;
-            const float* src_x    = src_y + fx * dilateX_step;
-            dstValue = dstValue + Vec4::load(src_x) * Vec4::load(weight_x);
-        }
-    }
-    Vec4::save(dst, dstValue);
-}
-
 void MNNConvRunForUnitint8_t(float* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad,
                              size_t src_depth_step, size_t fw, size_t fh, size_t weight_y_step, size_t weight_z_step,
                              size_t dilateX_step, size_t dilateY_step, float* alpha) {
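The depthwise line kernel now receives bias and parameters, with parameters[0] and parameters[1] read as the clamp bounds; the accumulator starts from the bias and is clamped once before the store, removing the separate post pass. A scalar reference of that fused post-op, assuming that reading of parameters:

    #include <algorithm>

    // Scalar reference for the fused post-op added above. Seeding the
    // accumulator with bias and clamping at the end is equivalent to the
    // separate bias-add + clamp pass the old code path needed.
    float depthwisePostOp(float accSeededWithBias, const float* parameters) {
        float v = std::min(accSeededWithBias, parameters[1]); // maxF
        return std::max(v, parameters[0]);                    // minF
    }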
@@ -16,17 +16,19 @@
 extern "C" {
 #endif
 
-void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
 void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                 size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                size_t srcHStep, size_t dstHStep);
+                                size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
 
 void MNNDeconvRunForUnitDepthWise(const float* dst, float* src, const float* weight, size_t fw, size_t fh,
                                   size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
 void MNNDeconvRunForLineDepthwise(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup,
                                   size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step);
+
+void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
+                                size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
+                                size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
 
 void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
                   size_t bStride, size_t height);
 void MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
@@ -133,11 +133,10 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
     }
 #endif
     mWeightBytes = static_cast<float>(dequantBits) / 8.0f;
-    auto rt = static_cast<const CPURuntime*>(backend()->getRuntime());
     if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) {
         std::vector<int> divides(numberThread+1);
         divides[0] = 0;
-        rt->computeDivideSizes(matrixSizeE, divides.data()+1);
+        static_cast<CPUBackend *>(backend())->computeDivideSizes(matrixSizeE, divides.data()+1);
         mUnits.resize(numberThread);
         for (int i = 0; i < numberThread; ++i) {
             int planeStart = divides[i];
@@ -177,7 +176,7 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
         auto ocDiv = UP_DIV(ocC4, hDiv);
         std::vector<int> divides(numberThread+1);
         divides[0] = 0;
-        rt->computeDivideSizes(ocDiv, divides.data()+1);
+        static_cast<CPUBackend *>(backend())->computeDivideSizes(ocDiv, divides.data()+1);
         mUnits.resize(numberThread);
         for (int i = 0; i < numberThread; ++i) {
             int ocStart = divides[i] * hDiv;
@@ -1,221 +0,0 @@
-//
-//  ConvolutionDepthwise3x3.cpp
-//  MNN
-//
-//  Created by MNN on 2019/4/3.
-//  Copyright © 2018, Alibaba Group Holding Limited
-//
-
-#include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp"
-#include "backend/cpu/CPUBackend.hpp"
-#include "CommonOptFunction.h"
-#include "core/Concurrency.h"
-#include "core/Macro.h"
-
-namespace MNN {
-ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon* common, Backend* b) : CPUConvolution(common, b) {
-    mResource = resource;
-}
-
-ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b,
-                                                 const float *originWeight, size_t originWeightSize, const float *bias,
-                                                 size_t biasSize)
-    : CPUConvolution(common, b) {
-    MNN_ASSERT(3 == common->kernelX() && 3 == common->kernelY());
-    MNN_ASSERT(1 == common->strideX() && 1 == common->strideY());
-    MNN_ASSERT(1 == common->dilateX() && 1 == common->dilateY());
-    mResource.reset(new Resource);
-    mResource->backend = b;
-    auto core = static_cast<CPUBackend*>(b)->functions();
-    auto pack = core->pack;
-    auto bytes = core->bytes;
-    auto success = mResource->copyBiasAlign(bias, biasSize);
-    if (!success) {
-        mValid = false;
-        return;
-    }
-    auto channel = common->outputCount();
-    auto channelC4 = UP_DIV(channel, pack);
-    auto unitSize = channelC4 * pack * 3 * 4;
-    mResource->mWeight.reset(Tensor::createDevice<uint8_t>({unitSize * bytes}));
-    mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
-    if (!mValid) {
-        return;
-    }
-    AutoStorage<float> tempWeightStorge;
-    auto weightHost = mResource->mWeight->host<float>();
-    if (bytes < 4) {
-        // Lowp need extra float storage for transform
-        tempWeightStorge.reset(unitSize);
-        if (nullptr == tempWeightStorge.get()) {
-            mValid = false;
-            return;
-        }
-        weightHost = tempWeightStorge.get();
-    }
-    ::memset(weightHost, 0, unitSize * sizeof(float));
-    /* 1D-Winograd F(2,3) and tiling */
-    for (int c = 0; c < channel; ++c) {
-        auto cIndex = c / pack;
-        auto cRemain = c % pack;
-        auto weightDstZ = weightHost + cIndex * pack * 4 * 3 + cRemain;
-        auto weightSrcZ = originWeight + c * 9;
-        for (int y = 0; y < 3; ++y) {
-            auto k0 = weightSrcZ[3 * y + 0];
-            auto k1 = weightSrcZ[3 * y + 1];
-            auto k2 = weightSrcZ[3 * y + 2];
-
-            auto m0 = k0;
-            auto m1 = 0.5f * (k0 + k1 + k2);
-            auto m2 = 0.5f * (k0 - k1 + k2);
-            auto m3 = k2;
-
-            weightDstZ[(y * 4 + 0) * pack] = m0;
-            weightDstZ[(y * 4 + 1) * pack] = m1;
-            weightDstZ[(y * 4 + 2) * pack] = m2;
-            weightDstZ[(y * 4 + 3) * pack] = m3;
-        }
-    }
-    if (bytes < 4) {
-        core->MNNFp32ToLowp(weightHost, mResource->mWeight->host<int16_t>(), unitSize);
-    }
-}
-
-ConvolutionDepthwise3x3::~ConvolutionDepthwise3x3() {
-    // Do nothing
-}
-
-bool ConvolutionDepthwise3x3::onClone(Backend* bn, const Op* op, Execution** dst) {
-    if (nullptr == dst) {
-        return true;
-    }
-    auto dstExe = new ConvolutionDepthwise3x3(mResource, op->main_as_Convolution2D()->common(), bn);
-    *dst = dstExe;
-    return true;
-}
-
-ErrorCode ConvolutionDepthwise3x3::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
-    CPUConvolution::onResize(inputs, outputs);
-    const int numberThread = ((CPUBackend *)backend())->threadNumber();
-    auto output = outputs[0];
-    auto owUnit = UP_DIV(output->width(), 2);
-    auto core = static_cast<CPUBackend*>(backend())->functions();
-    // 3 cacheline
-    mCacheLine.reset(Tensor::createDevice<uint8_t>({numberThread, 3 * 4 * owUnit * core->pack * core->bytes}));
-    auto valid = backend()->onAcquireBuffer(mCacheLine.get(), Backend::DYNAMIC);
-    if (!valid) {
-        return OUT_OF_MEMORY;
-    }
-    backend()->onReleaseBuffer(mCacheLine.get(), Backend::DYNAMIC);
-    auto iw = inputs[0]->width();
-    mSourceStartX = UP_DIV(mPadX, 2);
-    mSourceEndX = std::max((iw + mPadX - 4) / 2, mSourceStartX);
-    mPostParameters = getPostParameters();
-    // auto rate = (float)(mSourceEndX-mSourceStartX) / (float)owUnit;
-    // FUNC_PRINT_ALL(rate, f);
-
-    int channelC4 = UP_DIV(inputs[0]->channel(), core->pack);
-    int batch = inputs[0]->batch();
-    auto total = channelC4 * batch;
-
-    mDivides.resize(numberThread+1);
-    mDivides[0] = 0;
-    static_cast<const CPURuntime*>(backend()->getRuntime())->computeDivideSizes(total, mDivides.data() + 1);
-
-    return NO_ERROR;
-}
-
-ErrorCode ConvolutionDepthwise3x3::onExecute(const std::vector<Tensor *> &inputs,
-                                             const std::vector<Tensor *> &outputs) {
-    auto input = inputs[0];
-    auto output = outputs[0];
-    auto core = static_cast<CPUBackend*>(backend())->functions();
-
-    int channelC4 = UP_DIV(input->channel(), core->pack);
-    int initSize = std::min(input->height(), 2);
-    int batch = input->batch();
-    int ow = output->width();
-    int oh = output->height();
-    int owUnit = UP_DIV(ow, 2);
-
-    auto iw = input->width();
-    auto ih = input->height();
-    auto kernelOrigin = mResource->mWeight->host<uint8_t>();
-
-    /*oy-mPadY>=0*/
-    int middelYStart = mPadY;
-
-    /*oy-mPadY+3-1 < ih*/
-    int middelYEnd = std::max(ih - 2 + mPadY, middelYStart);
-
-    int threadNumber = ((CPUBackend *)backend())->threadNumber();
-    auto maxKernelH = std::min(mPadY + ih, 3);
-    auto inputOrigin = input->host<uint8_t>();
-    auto outputOrigin = output->host<uint8_t>();
-    MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
-        auto cacheLineStart = mCacheLine->host<uint8_t>() + tId * mCacheLine->stride(0);
-        for (int index = mDivides[tId]; index < mDivides[tId+1]; ++index) {
-            int z = index / batch;
-            auto biasPtr = (const float*)(mResource->mBias->host<uint8_t>() + core->bytes * core->pack * z);
-            auto inputZ = inputOrigin + core->pack * index * iw * ih * core->bytes;
-            auto outputZ = outputOrigin + core->pack * index * ow * oh * core->bytes;
-            auto kernelZ = kernelOrigin + z * core->pack * core->bytes * 4 * 3;
-            auto cacheLine0 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 0;
-            auto cacheLine1 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 1;
-            auto cacheLine2 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 2;
-
-            float *cacheLine[3] = {(float*)cacheLine0, (float*)cacheLine1, (float*)cacheLine2};
-
-            // Init
-            for (int i = 0; i < initSize; ++i) {
-                core->MNNSourceTransformCommonF23((const float*)(inputZ + i * iw * core->bytes * core->pack), cacheLine[i], owUnit, iw, mPadX, mSourceStartX,
-                                            mSourceEndX);
-            }
-
-            // Compute Top
-            for (int y = 0; y < middelYStart; ++y) {
-                auto outputY = outputZ + y * core->bytes * core->pack * ow;
-                int cacheLineSize = y - mPadY + maxKernelH;
-                if (cacheLineSize <= 0) {
-                    ::memset(outputY, 0, core->bytes * ow * core->pack);
-                    core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data());
-                    continue;
-                }
-                auto kernelPtr = kernelZ + (maxKernelH - cacheLineSize) * 4 * core->pack * core->bytes;
-                cacheLineSize = std::min(cacheLineSize, ih);
-                core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelPtr, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data());
-            }
-
-            // Compute Mid
-            for (int y = middelYStart; y < middelYEnd; ++y) {
-                auto outputY = outputZ + y * core->bytes * core->pack * ow;
-                auto iy = y - mPadY + 2;
-                core->MNNSourceTransformCommonF23((float*)(inputZ + core->bytes * core->pack * iy * iw), cacheLine[2], owUnit, iw, mPadX, mSourceStartX,
-                                            mSourceEndX);
-                // FUNC_PRINT(ow);
-                core->MNNConvDwF23MulTransUnit(cacheLine, (float*)kernelZ, (float*)outputY, ow, biasPtr, mPostParameters.data());
-
-                auto temp = cacheLine[0];
-                cacheLine[0] = cacheLine[1];
-                cacheLine[1] = cacheLine[2];
-                cacheLine[2] = temp;
-            }
-
-            // Compute Bottom
-            for (int y = middelYEnd; y < oh; ++y) {
-                auto outputY = outputZ + y * core->bytes * core->pack * ow;
-                int cacheLineSize = (ih - y + mPadY);
-                if (cacheLineSize <= 0) {
-                    ::memset(outputY, 0, ow * core->bytes * core->pack);
-                    core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data());
-                    continue;
-                }
-                core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelZ, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data());
-                cacheLine[0] = cacheLine[1];
-                cacheLine[1] = cacheLine[2];
-            }
-        }
-    } MNN_CONCURRENCY_END();
-    return NO_ERROR;
-}
-} // namespace MNN
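For reference while reading the deletion above: the removed constructor pre-transformed each 3-tap kernel row with the 1D Winograd F(2,3) kernel transform, m = (k0, (k0 + k1 + k2) / 2, (k0 - k1 + k2) / 2, k2). A standalone check of that transform on one example row:

    #include <cstdio>

    // Standalone copy of the F(2,3) kernel-row transform from the deleted
    // ConvolutionDepthwise3x3 constructor, applied to one example row.
    int main() {
        float k0 = 1.0f, k1 = 2.0f, k2 = 3.0f; // example 3-tap row
        float m0 = k0;
        float m1 = 0.5f * (k0 + k1 + k2);
        float m2 = 0.5f * (k0 - k1 + k2);
        float m3 = k2;
        std::printf("%f %f %f %f\n", m0, m1, m2, m3); // 1 3 1 3
        return 0;
    }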
@@ -1,37 +0,0 @@
-//
-//  ConvolutionDepthwise3x3.hpp
-//  MNN
-//
-//  Created by MNN on 2019/4/3.
-//  Copyright © 2018, Alibaba Group Holding Limited
-//
-
-#ifndef ConvolutionDepthwise3x3_hpp
-#define ConvolutionDepthwise3x3_hpp
-
-#include "backend/cpu/CPUConvolution.hpp"
-
-namespace MNN {
-class ConvolutionDepthwise3x3 : public CPUConvolution {
-public:
-    ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b, const float *originWeight,
-                            size_t originWeightSize, const float *bias, size_t biasSize);
-    virtual ~ConvolutionDepthwise3x3();
-
-    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
-    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
-    virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
-private:
-    ConvolutionDepthwise3x3(std::shared_ptr<Resource> resource, const Convolution2DCommon* common, Backend* b);
-
-    std::shared_ptr<Resource> mResource;
-
-    std::unique_ptr<Tensor> mCacheLine;
-    int mSourceStartX = 0;
-    int mSourceEndX = 0;
-    std::vector<float> mPostParameters;
-    std::vector<int> mDivides;
-};
-} // namespace MNN
-
-#endif /* ConvolutionDepthwise3x3_hpp */
@@ -262,7 +262,7 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector<Tensor *> &inputs,
     // MNN_PRINT("ow=%d, oh=%d\n", ow, oh);
 
     std::vector<int> divides(threadNumber+1);
-    static_cast<const CPURuntime*>( static_cast<CPUBackend*>(backend())->getRuntime())->computeDivideSizes(totalCount, divides.data()+1);
+    static_cast<CPUBackend *>(backend())->computeDivideSizes(totalCount, divides.data()+1);
     divides[0] = 0;
     auto midBuffer0Bytes = srcUnit2 * pack * bytes;
     bool allow_x86_bf16_winograd = true;
@@ -542,7 +542,7 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector<Tensor *> &inputs,
     }
     };
     std::vector<int> postDivides(threadNumber+1);
-    static_cast<const CPURuntime*>( static_cast<CPUBackend*>(backend())->getRuntime())->computeDivideSizes(dc_4, postDivides.data()+1);
+    static_cast<CPUBackend *>(backend())->computeDivideSizes(dc_4, postDivides.data()+1);
     postDivides[0] = 0;
 
     mPostFunction.first = threadNumber;
@@ -541,7 +541,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
     auto rt = static_cast<const CPURuntime*>(backend()->getRuntime());
     std::vector<int> ocC4ParralSize(threadNumber + 1);
     ocC4ParralSize[0] = 0;
-    rt->computeDivideSizes(oC4, ocC4ParralSize.data()+1);
+    static_cast<CPUBackend *>(backend())->computeDivideSizes(oC4, ocC4ParralSize.data()+1);
     mFunction.second = [=](int placeholder) {
         const float* biasPtr = bias ? bias->host<float>() : nullptr;
         auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * 0;
@@ -583,7 +583,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
         }
         info[0] = 1;
         int hw4Stride = info[1] * unit * bytes;
-        rt->computeDivideSizes(number * icC4, im2colParallelSize.data() + 1);
+        static_cast<CPUBackend *>(backend())->computeDivideSizes(number * icC4, im2colParallelSize.data() + 1);
         im2colParallelSize[0] = 0;
         MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
             int threadEL[4];
@@ -672,7 +672,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
     std::vector<int> divides(threadNumber + 1);
     divides[0] = 0;
 
-    static_cast<const CPURuntime*>(static_cast<CPUBackend*>(backend())->getRuntime())->computeDivideSizes(tileCount, divides.data() + 1);
+    static_cast<CPUBackend *>(backend())->computeDivideSizes(tileCount, divides.data() + 1);
 
     mFunction.second = [=](int tId) {
         const float* biasPtr = bias ? bias->host<float>() : nullptr;
@@ -1416,12 +1416,7 @@ static void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, co
                                               size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount) {
     const int bytes = ((post->useInt8 == 1) ? 1 : 4);
     float fp32min = 0, fp32max = 0;
-    // if (0 == post->useInt8) {
-    //     fp32min = (post->fp32minmax)[0];
-    //     fp32max = (post->fp32minmax)[1];
-    // }
-    auto blockNum = post->blockNum;
-    int weight_step_Z = (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
+    int weight_step_Z = src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
     int weight_step_Y = (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
     const auto srcSumPtr = post->srcKernelSum;
     if (0 == post->useInt8 && post->fp32minmax) {
@@ -1486,7 +1481,7 @@ static void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src,
     uint32_t c = 0xf;
     const int bytes = 4;
     float fp32min = 0, fp32max = 0;
-    int weight_step_Z = 0.5 * (post->blockNum * src_depth_quad) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
+    int weight_step_Z = 0.5 * (src_depth_quad) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
     int weight_step_Y = 0.5 * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
     MNN_ASSERT(post->useInt8==0);
     if (post->fp32minmax) {
@@ -1495,7 +1490,6 @@ static void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src,
     }
 
     float* biasPtr = (float*)post->biasFloat;
-    int blockNum = post->blockNum;
 
     const auto srcSumPtr = post->srcKernelSum;
     for (int dz = 0; dz < dst_depth_quad; ++dz) {
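Both reference kernels drop post->blockNum from weight_step_Z. The reading here, consistent with the mGemmKernel call sites earlier in this diff, is that the caller now positions the weight pointer at the current block, so a dz step spans only one block's src_depth_quad tiles. An illustrative stride check with hypothetical tile sizes:

    #include <cassert>

    // Illustration of the stride change: the per-dz weight step no longer
    // multiplies by blockNum, because block offsets are applied by the caller.
    int main() {
        const int srcDepthQuad = 6, unitL = 16, unitH = 4; // hypothetical tile sizes
        int oldStepZ = srcDepthQuad * 2 /*blockNum*/ * unitL * unitH;
        int newStepZ = srcDepthQuad * unitL * unitH;
        assert(oldStepZ == 2 * newStepZ);
        return 0;
    }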
@@ -68,13 +68,12 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const
         fp32min = _mm256_set1_ps((post->fp32minmax)[0]);
         fp32max = _mm256_set1_ps((post->fp32minmax)[1]);
     }
-    int blockNum = post->blockNum;
     const float* biasPtr = nullptr;
     if (post->biasFloat) {
         biasPtr = post->biasFloat;
     }
 
-    int weight_step_Z = 0.5 * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
+    int weight_step_Z = 0.5 * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
     int weight_step_Y = 0.5 * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
     const __m128i mask = _mm_set1_epi8(0xf);
 
@@ -506,7 +505,6 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
         fp32min = _mm256_set1_ps((post->fp32minmax)[0]);
         fp32max = _mm256_set1_ps((post->fp32minmax)[1]);
     }
-    int blockNum = post->blockNum;
     const float* biasPtr = nullptr;
     if (post->biasFloat) {
         biasPtr = post->biasFloat;
@@ -554,7 +552,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
     //printf("e=%d, sz=%d, dz=%d\n", realDst, src_depth_quad, dst_depth_quad);
     if (GEMMINT8_AVX2_E == realDst) {
         for (int dz = 0; dz < dst_depth_quad; ++dz) {
-            const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
+            const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
             const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
             const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
             auto dst_z = dst + dz * dst_step_tmp;
@@ -683,7 +681,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
     }
     if (3 == realDst) {
         for (int dz = 0; dz < dst_depth_quad; ++dz) {
-            const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
+            const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
             const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
             const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
             auto dst_z = dst + dz * dst_step_tmp;
@@ -791,7 +789,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
     }
     if (2 == realDst) {
         for (int dz = 0; dz < dst_depth_quad; ++dz) {
-            const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
+            const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
             const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
             const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
             auto dst_z = dst + dz * dst_step_tmp;
@@ -879,7 +877,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
     }
     if (1 == realDst) {
         for (int dz = 0; dz < dst_depth_quad; ++dz) {
-            const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
+            const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
             const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
             const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
             auto dst_z = dst + dz * dst_step_tmp;
@@ -35,8 +35,6 @@ void _AVX_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int
 void _AVX_MNNRoiAlignMax(float* dst, const float* src, const std::vector<std::vector<int>> &vecPos, const std::vector<std::vector<float>> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth);
 void _AVX_MNNRoiAlignAvg(float* dst, const float* src, const std::vector<std::vector<int>> &vecPos, const std::vector<std::vector<float>> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth);
 void _AVX_MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, size_t eSub, size_t hSub);
-void _AVX_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                     size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
 void _AVX_MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameter);
 void _AVX_MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu);
 void _AVX_MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* parameter);
@@ -48,7 +46,7 @@ void _AVX_MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c
                                     size_t length, size_t hSub);
 void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                      size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                     size_t srcHStep, size_t dstHStep);
+                                     size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
 void _AVX_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters);
 }
 
@@ -108,40 +106,25 @@ void _AVX_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, si
     }
 }
 
-void _AVX_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                     size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
-    int fx, fy;
-    __m256 dstValue = _mm256_setzero_ps();
-    const float* src_z = src;
-    const float* weight_z = weight;
-    for (fy = 0; fy < fh; ++fy) {
-        const float* src_y = src_z + fy * dilateY_step;
-        const float* weight_y = weight_z + fy * weight_y_step;
-        for (fx = 0; fx < fw; ++fx) {
-            const float* weight_x = weight_y + PACK_UNIT * fx;
-            const float* src_x = src_y + fx * dilateX_step;
-            dstValue = _mm256_add_ps(dstValue, _mm256_mul_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x)));
-        }
-    }
-    _mm256_storeu_ps(dst, dstValue);
-}
-
 void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                      size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                     size_t srcHStep, size_t dstHStep) {
+                                     size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
     int dx, fx, fy;
     const int unit = 4;
     int widthUnit = width / unit;
     int widthRemain = width - widthUnit * unit;
     const float* weight_z = weight;
+    auto minF = _mm256_broadcast_ss(parameters + 0);
+    auto maxF = _mm256_broadcast_ss(parameters + 1);
+    auto bv = _mm256_loadu_ps(bias);
     for (int y = 0; y < height; ++y) {
         auto srcY = src + y * srcHStep;
         auto dstY = dst + y * dstHStep;
         for (dx = 0; dx < widthUnit; ++dx) {
-            auto dstValue0 = _mm256_setzero_ps();
-            auto dstValue1 = _mm256_setzero_ps();
-            auto dstValue2 = _mm256_setzero_ps();
-            auto dstValue3 = _mm256_setzero_ps();
+            auto dstValue0 = bv;
+            auto dstValue1 = bv;
+            auto dstValue2 = bv;
+            auto dstValue3 = bv;
             for (fy = 0; fy < fh; ++fy) {
                 const float* src_y = srcY + fy * dilateY_step;
                 const float* weight_y = weight_z + fy * fw * PACK_UNIT;
@@ -155,6 +138,14 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
                     dstValue3 = _mm256_add_ps(dstValue3, _mm256_mul_ps(_mm256_loadu_ps(src_x + 3 * src_w_setup), weightValue));
                 }
             }
+            dstValue0 = _mm256_min_ps(dstValue0, maxF);
+            dstValue1 = _mm256_min_ps(dstValue1, maxF);
+            dstValue2 = _mm256_min_ps(dstValue2, maxF);
+            dstValue3 = _mm256_min_ps(dstValue3, maxF);
+            dstValue0 = _mm256_max_ps(dstValue0, minF);
+            dstValue1 = _mm256_max_ps(dstValue1, minF);
+            dstValue2 = _mm256_max_ps(dstValue2, minF);
+            dstValue3 = _mm256_max_ps(dstValue3, minF);
             _mm256_storeu_ps(dstY + PACK_UNIT * 0, dstValue0);
             _mm256_storeu_ps(dstY + PACK_UNIT * 1, dstValue1);
             _mm256_storeu_ps(dstY + PACK_UNIT * 2, dstValue2);
@@ -164,7 +155,7 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
         }
         for (dx = 0; dx < widthRemain; ++dx) {
             float* dst_x = dstY + dx * PACK_UNIT;
-            auto dstValue = _mm256_setzero_ps();
+            auto dstValue = bv;
             const float* src_z = srcY + src_w_setup * dx;
             const float* weight_z = weight;
             for (fy = 0; fy < fh; ++fy) {
@@ -176,6 +167,8 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
                     dstValue = _mm256_add_ps(dstValue, _mm256_mul_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x)));
                 }
             }
+            dstValue = _mm256_min_ps(dstValue, maxF);
+            dstValue = _mm256_max_ps(dstValue, minF);
             _mm256_storeu_ps(dst_x, dstValue);
         }
     }
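The AVX path mirrors the scalar change: the bias is loaded once into bv to seed all four accumulators, and _mm256_min_ps/_mm256_max_ps clamp before each store. A minimal standalone demo of that clamp idiom:

    #include <immintrin.h>
    #include <cstdio>

    // Minimal demo of the clamp idiom used above: min against maxF, then
    // max against minF, i.e. clamp(v, minF, maxF) on 8 floats at once.
    int main() {
        __m256 v    = _mm256_set1_ps(9.0f);
        __m256 minF = _mm256_set1_ps(0.0f);
        __m256 maxF = _mm256_set1_ps(6.0f);
        v = _mm256_min_ps(v, maxF);
        v = _mm256_max_ps(v, minF);
        float out[8];
        _mm256_storeu_ps(out, v);
        std::printf("%f\n", out[0]); // 6.0: clamped to the upper bound
        return 0;
    }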
@@ -316,68 +309,6 @@ void _AVX_MNNGridSampleComputeCord(float* dst, const float* src, size_t inH, siz
     }
 }
 
-static size_t _AVX_MNNGridSampleComputeOffset(int h, int w, int height, int width, bool padMode) {
-    if (padMode == true) { //padMode == BorderMode_ZEROS
-        if (h < 0 || h >= height || w < 0 || w >= width) {
-            return -1;
-        }
-    } else {
-        // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER
-        // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1),
-        // the leftover reflections degrade to GridSamplePaddingMode_BORDER
-        h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h);
-        w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w);
-    }
-    return h * width * PACK_UNIT + w * PACK_UNIT;
-}
-
-void _AVX_MNNGridSampleInterp(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
-    for (auto ow = 0; ow < outW; ++ow) {
-        auto w = cordPtr[2 * ow + 0];
-        auto h = cordPtr[2 * ow + 1];
-        __m256 interp;
-
-        if (sampleMode == true) { //sampleMode == SampleMode_NEAREST
-            int nh = ::floor(h + 0.5f);
-            int nw = ::floor(w + 0.5f);
-            size_t ns = _AVX_MNNGridSampleComputeOffset(nh, nw, inH, inW, padMode);
-            for (int k = 0; k < channelCUnit; ++k) {
-                interp = ns == -1 ? _mm256_set1_ps(0.f) : _mm256_loadu_ps(inputPtr + k * inOffset + ns);
-                _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
-            }
-        } else { //sampleMode == GridSampleMode_BILINEAR
-            int w0_h = ::floor(h);
-            int w0_w = ::floor(w);
-            int w1_h = ::ceil(h);
-            int w1_w = ::ceil(w);
-            auto oneV = _mm256_set1_ps(1.0f);
-
-            auto f0 = _mm256_set1_ps((float)w1_w - w);
-            auto f1 = _mm256_sub_ps(oneV, f0);
-            auto h0 = _mm256_set1_ps((float)w1_h - h);
-            auto h1 = _mm256_sub_ps(oneV, h0);
-
-            size_t s00 = _AVX_MNNGridSampleComputeOffset(w0_h, w0_w, inH, inW, padMode);
-            size_t s01 = _AVX_MNNGridSampleComputeOffset(w0_h, w1_w, inH, inW, padMode);
-            size_t s10 = _AVX_MNNGridSampleComputeOffset(w1_h, w0_w, inH, inW, padMode);
-            size_t s11 = _AVX_MNNGridSampleComputeOffset(w1_h, w1_w, inH, inW, padMode);
-
-            for (int k = 0; k < channelCUnit; ++k) {
-                __m256 i00 = s00 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s00);
-                __m256 i01 = s01 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s01);
-                __m256 i10 = s10 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s10);
-                __m256 i11 = s11 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s11);
-
-                __m256 i0 = _mm256_add_ps(_mm256_mul_ps(i00, f0), _mm256_mul_ps(i01, f1));
-                __m256 i1 = _mm256_add_ps(_mm256_mul_ps(i10, f0), _mm256_mul_ps(i11, f1));
-
-                interp = _mm256_add_ps(_mm256_mul_ps(i0, h0), _mm256_mul_ps(i1, h1));
-                _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
-            }
-        }
-    }
-}
-
 void _AVX_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int iw) {
     Vec8 max = Vec8(-FLT_MAX);
     for (int h = 0; h < hLen; h++, src += iw * PACK_UNIT) {
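A note on the bilinear path being removed: f0 = w1_w - w and h0 = w1_h - h are the fractional distances to the ceil corner, so each output is a corner-weighted blend. A scalar check of that blend for one channel and one sample point:

    #include <cmath>
    #include <cstdio>

    // Scalar version of the bilinear blend from the removed
    // _AVX_MNNGridSampleInterp, for one channel and one sample point.
    int main() {
        float w = 1.25f, h = 2.5f;                   // sample coordinates
        float i00 = 1, i01 = 2, i10 = 3, i11 = 4;    // example corner values
        float f0 = std::ceil(w) - w, f1 = 1.0f - f0; // x weights
        float h0 = std::ceil(h) - h, h1 = 1.0f - h0; // y weights
        float i0 = i00 * f0 + i01 * f1;
        float i1 = i10 * f0 + i11 * f1;
        std::printf("%f\n", i0 * h0 + i1 * h1);      // 2.25
        return 0;
    }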
@ -524,70 +455,6 @@ static size_t _AVX_MNNGridSampleComputeOffset3D(int d, int h, int w, int depth,
|
||||||
     return ((d * height + h) * width + w) * PACK_UNIT;
 }

-void _AVX_MNNGridSampleInterp3D(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
-    for (auto ow = 0; ow < outW; ++ow) {
-        auto w = cordPtr[3 * ow + 0];
-        auto h = cordPtr[3 * ow + 1];
-        auto d = cordPtr[3 * ow + 2];
-        __m256 interp;
-
-        if (sampleMode == true) { //sampleMode == SampleMode_NEAREST
-            int nd = ::floor(d + 0.5f);
-            int nh = ::floor(h + 0.5f);
-            int nw = ::floor(w + 0.5f);
-            size_t ns = _AVX_MNNGridSampleComputeOffset3D(nd, nh, nw, inD, inH, inW, padMode);
-            for (int k = 0; k < channelCUnit; ++k) {
-                interp = ns == -1 ? _mm256_set1_ps(0.f) : _mm256_loadu_ps(inputPtr + k * inOffset + ns);
-                _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
-            }
-        } else { //sampleMode == GridSampleMode_BILINEAR
-            int w0_d = ::floor(d);
-            int w0_h = ::floor(h);
-            int w0_w = ::floor(w);
-            int w1_d = ::ceil(d);
-            int w1_h = ::ceil(h);
-            int w1_w = ::ceil(w);
-            auto oneV = _mm256_set1_ps(1.0f);
-
-            auto f0 = _mm256_set1_ps((float)w1_w - w);
-            auto f1 = _mm256_sub_ps(oneV, f0);
-            auto h0 = _mm256_set1_ps((float)w1_h - h);
-            auto h1 = _mm256_sub_ps(oneV, h0);
-            auto d0 = _mm256_set1_ps((float)w1_d - d);
-            auto d1 = _mm256_sub_ps(oneV, d0);
-
-            size_t s000 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w0_h, w0_w, inD, inH, inW, padMode);
-            size_t s001 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w0_h, w1_w, inD, inH, inW, padMode);
-            size_t s010 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w1_h, w0_w, inD, inH, inW, padMode);
-            size_t s011 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w1_h, w1_w, inD, inH, inW, padMode);
-            size_t s100 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w0_h, w0_w, inD, inH, inW, padMode);
-            size_t s101 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w0_h, w1_w, inD, inH, inW, padMode);
-            size_t s110 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w1_h, w0_w, inD, inH, inW, padMode);
-            size_t s111 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w1_h, w1_w, inD, inH, inW, padMode);
-
-            for (int k = 0; k < channelCUnit; ++k) {
-                __m256 i000 = s000 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s000);
-                __m256 i001 = s001 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s001);
-                __m256 i010 = s010 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s010);
-                __m256 i011 = s011 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s011);
-                __m256 i100 = s100 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s100);
-                __m256 i101 = s101 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s101);
-                __m256 i110 = s110 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s110);
-                __m256 i111 = s111 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s111);
-
-                __m256 i00 = _mm256_add_ps(_mm256_mul_ps(i000, f0), _mm256_mul_ps(i001, f1));
-                __m256 i01 = _mm256_add_ps(_mm256_mul_ps(i010, f0), _mm256_mul_ps(i011, f1));
-                __m256 i0 = _mm256_add_ps(_mm256_mul_ps(i00, h0), _mm256_mul_ps(i01, h1));
-                __m256 i10 = _mm256_add_ps(_mm256_mul_ps(i100, f0), _mm256_mul_ps(i101, f1));
-                __m256 i11 = _mm256_add_ps(_mm256_mul_ps(i110, f0), _mm256_mul_ps(i111, f1));
-                __m256 i1 = _mm256_add_ps(_mm256_mul_ps(i10, h0), _mm256_mul_ps(i11, h1));
-
-                interp = _mm256_add_ps(_mm256_mul_ps(i0, d0), _mm256_mul_ps(i1, d1));
-                _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
-            }
-        }
-    }
-}

 void _AVX_MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
                        size_t bStride, size_t height) {

@@ -867,13 +734,9 @@ void _AVX_ExtraInit(void* functions) {
     coreFunction->MNNMatrixAdd = _AVX_MNNMatrixAdd;
     coreFunction->MNNMatrixSub = _AVX_MNNMatrixSub;

-    coreFunction->MNNConvRunForUnitDepthWise = _AVX_MNNConvRunForUnitDepthWise;
     coreFunction->MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwise;
     coreFunction->MNNAxByClampBroadcastUnit = _AVX_MNNAxByClampBroadcastUnit;
     coreFunction->MNNStrassenMergeCFunction = _AVX_MNNStrassenMergeCFunction;
-    coreFunction->MNNMultiAndDestTransformCommon23 = _AVX_MNNMultiAndDestTransformCommon23;
-    coreFunction->MNNSourceTransformCommonF23 = _AVX_MNNSourceTransformCommonF23;
-    coreFunction->MNNConvDwF23MulTransUnit = _AVX_MNNConvDwF23MulTransUnit;
     coreFunction->MNNReluWithSlopeChannel = _AVX_MNNReluWithSlopeChannel;
     coreFunction->MNNDeconvRunForLineDepthwise = _AVX_MNNDeconvRunForLineDepthwise;
     coreFunction->MNNDeconvRunForUnitDepthWise = _AVX_MNNDeconvRunForUnitDepthWise;

@@ -881,7 +744,7 @@ void _AVX_ExtraInit(void* functions) {
     coreFunction->MNNGridSampleInterp = MNNGridSampleInterp;
     coreFunction->MNNGridSampleInterpGrad = MNNGridSampleInterpGrad;
     coreFunction->MNNGridSampleComputeCord3D = _AVX_MNNGridSampleComputeCord3D;
-    coreFunction->MNNGridSampleInterp3D = _AVX_MNNGridSampleInterp3D;
+    coreFunction->MNNGridSampleInterp3D = MNNGridSampleInterp3D;
     coreFunction->MNNRoiPoolingMax = _AVX_MNNRoiPoolingMax;
     coreFunction->MNNRoiAlignMax = _AVX_MNNRoiAlignMax;
     coreFunction->MNNRoiAlignAvg = _AVX_MNNRoiAlignAvg;

@@ -115,7 +115,6 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s
         fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
         fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
     }
-    auto blockNum = post->blockNum;
     const float* biasPtr = nullptr;
     const float* bias_dz = nullptr;
     const float* extraB_dz = nullptr;

@@ -162,7 +161,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s
             }
         }
     }
-    int weightZStride = blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
+    int weightZStride = src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
     if (realDst == GEMMINT8_AVX512_E) {
         for (int dz = 0; dz < dzU; ++dz) {
             auto weight_dz = weight + dz * weightZStride;

@@ -1452,7 +1451,6 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t
         fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
         fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
     }
-    auto blockNum = post->blockNum;
     const float* biasPtr = nullptr;
     const float* bias_dz = nullptr;
     const float* extraB_dz = nullptr;

@@ -1500,7 +1498,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t
             }
         }
     }
-    int weight_step_Z = static_cast<int32_t>(blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); // sizeof(int4_t)
+    int weight_step_Z = static_cast<int32_t>(src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); // sizeof(int4_t)
     int weight_step_Y = static_cast<int32_t>(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H / 2); // sizeof(int4_t)

     if (realDst == GEMMINT8_AVX512_E) {

@@ -105,7 +105,6 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s
         fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
         fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
     }
-    auto blockNum = post->blockNum;
     const float* biasPtr = nullptr;
     const float* bias_dz = nullptr;
     const float* extraB_dz = nullptr;

@@ -113,7 +112,7 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s
         biasPtr = post->biasFloat;
     }

-    int weightZStride = blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
+    int weightZStride = src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);

     auto srcKernelSumPtr = post->srcKernelSum;
     __m512 kernelSum0 = _mm512_setzero_ps();

@@ -1444,7 +1443,6 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight
         fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
         fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
     }
-    auto blockNum = post->blockNum;
     const float* biasPtr = nullptr;
     const float* bias_dz = nullptr;
     const float* extraB_dz = nullptr;

@@ -1458,7 +1456,7 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight
     __m512 kernelSum2 = _mm512_setzero_ps();
     __m512 kernelSum3 = _mm512_setzero_ps();

-    int weight_step_Z = static_cast<int32_t>(src_depth_quad * blockNum * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2);
+    int weight_step_Z = static_cast<int32_t>(src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2);
     int weight_step_Y = static_cast<int32_t>(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H / 2);
     const __m512i mask = _mm512_set1_epi8(0xf);
     if (GEMMINT8_AVX512_E == realDst) {
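Note: the recurring edit in the GEMM hunks above (and the SSE ones further down) is the same change. post->blockNum no longer folds into the per-output-channel weight stride, so weightZStride / weight_step_Z now spans a single quantization block's depth; presumably block iteration moved outside these kernels, though the diff alone does not show the caller. A hedged sketch of the new offset arithmetic, with all names illustrative:

    #include <cstddef>
    // Element offset of the weights for output-channel tile `dz`.
    // L and H stand in for GEMMINT8_AVX512_L / GEMMINT8_AVX512_H.
    size_t weightTileOffset(size_t dz, size_t srcDepthQuad, size_t L, size_t H) {
        // Old stride: dz * (blockNum * srcDepthQuad * L * H)
        // New stride: one quantization block's worth of depth per tile.
        return dz * (srcDepthQuad * L * H);
    }
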

@@ -124,40 +124,25 @@ void _AVX512_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B,
         }
     }
 }

-void _AVX512_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                        size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
-    int fx, fy;
-    __m512 dstValue = _mm512_setzero_ps();
-    const float* src_z = src;
-    const float* weight_z = weight;
-    for (fy = 0; fy < fh; ++fy) {
-        const float* src_y = src_z + fy * dilateY_step;
-        const float* weight_y = weight_z + fy * weight_y_step;
-        for (fx = 0; fx < fw; ++fx) {
-            const float* weight_x = weight_y + PACK_UNIT * fx;
-            const float* src_x = src_y + fx * dilateX_step;
-            dstValue = _mm512_fmadd_ps(_mm512_loadu_ps(src_x), _mm512_loadu_ps(weight_x), dstValue);
-        }
-    }
-    _mm512_storeu_ps(dst, dstValue);
-}
-
 void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                         size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                        size_t srcHStep, size_t dstHStep) {
+                                        size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
     int dx, fx, fy;
     const int unit = 4;
     int widthUnit = width / unit;
     int widthRemain = width - widthUnit * unit;
     const float* weight_z = weight;
+    auto minF = _mm512_broadcastss_ps(_mm_load_ss(parameters + 0));
+    auto maxF = _mm512_broadcastss_ps(_mm_load_ss(parameters + 1));
+    auto bv = _mm512_loadu_ps(bias);
     for (int y = 0; y < height; ++y) {
         auto srcY = src + y * srcHStep;
         auto dstY = dst + y * dstHStep;
         for (dx = 0; dx < widthUnit; ++dx) {
-            auto dstValue0 = _mm512_setzero_ps();
-            auto dstValue1 = _mm512_setzero_ps();
-            auto dstValue2 = _mm512_setzero_ps();
-            auto dstValue3 = _mm512_setzero_ps();
+            auto dstValue0 = bv;
+            auto dstValue1 = bv;
+            auto dstValue2 = bv;
+            auto dstValue3 = bv;
             for (fy = 0; fy < fh; ++fy) {
                 const float* src_y = srcY + fy * dilateY_step;
                 const float* weight_y = weight_z + fy * fw * PACK_UNIT;

@@ -171,6 +156,14 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa
                     dstValue3 = _mm512_fmadd_ps(_mm512_loadu_ps(src_x + 3 * src_w_setup), weightValue, dstValue3);
                 }
             }
+            dstValue0 = _mm512_min_ps(dstValue0, maxF);
+            dstValue1 = _mm512_min_ps(dstValue1, maxF);
+            dstValue2 = _mm512_min_ps(dstValue2, maxF);
+            dstValue3 = _mm512_min_ps(dstValue3, maxF);
+            dstValue0 = _mm512_max_ps(dstValue0, minF);
+            dstValue1 = _mm512_max_ps(dstValue1, minF);
+            dstValue2 = _mm512_max_ps(dstValue2, minF);
+            dstValue3 = _mm512_max_ps(dstValue3, minF);
             _mm512_storeu_ps(dstY + PACK_UNIT * 0, dstValue0);
             _mm512_storeu_ps(dstY + PACK_UNIT * 1, dstValue1);
             _mm512_storeu_ps(dstY + PACK_UNIT * 2, dstValue2);

@@ -180,7 +173,7 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa
         }
         for (dx = 0; dx < widthRemain; ++dx) {
             float* dst_x = dstY + dx * PACK_UNIT;
-            auto dstValue = _mm512_setzero_ps();
+            auto dstValue = bv;
             const float* src_z = srcY + src_w_setup * dx;
             const float* weight_z = weight;
             for (fy = 0; fy < fh; ++fy) {

@@ -192,6 +185,8 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa
                     dstValue = _mm512_fmadd_ps(_mm512_loadu_ps(src_x), _mm512_loadu_ps(weight_x), dstValue);
                 }
             }
+            dstValue = _mm512_min_ps(dstValue, maxF);
+            dstValue = _mm512_max_ps(dstValue, minF);
             _mm512_storeu_ps(dst_x, dstValue);
         }
     }

@@ -307,68 +302,6 @@ void _AVX512_MNNGridSampleComputeCord(float* dst, const float* src, size_t inH,
     }
 }

-static size_t _AVX512_MNNGridSampleComputeOffset(int h, int w, int height, int width, bool padMode) {
-    if (padMode == true) { //padMode == BorderMode_ZEROS
-        if (h < 0 || h >= height || w < 0 || w >= width) {
-            return -1;
-        }
-    } else {
-        // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER
-        // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1),
-        // the leftover reflections degrade to GridSamplePaddingMode_BORDER
-        h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h);
-        w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w);
-    }
-    return h * width * PACK_UNIT + w * PACK_UNIT;
-}
-
-void _AVX512_MNNGridSampleInterp(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
-    for (auto ow = 0; ow < outW; ++ow) {
-        auto w = cordPtr[2 * ow + 0];
-        auto h = cordPtr[2 * ow + 1];
-        __m512 interp;
-
-        if (sampleMode == true) { //sampleMode == SampleMode_NEAREST
-            int nh = ::floor(h + 0.5f);
-            int nw = ::floor(w + 0.5f);
-            size_t ns = _AVX512_MNNGridSampleComputeOffset(nh, nw, inH, inW, padMode);
-            for (int k = 0; k < channelCUnit; ++k) {
-                interp = ns == -1 ? _mm512_set1_ps(0.f) : _mm512_loadu_ps(inputPtr + k * inOffset + ns);
-                _mm512_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
-            }
-        } else { //sampleMode == GridSampleMode_BILINEAR
-            int w0_h = ::floor(h);
-            int w0_w = ::floor(w);
-            int w1_h = ::ceil(h);
-            int w1_w = ::ceil(w);
-            auto oneV = _mm512_set1_ps(1.0f);
-
-            auto f0 = _mm512_set1_ps((float)w1_w - w);
-            auto f1 = _mm512_sub_ps(oneV, f0);
-            auto h0 = _mm512_set1_ps((float)w1_h - h);
-            auto h1 = _mm512_sub_ps(oneV, h0);
-
-            size_t s00 = _AVX512_MNNGridSampleComputeOffset(w0_h, w0_w, inH, inW, padMode);
-            size_t s01 = _AVX512_MNNGridSampleComputeOffset(w0_h, w1_w, inH, inW, padMode);
-            size_t s10 = _AVX512_MNNGridSampleComputeOffset(w1_h, w0_w, inH, inW, padMode);
-            size_t s11 = _AVX512_MNNGridSampleComputeOffset(w1_h, w1_w, inH, inW, padMode);
-
-            for (int k = 0; k < channelCUnit; ++k) {
-                __m512 i00 = s00 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s00);
-                __m512 i01 = s01 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s01);
-                __m512 i10 = s10 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s10);
-                __m512 i11 = s11 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s11);
-
-                __m512 i0 = _mm512_add_ps(_mm512_mul_ps(i00, f0), _mm512_mul_ps(i01, f1));
-                __m512 i1 = _mm512_add_ps(_mm512_mul_ps(i10, f0), _mm512_mul_ps(i11, f1));
-
-                interp = _mm512_add_ps(_mm512_mul_ps(i0, h0), _mm512_mul_ps(i1, h1));
-                _mm512_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
-            }
-        }
-    }
-}
-
 void _AVX512_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int iw) {
     Vec16 max = Vec16(-FLT_MAX);
     for (int h = 0; h < hLen; h++, src += iw * PACK_UNIT) {

@@ -752,13 +685,9 @@ void _AVX512_ExtraInit(void* functions) {
     coreFunction->MNNCountMaxMinValue = _AVX512_MNNComputeScaleZeroScalar;
     coreFunction->MNNAbsMax = _AVX512_MNNAbsMaxFP32;

-    coreFunction->MNNConvRunForUnitDepthWise = _AVX512_MNNConvRunForUnitDepthWise;
     coreFunction->MNNConvRunForLineDepthwise = _AVX512_MNNConvRunForLineDepthwise;
     coreFunction->MNNAxByClampBroadcastUnit = _AVX512_MNNAxByClampBroadcastUnit;
     coreFunction->MNNStrassenMergeCFunction = _AVX512_MNNStrassenMergeCFunction;
-    coreFunction->MNNMultiAndDestTransformCommon23 = _AVX512_MNNMultiAndDestTransformCommon23;
-    coreFunction->MNNSourceTransformCommonF23 = _AVX512_MNNSourceTransformCommonF23;
-    coreFunction->MNNConvDwF23MulTransUnit = _AVX512_MNNConvDwF23MulTransUnit;
     coreFunction->MNNReluWithSlopeChannel = _AVX512_MNNReluWithSlopeChannel;
     coreFunction->MNNDeconvRunForLineDepthwise = _AVX512_MNNDeconvRunForLineDepthwise;
     coreFunction->MNNDeconvRunForUnitDepthWise = _AVX512_MNNDeconvRunForUnitDepthWise;

@@ -767,6 +696,7 @@ void _AVX512_ExtraInit(void* functions) {
     coreFunction->MNNRoiAlignMax = _AVX512_MNNRoiAlignMax;
     coreFunction->MNNRoiAlignAvg = _AVX512_MNNRoiAlignAvg;
     coreFunction->MNNGridSampleInterp = MNNGridSampleInterp;
+    coreFunction->MNNGridSampleInterp3D = MNNGridSampleInterp3D;
     coreFunction->MNNGridSampleInterpGrad = MNNGridSampleInterpGrad;

     coreFunction->MNNGetSparseMatMulPackMode = _AVX512_MNNGetSparseMatMulPackMode;
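Note: the depthwise line kernels in the hunk above and in the AVX-FMA/SSE files below all pick up the same signature change: two new arguments, bias and parameters, so that bias addition and min/max clamping are fused into the convolution loop instead of running as a separate post pass. A scalar sketch of what one output pixel now computes; illustrative only, not the shipped kernel:

    // parameters[0] / parameters[1] carry the activation's min / max bounds.
    float depthwiseOne(const float* src, const float* weight, int fw, int fh,
                       int dilateX, int dilateY, float bias, float minV, float maxV) {
        float acc = bias; // the accumulator starts at the bias now
        for (int fy = 0; fy < fh; ++fy) {
            for (int fx = 0; fx < fw; ++fx) {
                acc += src[fy * dilateY + fx * dilateX] * weight[fy * fw + fx];
            }
        }
        if (acc > maxV) acc = maxV; // clamp replaces the old post pass
        if (acc < minV) acc = minV;
        return acc;
    }
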

@@ -11,40 +11,25 @@

 #define PACK_UNIT 8

-void _AVX_MNNConvRunForUnitDepthWiseFMA(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
-                                        size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
-    int fx, fy;
-    __m256 dstValue = _mm256_setzero_ps();
-    const float* src_z = src;
-    const float* weight_z = weight;
-    for (fy = 0; fy < fh; ++fy) {
-        const float* src_y = src_z + fy * dilateY_step;
-        const float* weight_y = weight_z + fy * weight_y_step;
-        for (fx = 0; fx < fw; ++fx) {
-            const float* weight_x = weight_y + PACK_UNIT * fx;
-            const float* src_x = src_y + fx * dilateX_step;
-            dstValue = _mm256_fmadd_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x), dstValue);
-        }
-    }
-    _mm256_storeu_ps(dst, dstValue);
-}
-
 void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                         size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                        size_t srcHStep, size_t dstHStep) {
+                                        size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
     int dx, fx, fy;
     const int unit = 4;
     int widthUnit = width / unit;
     int widthRemain = width - widthUnit * unit;
     const float* weight_z = weight;
+    auto minF = _mm256_broadcast_ss(parameters + 0);
+    auto maxF = _mm256_broadcast_ss(parameters + 1);
+    auto bv = _mm256_loadu_ps(bias);
     for (int y = 0; y < height; ++y) {
         auto srcY = src + y * srcHStep;
         auto dstY = dst + y * dstHStep;
         for (dx = 0; dx < widthUnit; ++dx) {
-            auto dstValue0 = _mm256_setzero_ps();
-            auto dstValue1 = _mm256_setzero_ps();
-            auto dstValue2 = _mm256_setzero_ps();
-            auto dstValue3 = _mm256_setzero_ps();
+            auto dstValue0 = bv;
+            auto dstValue1 = bv;
+            auto dstValue2 = bv;
+            auto dstValue3 = bv;
             for (fy = 0; fy < fh; ++fy) {
                 const float* src_y = srcY + fy * dilateY_step;
                 const float* weight_y = weight_z + fy * fw * PACK_UNIT;

@@ -58,6 +43,14 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa
                     dstValue3 = _mm256_fmadd_ps(_mm256_loadu_ps(src_x + 3 * src_w_setup), weightValue, dstValue3);
                 }
             }
+            dstValue0 = _mm256_min_ps(dstValue0, maxF);
+            dstValue1 = _mm256_min_ps(dstValue1, maxF);
+            dstValue2 = _mm256_min_ps(dstValue2, maxF);
+            dstValue3 = _mm256_min_ps(dstValue3, maxF);
+            dstValue0 = _mm256_max_ps(dstValue0, minF);
+            dstValue1 = _mm256_max_ps(dstValue1, minF);
+            dstValue2 = _mm256_max_ps(dstValue2, minF);
+            dstValue3 = _mm256_max_ps(dstValue3, minF);
             _mm256_storeu_ps(dstY + PACK_UNIT * 0, dstValue0);
             _mm256_storeu_ps(dstY + PACK_UNIT * 1, dstValue1);
             _mm256_storeu_ps(dstY + PACK_UNIT * 2, dstValue2);

@@ -67,7 +60,7 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa
         }
         for (dx = 0; dx < widthRemain; ++dx) {
             float* dst_x = dstY + dx * PACK_UNIT;
-            auto dstValue = _mm256_setzero_ps();
+            auto dstValue = bv;
             const float* src_z = srcY + src_w_setup * dx;
             const float* weight_z = weight;
             for (fy = 0; fy < fh; ++fy) {

@@ -79,6 +72,8 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa
                     dstValue = _mm256_fmadd_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x), dstValue);
                 }
             }
+            dstValue = _mm256_min_ps(dstValue, maxF);
+            dstValue = _mm256_max_ps(dstValue, minF);
             _mm256_storeu_ps(dst_x, dstValue);
         }
     }

@@ -173,8 +168,6 @@ static void _AVXFMA_MNNAdjustOptimalSparseKernel(int& sparseBlockOC, MNN::CoreFu
 void _AVX_ExtraInitFMA(void* functions) {
     auto coreFunction = static_cast<MNN::CoreFunctions*>(functions);
     coreFunction->MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwiseFMA;
-    coreFunction->MNNConvRunForUnitDepthWise = _AVX_MNNConvRunForUnitDepthWiseFMA;
-    coreFunction->MNNConvDwF23MulTransUnit = _AVX_MNNConvDwF23MulTransUnitFMA;
     // sparse conv init
     coreFunction->MNNAdjustOptimalSparseKernel = _AVXFMA_MNNAdjustOptimalSparseKernel;


@@ -68,7 +68,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const
 void _SSE_MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el);
 void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                      size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                     size_t srcHStep, size_t dstHStep);
+                                     size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
 void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step,
                                             size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst);
 void _SSE_MNNExpC8(float* dest, const float* source, float* offset, const float* parameters, size_t countC8);

@@ -73,9 +73,8 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
     if (post->biasFloat) {
         biasPtr = post->biasFloat;
     }
-    auto blockNum = post->blockNum;
     for (int dz = 0; dz < dst_depth_quad; ++dz) {
-        const auto weight_dz = weight + dz * (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
+        const auto weight_dz = weight + dz * src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
         const auto weightBias_dz = post->weightQuanBias + dz * GEMM_INT8_UNIT;
         const float* scale_dz = nullptr;
         scale_dz = post->scale + dz * GEMM_INT8_UNIT;

@@ -324,8 +323,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const
     if (post->biasFloat) {
         biasPtr = post->biasFloat;
     }
-    int blockNum = post->blockNum;
-    int weight_step_Z = 0.5 * (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
+    int weight_step_Z = 0.5 * src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
     int weight_step_Y = 0.5 * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);

     auto oneValue = _mm_set1_epi16(1);

@@ -65,7 +65,7 @@ void _SSE_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slo

 void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
                                      size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
-                                     size_t srcHStep, size_t dstHStep) {
+                                     size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
     int dx, fx, fy;
     const int unit = 8;
     int widthUnit = width / unit;

@@ -75,18 +75,21 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
     if (need4) {
         widthRemain-=4;
     }
+    auto minF = _mm_set1_ps(parameters[0]);
+    auto maxF = _mm_set1_ps(parameters[1]);
+    auto bv = _mm_loadu_ps(bias);
     for (int y = 0; y < height; ++y) {
         auto srcY = src + y * srcHStep;
         auto dstY = dst + y * dstHStep;
         for (dx = 0; dx < widthUnit; ++dx) {
-            auto dstValue0 = _mm_set1_ps(0.0f);
-            auto dstValue1 = _mm_set1_ps(0.0f);
-            auto dstValue2 = _mm_set1_ps(0.0f);
-            auto dstValue3 = _mm_set1_ps(0.0f);
-            auto dstValue4 = _mm_set1_ps(0.0f);
-            auto dstValue5 = _mm_set1_ps(0.0f);
-            auto dstValue6 = _mm_set1_ps(0.0f);
-            auto dstValue7 = _mm_set1_ps(0.0f);
+            auto dstValue0 = bv;
+            auto dstValue1 = bv;
+            auto dstValue2 = bv;
+            auto dstValue3 = bv;
+            auto dstValue4 = bv;
+            auto dstValue5 = bv;
+            auto dstValue6 = bv;
+            auto dstValue7 = bv;
             for (fy = 0; fy < fh; ++fy) {
                 const float* src_y = srcY + fy * dilateY_step;
                 const float* weight_y = weight_z + fy * fw * 4;

@@ -104,6 +107,24 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
                     dstValue7 = _mm_add_ps(dstValue7, _mm_mul_ps(_mm_loadu_ps(src_x + 7 * src_w_setup), weightValue));
                 }
             }
+            dstValue0 = _mm_min_ps(dstValue0, maxF);
+            dstValue1 = _mm_min_ps(dstValue1, maxF);
+            dstValue2 = _mm_min_ps(dstValue2, maxF);
+            dstValue3 = _mm_min_ps(dstValue3, maxF);
+            dstValue4 = _mm_min_ps(dstValue4, maxF);
+            dstValue5 = _mm_min_ps(dstValue5, maxF);
+            dstValue6 = _mm_min_ps(dstValue6, maxF);
+            dstValue7 = _mm_min_ps(dstValue7, maxF);
+
+            dstValue0 = _mm_max_ps(dstValue0, minF);
+            dstValue1 = _mm_max_ps(dstValue1, minF);
+            dstValue2 = _mm_max_ps(dstValue2, minF);
+            dstValue3 = _mm_max_ps(dstValue3, minF);
+            dstValue4 = _mm_max_ps(dstValue4, minF);
+            dstValue5 = _mm_max_ps(dstValue5, minF);
+            dstValue6 = _mm_max_ps(dstValue6, minF);
+            dstValue7 = _mm_max_ps(dstValue7, minF);
+
             _mm_storeu_ps(dstY + 4 * 0, dstValue0);
             _mm_storeu_ps(dstY + 4 * 1, dstValue1);
             _mm_storeu_ps(dstY + 4 * 2, dstValue2);

@@ -116,10 +137,10 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
             srcY += unit * src_w_setup;
         }
         if (need4) {
-            auto dstValue0 = _mm_set1_ps(0.0f);
-            auto dstValue1 = _mm_set1_ps(0.0f);
-            auto dstValue2 = _mm_set1_ps(0.0f);
-            auto dstValue3 = _mm_set1_ps(0.0f);
+            auto dstValue0 = bv;
+            auto dstValue1 = bv;
+            auto dstValue2 = bv;
+            auto dstValue3 = bv;
             for (fy = 0; fy < fh; ++fy) {
                 const float* src_y = srcY + fy * dilateY_step;
                 const float* weight_y = weight_z + fy * fw * 4;

@@ -133,6 +154,15 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
                     dstValue3 = _mm_add_ps(dstValue3, _mm_mul_ps(_mm_loadu_ps(src_x + 3 * src_w_setup), weightValue));
                 }
             }
+            dstValue0 = _mm_min_ps(dstValue0, maxF);
+            dstValue1 = _mm_min_ps(dstValue1, maxF);
+            dstValue2 = _mm_min_ps(dstValue2, maxF);
+            dstValue3 = _mm_min_ps(dstValue3, maxF);
+
+            dstValue0 = _mm_max_ps(dstValue0, minF);
+            dstValue1 = _mm_max_ps(dstValue1, minF);
+            dstValue2 = _mm_max_ps(dstValue2, minF);
+            dstValue3 = _mm_max_ps(dstValue3, minF);
             _mm_storeu_ps(dstY + 4 * 0, dstValue0);
             _mm_storeu_ps(dstY + 4 * 1, dstValue1);
             _mm_storeu_ps(dstY + 4 * 2, dstValue2);

@@ -142,7 +172,7 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
         }
         for (dx = 0; dx < widthRemain; ++dx) {
             float* dst_x = dstY + dx * 4;
-            auto dstValue = _mm_set1_ps(0.0f);
+            auto dstValue = bv;
             const float* src_z = srcY + src_w_setup * dx;
             const float* weight_z = weight;
             for (fy = 0; fy < fh; ++fy) {

@@ -154,6 +184,8 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
                     dstValue = _mm_add_ps(dstValue, _mm_mul_ps(_mm_loadu_ps(src_x), _mm_loadu_ps(weight_x)));
                 }
             }
+            dstValue = _mm_min_ps(dstValue, maxF);
+            dstValue = _mm_max_ps(dstValue, minF);
             _mm_storeu_ps(dst_x, dstValue);
         }
     }

@@ -792,6 +792,44 @@ const char* shader_MetalLayerNorm_metal =
 " out_data[gid.x]=(M4)(norm);\n"
 " }\n"
 "}\n"
+"kernel void layernorm_m1x4_rms(const device M4 *in [[buffer(0)]],\n"
+" device M4 *out [[buffer(1)]],\n"
+" constant layernorm_constants& cst [[buffer(2)]],\n"
+" const device float4 *gamma [[buffer(3)]],\n"
+" const device float4 *beta [[buffer(4)]],\n"
+" uint gid [[threadgroup_position_in_grid]],\n"
+" uint tiisg[[thread_index_in_simdgroup]],\n"
+" uint sgitg[[simdgroup_index_in_threadgroup]]) {\n"
+" int total_idx=(gid*4+sgitg);\n"
+" int in_idx=total_idx % (cst.inside/4);\n"
+" int out_idx=total_idx/(cst.inside/4);\n"
+" auto in_data=in+out_idx*cst.inside/4;\n"
+" auto out_data=out+out_idx*cst.inside/4;\n"
+" float square_sum=0.0f;\n"
+" for(int i=tiisg; i<cst.inside/4; i+=SIMD_GROUP_WIDTH) {\n"
+" M4 data=in_data[i];\n"
+" float dis=data.x;\n"
+" square_sum += dis*dis;\n"
+" dis=data.y;\n"
+" square_sum += dis*dis;\n"
+" dis=data.z;\n"
+" square_sum += dis*dis;\n"
+" dis=data.w;\n"
+" square_sum += dis*dis;\n"
+" }\n"
+" square_sum=simd_sum(square_sum);\n"
+" \n"
+" if(tiisg == 0) {\n"
+" float var=1.0/sqrt(square_sum/cst.inside+cst.eps);\n"
+" \n"
+" float4 norm=var*((float4)in_data[in_idx]);\n"
+" if(cst.has_gamma_beta) {\n"
+" out_data[in_idx]=(M4)(norm*gamma[in_idx]+beta[in_idx]);\n"
+" } else {\n"
+" out_data[in_idx]=(M4)(norm);\n"
+" }\n"
+" }\n"
+"}\n"
 ;
 const char* shader_MetalConvolutionWinograd_metal =
 "struct winograd_constants {\n"
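Note: layernorm_m1x4_rms above is an RMS-norm variant. It accumulates only the sum of squares (no mean subtraction), reduces across the SIMD group with simd_sum, and lane 0 writes the normalized value. A scalar sketch of the math; illustrative C++, not the Metal source:

    #include <cmath>
    void rmsNormRow(const float* in, float* out, int inside, float eps,
                    const float* gamma, const float* beta) {
        float squareSum = 0.f; // reduced via simd_sum on the GPU
        for (int i = 0; i < inside; ++i) {
            squareSum += in[i] * in[i];
        }
        float inv = 1.f / std::sqrt(squareSum / inside + eps);
        for (int i = 0; i < inside; ++i) {
            float norm = in[i] * inv;
            out[i] = (gamma != nullptr) ? norm * gamma[i] + beta[i] : norm;
        }
    }
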

@@ -1578,6 +1616,60 @@ const char* shader_MetalConvolution1x1_metal =
 " //if (computeSize>2) {xy_out[2]=activate(M4(result2),cst.activation); }\n"
 " //if (computeSize>3) {xy_out[3]=activate(M4(result3),cst.activation); }\n"
 "}\n"
+"kernel void conv1x1_g1z4_m1w4(const device M4 *in [[buffer(0)]],\n"
+" device M4 *out [[buffer(1)]],\n"
+" constant conv1x1_constants& cst [[buffer(2)]],\n"
+" const device MNN::uchar4x2 *wt [[buffer(3)]],\n"
+" const device M4 *biasTerms [[buffer(4)]],\n"
+" const device float4 *dequantScale [[buffer(5)]],\n"
+" uint3 gid[[threadgroup_position_in_grid]],\n"
+" uint tiisg[[thread_index_in_simdgroup]],\n"
+" uint sgitg[[simdgroup_index_in_threadgroup]]) {\n"
+" int uz=gid.x*2+sgitg;\n"
+" int rx=gid.y;\n"
+" auto xy_wt=wt+uz*cst.input_slice;\n"
+" auto xy_in0=in+(int)gid.z*cst.input_size+rx+0;\n"
+" auto xy_out=out+(int)gid.z*cst.output_size+uz*cst.output_size*cst.batch+rx;\n"
+" auto biasValue=FLOAT4(biasTerms[uz]);\n"
+" FLOAT4 result0=FLOAT4(0);\n"
+" int block=(cst.input_slice+cst.block_size-1)/cst.block_size;\n"
+" for (int bi=0; bi<cst.block_size; bi++) {\n"
+" FLOAT4 scale=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+0]);\n"
+" FLOAT4 dequant_bias=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+1]);\n"
+" int zmin=bi*block;\n"
+" int zmax=min(zmin+block,cst.input_slice);\n"
+" for (int z=zmin+tiisg; z<zmax; z+=SIMD_GROUP_WIDTH) {\n"
+" auto in40=(FLOAT4)*(xy_in0+z*cst.input_size*cst.batch);\n"
+" MNN::uchar4x2 w_int4=xy_wt[z];\n"
+" FLOAT4x4 w_dequant;\n"
+" for (int i=0; i<4; ++i) {\n"
+" FLOAT4 w4=FLOAT4((float)(w_int4[i][0] >> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n"
+" FLOAT4 res=w4*scale[i]+dequant_bias[i];\n"
+" w_dequant[i]=res;\n"
+" }\n"
+" result0 += FLOAT4(in40*w_dequant);\n"
+" \n"
+"// FLOAT4x4 w_dequant;\n"
+"// for (int i=0; i<4; ++i) {\n"
+"// FLOAT4 w4=FLOAT4((float)(w_int4[i][0] >> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n"
+"// FLOAT4 res=w4*scale[i]+dequant_bias[i];\n"
+"// w_dequant[i]=w4;\n"
+"// }\n"
+"//\n"
+"// FLOAT4 temp=FLOAT4(in40*w_dequant);\n"
+"// result0 += temp*scale+(in40.x+in40.y+in40.z+in40.w)*dequant_bias;\n"
+" }\n"
+" }\n"
+" FLOAT4 res;\n"
+" res.x=simd_sum(result0.x);\n"
+" res.y=simd_sum(result0.y);\n"
+" res.z=simd_sum(result0.z);\n"
+" res.w=simd_sum(result0.w);\n"
+" /* true */\n"
+" if (tiisg == 0) {\n"
+" xy_out[0]=activate(M4(res+biasValue),cst.activation);\n"
+" }\n"
+"}\n"
 "kernel void conv1x1_g1z8(const device M4 *in [[buffer(0)]],\n"
 " device M4 *out [[buffer(1)]],\n"
 " constant conv1x1_constants& cst [[buffer(2)]],\n"

@@ -1960,6 +2052,7 @@ const char* shader_MetalDefine_metal =
 "// –––––––––––––––––––––––––––––––––––––––––––––––––––\n"
 "// Macro\n"
 "// –––––––––––––––––––––––––––––––––––––––––––––––––––\n"
+"#define SIMD_GROUP_WIDTH 32 // setting SIMD group size is 32\n"
 "#define UP_DIV(x,y) ( ((x)+(y)-1)/(y) )\n"
 "#define ROUND_UP(x,y) ( ((x)+(y)-1)/(y)*(y) )\n"
 "// whether computer with float32 when store with float16\n"
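Note: conv1x1_g1z4_m1w4 above is the int4-weight path. Each byte packs two 4-bit lanes stored with a +8 offset, decoded as float(nibble) - 8 and then scaled and shifted by the per-block dequant parameters; partial sums are combined across the SIMD group with simd_sum. The nibble decode as a scalar sketch, illustrative only:

    #include <cstdint>
    inline void dequantInt4Pair(uint8_t packed, float scale, float bias,
                                float& w0, float& w1) {
        w0 = (float)(packed >> 4) - 8.f; // high nibble
        w1 = (float)(packed & 15) - 8.f; // low nibble
        w0 = w0 * scale + bias;
        w1 = w1 * scale + bias;
    }
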

@@ -33,8 +33,8 @@ typedef enum {
 /** metal device */
 @property (strong, nonatomic, readonly) id<MTLDevice> device;
 /** max memory length cound be used in threadgroup */
-@property (assign, nonatomic, readonly) BOOL isCommitEachShader;
 @property (assign, nonatomic, readonly) BOOL isIphone;
+@property (assign, nonatomic, readonly) BOOL isSimdGroupAvailable;

 /**
  * @brief alloc temp buffer on device

@@ -79,30 +79,17 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
     }
 }

-+ (BOOL)commit_frequent{
-    struct utsname systemInfo;
-    uname(&systemInfo);
-
-    NSString *deviceString = [NSString stringWithCString:systemInfo.machine encoding:NSASCIIStringEncoding];
-
-    if ([deviceString isEqualToString:@"iPhone10,1"]) return YES; //@"iPhone 8 Global";
-    if ([deviceString isEqualToString:@"iPhone10,2"]) return YES; //@"iPhone 8 Plus Global";
-    if ([deviceString isEqualToString:@"iPhone10,4"]) return YES; //@"iPhone 8 GSM";
-    if ([deviceString isEqualToString:@"iPhone10,5"]) return YES; //@"iPhone 8 Plus GSM";
-    if ([deviceString isEqualToString:@"iPhone10,3"]) return YES; //@"A1865/A1902 iPhone X";
-    if ([deviceString isEqualToString:@"iPhone10,6"]) return YES; //@"Global/A1901 iPhone X";
-    if ([deviceString isEqualToString:@"iPhone11,2"]) return YES; //@"iPhone XS";
-    if ([deviceString isEqualToString:@"iPhone11,4"]) return YES; //@"iPhone XS Max";
-    if ([deviceString isEqualToString:@"iPhone11,6"]) return YES; //@"iPhone XS Max";
-    if ([deviceString isEqualToString:@"iPhone11,8"]) return YES; //@"iPhone XR";
-    if ([deviceString isEqualToString:@"iPhone12,1"]) return YES; //@"iPhone 11";
-    if ([deviceString isEqualToString:@"iPhone12,3"]) return YES; //@"iPhone 11 Pro";
-    if ([deviceString isEqualToString:@"iPhone12,5"]) return YES; //@"iPhone 11 Pro Max";
-    if ([deviceString isEqualToString:@"iPhone12,8"]) return YES; //@"iPhone SE 2";
-    if ([deviceString isEqualToString:@"iPhone13,1"]) return YES; //@"iPhone 12 mini";
-    if ([deviceString isEqualToString:@"iPhone13,2"]) return YES; //@"iPhone 12";
-    if ([deviceString isEqualToString:@"iPhone13,3"]) return YES; //@"iPhone 12 Pro";
-    if ([deviceString isEqualToString:@"iPhone13,4"]) return YES; //@"iPhone 12 Pro Max";
++ (BOOL)isSimdGroupAvailable{
+#if TARGET_OS_IPHONE
+    if(@available(iOS 14, *)) {
+        return YES;
+    }
+#endif
+#if TARGET_OS_MAC
+    if(@available(macOS 10.14, *)) {
+        return YES;
+    }
+#endif
     return NO;
 }

@@ -124,8 +111,8 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
     _device = context->device;
     _cachesFp16 = [NSMutableDictionary dictionary];
     _cachesFp32 = [NSMutableDictionary dictionary];
-    _isCommitEachShader = self.class.commit_frequent;
     _isIphone = self.class.isIphone;
+    _isSimdGroupAvailable = self.class.isSimdGroupAvailable;
     createLibrary(_device, _cachesFp16, true);
     createLibrary(_device, _cachesFp32, false);
     return nil != _device;

@@ -39,7 +39,9 @@ kernel void main0(const device T* input0 [[buffer(0)]],
                   const device int* mask [[buffer(4)]],
 #endif
                   constant Param& param [[buffer(5)]],
-                  uint3 gid[[thread_position_in_grid]]) {
+                  uint3 gid[[thread_position_in_grid]],
+                  uint tiisg[[thread_index_in_simdgroup]],
+                  uint sgitg[[simdgroup_index_in_threadgroup]]) {
     const int x = gid.x; // query_seq_len
     const int y = gid.y; // head_num
     const int z = gid.z; // key_seq_len

@@ -102,7 +104,7 @@ kernel void main0(const device T* input0 [[buffer(0)]],
         }
     }
     out *= Vscale;
-    output[y + z * head_num] = (T)out;
+    output[y * key_seq_len + z] = (T)out;
 #endif
 }

@@ -158,18 +160,18 @@ kernel void main0(const device T* input0 [[buffer(0)]],
     }
     output[ x * stride * group + (y * head_dim + z)] = out;
 #else
-    device const T *A_offset = input0 + y;
+    device const T *A_offset = input0 + y * value_seq_len;
     device const T *B_offset = input1 + offset_head;
     device T *Pastvalue_offset = past_value + offset_head;
     float out = 0;

     for(int i = 0; i < value_seq_len - 1; ++i){
-        float A = (float)A_offset[i * head_num];
+        float A = (float)A_offset[i];
         float B = (float)Pastvalue_offset[i * stride];

         out += A * B;
     }
-    out += (float)A_offset[(value_seq_len - 1)*head_num] * (float)B_offset[0];
+    out += (float)A_offset[(value_seq_len - 1)] * (float)B_offset[0];
     if (yr == 0) {
         Pastvalue_offset[(value_seq_len - 1)*stride] = B_offset[0];
     }

@@ -282,6 +284,7 @@ void AttentionBufExecution::reallocKVCache() {


 void AttentionBufExecution::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
+
     auto query = inputs[0];
     auto key = inputs[1];
     auto value = inputs[2];

@@ -407,8 +410,8 @@ void AttentionBufExecution::onEncode(const std::vector<Tensor *> &inputs, const
     // For softmax parameter
     int inside, outside;
     if (mIsDecode) {
-        inside = mNumHead;
-        outside = 1;
+        inside = 1;
+        outside = mNumHead;
     } else {
         inside = 1;
         outside = mCache->mKv_seq_len * mNumHead;
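Note: the attention hunks above change the decode-path score layout. The per-step QK^T scores used to be written head-interleaved (output[y + z * head_num]) and are now written contiguously per head (output[y * key_seq_len + z]); the softmax shape swap (inside = 1, outside = mNumHead) and the stride-1 reads in the value pass (A_offset[i] instead of A_offset[i * head_num]) follow from that. The index math, for reference (illustrative):

    #include <cstddef>
    // One score row per head, contiguous over key positions.
    size_t scoreIndex(size_t head, size_t t, size_t kvSeqLen) {
        return head * kvSeqLen + t; // was: head + t * numHead
    }
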
@@ -189,10 +189,7 @@ public:
                       id<MTLComputeCommandEncoder> encoder, id<MTLBuffer> shape) const;

     void flushEncoder() const;
     id<MTLComputeCommandEncoder> encoder_for_net() const;
-    void addOpEncoder(std::function<void(void)> opEncoder);
-
-    bool isCommandEncoderSet();

     BufferAllocator* getBufferPool() const;
     EagerBufferAllocator *getStaticBufferPool() const {

@@ -233,11 +230,8 @@ private:

     const MetalRuntime* mRuntime;
     mutable NSUInteger mEncoderCount = 0;
-    mutable bool mOpEncoderSet = false;//whether has set encoder
     mutable bool mSupportDeferEncode = true;
-    mutable bool mFrameEncodeCache = false;

-    std::vector<std::function<void(void)>> mOpEncoders;
     mutable id<MTLComputeCommandEncoder> mComputeEncoder = nil;
     std::shared_ptr<BufferAllocator> mBufferPool;
     std::shared_ptr<BufferAllocator> mBufferPoolShapeImmutable;
@ -229,6 +229,7 @@ Execution *MetalBackend::onCreate(const std::vector<Tensor *> &inputs, const std
|
||||||
}
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
//MNN_PRINT("support type [%s]\n", EnumNameOpType(op->type()));
|
||||||
|
|
||||||
auto exe = iter->second->onCreate(inputs, op, this, outputs);
|
auto exe = iter->second->onCreate(inputs, op, this, outputs);
|
||||||
if (NULL == exe) {
|
if (NULL == exe) {
|
||||||
|
@ -258,15 +259,8 @@ void MetalBackend::onExecuteBegin() const {
|
||||||
void MetalBackend::onExecuteEnd() const {
|
void MetalBackend::onExecuteEnd() const {
|
||||||
flushEncoder();
|
flushEncoder();
|
||||||
commit_net();
|
commit_net();
|
||||||
|
|
||||||
if(mFrameEncodeCache) {
|
|
||||||
// Prepare for next execute
|
|
||||||
for(auto opEncoder : mOpEncoders) {
|
|
||||||
opEncoder();
|
|
||||||
}
|
|
||||||
mOpEncoderSet = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BufferAllocator* MetalBackend::getBufferPool() const {
|
BufferAllocator* MetalBackend::getBufferPool() const {
|
||||||
return mCurrentAllocator;
|
return mCurrentAllocator;
|
||||||
}
|
}
|
||||||
|
@@ -302,18 +296,11 @@ bool MetalBackend::onGetTensorInfo(const Tensor* tensor, void* dstInfo) {
     return true;
 }

-bool MetalBackend::isCommandEncoderSet() {
-    return mOpEncoderSet;// !isCommitEachShader & mOpFullSupport
-}
-
 bool MetalBackend::isCmdBufferCommit() {
     auto ctx = (__bridge MNNMetalContext *)context();
-    if(!ctx.isCommitEachShader) {
-        return false;
-    }
-
     //TODO: set magic number
-    const int magicNum = 2;
+    const int magicNum = mRuntime->hint().encorderNumForCommit;
     mEncoderCount++;
     if(mEncoderCount != 0 && mEncoderCount % magicNum == 0) {
         return true;
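The cadence change above replaces the hard-coded commit interval with a runtime hint (spelled `encorderNumForCommit` in this diff). A small sketch of the resulting behavior, assuming a stand-in hint value:

    #include <cstdio>

    int main() {
        const int magicNum = 10;   // stand-in for mRuntime->hint().encorderNumForCommit
        int encoderCount = 0;
        for (int op = 0; op < 25; ++op) {
            ++encoderCount;        // mirrors mEncoderCount++ in isCmdBufferCommit()
            if (encoderCount % magicNum == 0) {
                std::printf("commit command buffer after %d encoders\n", encoderCount);
            }
        }
        return 0;
    }

The effect is that the Metal command buffer is committed once every N encoded ops, where N is now tunable per session instead of fixed at 2.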
@@ -321,12 +308,6 @@ bool MetalBackend::isCmdBufferCommit() {
     return false;
 }

-void MetalBackend::addOpEncoder(std::function<void(void)> opEncoder) {
-    if(mFrameEncodeCache) {
-        mOpEncoders.push_back(opEncoder);
-    }
-}
-
 id<MTLBuffer> MetalBackend::getHostBuffer(size_t size) const {
     size = UP_DIV(size, METAL_CONST_BUFFER_LIMIT) * METAL_CONST_BUFFER_LIMIT;
     // reuse
@@ -534,11 +515,7 @@ kernel void main0(const device IType *in [[buffer(0)]], device OType *out [[buff
 }
 })metal";

 void MetalBackend::onResizeBegin() {
-    mFrameEncodeCache = false;
-    mOpEncoderSet = false;
-    mOpEncoders.clear();
-
     // Abort last inference task if needed
     flushEncoder();
     _commandBuffer_net = nil;
@@ -549,7 +526,6 @@ void MetalBackend::onResizeBegin() {

 ErrorCode MetalBackend::onResizeEnd() {
     auto ctx = (__bridge MNNMetalContext *)context();
-    mFrameEncodeCache = (!ctx.isCommitEachShader && mSupportDeferEncode);
     return mCurrentAllocator->compute();
 }

@@ -711,9 +687,8 @@ void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
 void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst) const {
     flushEncoder();
     auto ctx = (__bridge MNNMetalContext *)context();
-    if(!mFrameEncodeCache) {
-        commit_net();
-    }
+    commit_net();
     _resetDynamicMemory();
     onCopyBuffer(src, dst, nil, nil);
 }
@@ -789,9 +764,8 @@ void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst, id<MTLComp
 int MetalBackend::onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) {
     flushEncoder();
     auto ctx = (__bridge MNNMetalContext *)context();
-    if(!mOpEncoderSet) {
-        commit_net();
-    }
+    commit_net();
     if (toCpu) {
         wait();
     }
@@ -87,8 +87,16 @@ ErrorCode MetalConvolution1x1::onResize(const std::vector<Tensor *> &inputs, con
     std::string name = "conv1x1_g1z4_w8";
     mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w8" fp16:backend->useFp16InsteadFp32()];
     if (mDequantBits == 4) {
-        mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w4" fp16:backend->useFp16InsteadFp32()];
-        name = "conv1x1_g1z4_w4";
+        if(context.isSimdGroupAvailable && ob * ow * oh == 1) {
+            mPipeline = [context pipelineWithName:@"conv1x1_g1z4_m1w4" fp16:backend->useFp16InsteadFp32()];
+            name = "conv1x1_g1z4_m1w4";
+            mThreads = std::make_pair(MTLSizeMake(UP_DIV(oc, 8), 1, 1), MTLSizeMake(8, 8, 1));
+
+            return NO_ERROR;
+        } else {
+            mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w4" fp16:backend->useFp16InsteadFp32()];
+            name = "conv1x1_g1z4_w4";
+        }
     }
     NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(),
                     (id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(),
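For the new m1w4 path above: the dispatch is UP_DIV(oc, 8) threadgroups of 8x8 threads, and a 64-thread group holds two 32-lane SIMD groups, which is why the kernel added later in this diff derives its output slice as `uz = gid.x * 2 + sgitg`. A hedged sketch of that sizing arithmetic, with an illustrative channel count:

    #include <cstdio>

    #define UP_DIV(x, y) (((x) + (y) - 1) / (y))

    int main() {
        int oc = 4096;                         // illustrative output-channel count
        int groups = UP_DIV(oc, 8);            // threadgroups along x, as in the hunk above
        int simdGroupsPerGroup = (8 * 8) / 32; // two 32-lane SIMD groups per 8x8 threadgroup
        std::printf("%d groups x %d SIMD groups -> %d output slices covered\n",
                    groups, simdGroupsPerGroup, groups * simdGroupsPerGroup);
        return 0;
    }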
@@ -18,10 +18,6 @@ MetalExecution::MetalExecution(Backend *backend) : Execution(backend) {
 ErrorCode MetalExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
     auto backend = static_cast<MetalBackend *>(this->backend());

-    if(backend->isCommandEncoderSet()) {
-        return NO_ERROR;
-    }
-
     auto func = [=](){
         auto encoder = backend->encoder_for_net();
         this->onEncode(inputs, outputs, encoder);
@@ -31,7 +27,6 @@ ErrorCode MetalExecution::onExecute(const std::vector<Tensor *> &inputs, const s
         }
     };
     func();
-    backend->addOpEncoder(func);

     return NO_ERROR;
 }
@@ -26,7 +26,7 @@ using namespace metal;
 #endif

 struct grid_sample_params {
-    int batches;
+    int batch;
     int channels;
     int inH;
     int inW;
@@ -179,7 +179,7 @@ kernel void main0(const device T *input [[buffer(0)]],
                   device T *output [[buffer(2)]],
                   constant grid_sample_params &p [[buffer(3)]],
                   uint3 gid [[thread_position_in_grid]]) {
-    if ((int)gid.x >= p.outW || (int)gid.y >= p.outH * p.outD || (int)gid.z >= p.batches)
+    if ((int)gid.x >= p.outW || (int)gid.y >= p.outH * p.outD || (int)gid.z >= p.batch)
         return;

     int gridPos = gid.z*p.outH*p.outW*CON + gid.y*p.outW*CON + gid.x*CON;
@@ -191,8 +191,8 @@ kernel void main0(const device T *input [[buffer(0)]],

     const int channelC4 = (p.channels + 3) / 4;
     for (int c = 0; c < channelC4; ++ c) {
-        auto outputPos = gid.z*channelC4*p.outH*p.outW + c*p.outH*p.outW + gid.y*p.outW + gid.x;
-        auto inputPtr = input + gid.z*channelC4*p.inH*p.inW + c*p.inH*p.inW;
+        auto outputPos = gid.z*p.outD*p.outH*p.outW + c*p.outD*p.outH*p.outW*p.batch + gid.y*p.outW + gid.x;
+        auto inputPtr = input + gid.z*p.inD*p.inH*p.inW + c*p.inH*p.inW*p.inD*p.batch;
 #if GRID3D
         output[outputPos] = interpolate(z, y, x, inputPtr, p.inD, p.inH, p.inW, p.mode, p.paddingMode);
 #else
@@ -76,6 +76,7 @@ ErrorCode MetalLayerNorm::onResize(const std::vector<Tensor *> &inputs, const st
     ((int *)mShapeBuffer.contents)[3] = (int)has_gamma_beta_;


+
     bool parallel = (mInside > 32) && ((mInside & 3) == 0);
     if(RMSNorm){
         mPipeline = [context pipelineWithName:parallel ? @"layernorm_x4_rms" : @"layernorm_x1_rms" fp16:backend->useFp16InsteadFp32()];
@@ -85,10 +86,17 @@ ErrorCode MetalLayerNorm::onResize(const std::vector<Tensor *> &inputs, const st

     auto inside = parallel ? mInside/4 : mInside;
     mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger)inside, (NSUInteger)mOutside, 1)];
+    if(context.isSimdGroupAvailable) {
+        if(mOutside == 1 && RMSNorm && parallel) {
+            mPipeline = [context pipelineWithName:@"layernorm_m1x4_rms" fp16:backend->useFp16InsteadFp32()];
+            mThreads = std::make_pair(MTLSizeMake((NSUInteger)UP_DIV(inside, 4) * mOutside, 1, 1), MTLSizeMake(128, 1, 1));
+        }
+    }
     return NO_ERROR;
 }

 void MetalLayerNorm::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {

     auto backend = static_cast<MetalBackend *>(this->backend());
     auto context = (__bridge MNNMetalContext *)backend->context();
     auto input = inputs[0], output = outputs[0];
@@ -550,6 +550,7 @@ public:
     }
     virtual void onEncode(const std::vector<Tensor *>& inputs, const std::vector<Tensor *>& outputs,
                           id<MTLComputeCommandEncoder> encoder) override {
+
         auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
         auto dstTensor = mTensors[cmd->indexes()->data()[0]];
         auto srcTensor = mTensors[cmd->indexes()->data()[1]];
@@ -28,13 +28,10 @@ public:
         MTLSize global;
     };
 private:
-    std::map<Tensor*, std::shared_ptr<Tensor>> mTempInput;
     std::map<Tensor*, BlitInfo> mTempInputCopy;
-    std::shared_ptr<Tensor> mTempOutput;
     bool mNeedZero = false;
     Tensor* mOutputPtr = nullptr;
-    id<MTLComputePipelineState> mBlitPipeline;
-    std::vector<id<MTLBuffer>> mShapeTemp;
+    std::vector<id<MTLComputePipelineState>> mBlitPipeline;
     id<MTLBuffer> mZeroCopy = nil;
     id<MTLComputePipelineState> mZeroPipeline;
 };
@@ -34,6 +34,31 @@ static void writeSamplerInfo(SamplerInfo& info, const Tensor::InsideDescribe::Re
     info.stride[3] = sampler.src.offset;
     info.extent[3] = sampler.dst.offset;
 }

+static std::string getUnitName(int bytes) {
+    std::string unitName;
+    switch (bytes) {
+        case 1:
+            unitName = "uchar";
+            break;
+        case 2:
+            unitName = "short";
+            break;
+        case 4:
+            unitName = "int";
+            break;
+        case 8:
+            unitName = "short4";
+            break;
+        case 16:
+            unitName = "int4";
+            break;
+        default:
+            FUNC_PRINT(bytes);
+            break;
+    }
+    return unitName;
+}
+
 static const char* gMultiBlitMetal = R"metal(
 #include <metal_stdlib>
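The getUnitName helper added above centralizes the bytes-to-Metal-type mapping that getBlitPipeline previously inlined (see the later hunk in this diff that deletes the switch). A standalone sketch of the same mapping for reference:

    #include <cassert>
    #include <string>

    // Maps a copy width in bytes to the Metal scalar/vector type the blit
    // kernels are specialized with; unknown widths return an empty string
    // (the real code logs via FUNC_PRINT(bytes) before returning).
    static std::string getUnitNameSketch(int bytes) {
        switch (bytes) {
            case 1:  return "uchar";
            case 2:  return "short";
            case 4:  return "int";
            case 8:  return "short4";
            case 16: return "int4";
            default: return "";
        }
    }

    int main() {
        assert(getUnitNameSketch(4 * 4) == "int4");   // e.g. one 4-channel float slice
        return 0;
    }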
@@ -85,6 +110,125 @@ kernel void main0(const device T *in [[buffer(0)]],
 }
 )metal";

+static const char* gMultiRasterTemplate = R"metal(
+#include <metal_stdlib>
+#include <simd/simd.h>
+using namespace metal;
+struct SamplerInfo {
+    uint4 stride;//stride[3] + offset
+    uint4 size;//size[3] + totalSize
+    uint4 extent;//dstStride[3]+dstOffset
+};
+kernel void main0(const device T *in [[buffer(0)]],
+                  device T *out [[buffer(1)]],
+                  const device uint4* buf [[buffer(2)]],
+                  uint3 tgid [[thread_position_in_grid]]) {
+
+    uint4 limit = buf[2];
+    const device SamplerInfo* infoP = (const device SamplerInfo*)(buf + 3);
+    uint3 gid = tgid;
+    gid.x = tgid.x % limit.x;
+    uint n = tgid.x / limit.x;
+    if (n < limit.y) {
+        SamplerInfo info = infoP[n];
+
+        if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) {
+            uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w;
+            uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w;
+#ifdef INPUT_FORMAT_NCHW
+            int srcOffsetReal = srcOffset;
+#elif INPUT_FORMAT_NHWC
+            int srcOffsetReal = srcOffset;
+#elif INPUT_FORMAT_C4NHW4
+            uint4 src_shape = buf[0];//src nchw
+            int src_batch = src_shape.x;
+            int src_channel = src_shape.y;
+            int src_height = src_shape.z;
+            int src_width = src_shape.w;
+            int in_w = srcOffset % src_width; srcOffset /= src_width;
+            int in_h = srcOffset % src_height; srcOffset /= src_height;
+            int in_c = srcOffset % src_channel;
+            int in_b = srcOffset / src_channel;
+            int srcOffsetReal = (((in_b + (in_c / 4) * src_batch) * src_height + in_h) * src_width + in_w) * 4 + (in_c % 4);
+#endif
+
+#ifdef OUTPUT_FORMAT_NCHW
+            int dstOffsetReal = dstOffset;
+#elif OUTPUT_FORMAT_NHWC
+            int dstOffsetReal = dstOffset;
+#elif OUTPUT_FORMAT_C4NHW4
+            uint4 dst_shape = buf[1];//dst nchw
+            int dst_batch = dst_shape.x;
+            int dst_channel = dst_shape.y;
+            int dst_height = dst_shape.z;
+            int dst_width = dst_shape.w;
+            int out_w = dstOffset % dst_width; dstOffset /= dst_width;
+            int out_h = dstOffset % dst_height; dstOffset /= dst_height;
+            int out_c = dstOffset % dst_channel;
+            int out_b = dstOffset / dst_channel;
+            int dstOffsetReal = (((out_b + (out_c / 4) * dst_batch) * dst_height + out_h) * dst_width + out_w) * 4 + (out_c % 4);
+#endif
+            out[dstOffsetReal] = in[srcOffsetReal];
+        }
+    }
+}
+)metal";
+
+static const char* gSingleRasterTemplate = R"metal(
+#include <metal_stdlib>
+#include <simd/simd.h>
+using namespace metal;
+struct SamplerInfo {
+    uint4 stride;//stride[3] + offset
+    uint4 size;//size[3] + totalSize
+    uint4 extent;//dstStride[3]+dstOffset
+};
+kernel void main0(const device T *in [[buffer(0)]],
+                  device T *out [[buffer(1)]],
+                  const device uint4* buf [[buffer(2)]],
+                  uint3 gid [[thread_position_in_grid]]) {
+    SamplerInfo info = *((const device SamplerInfo*)(buf + 3));
+    if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) {
+        uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w;
+        uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w;
+#ifdef INPUT_FORMAT_NCHW
+        int srcOffsetReal = srcOffset;
+#elif INPUT_FORMAT_NHWC
+        int srcOffsetReal = srcOffset;
+#elif INPUT_FORMAT_C4NHW4
+        uint4 src_shape = buf[0];//src nchw
+        int src_batch = src_shape.x;
+        int src_channel = src_shape.y;
+        int src_height = src_shape.z;
+        int src_width = src_shape.w;
+        int in_w = srcOffset % src_width; srcOffset /= src_width;
+        int in_h = srcOffset % src_height; srcOffset /= src_height;
+        int in_c = srcOffset % src_channel;
+        int in_b = srcOffset / src_channel;
+        int srcOffsetReal = (((in_b + (in_c / 4) * src_batch) * src_height + in_h) * src_width + in_w) * 4 + (in_c % 4);
+#endif
+
+#ifdef OUTPUT_FORMAT_NCHW
+        int dstOffsetReal = dstOffset;
+#elif OUTPUT_FORMAT_NHWC
+        int dstOffsetReal = dstOffset;
+#elif OUTPUT_FORMAT_C4NHW4
+        uint4 dst_shape = buf[1];//dst nchw
+        int dst_batch = dst_shape.x;
+        int dst_channel = dst_shape.y;
+        int dst_height = dst_shape.z;
+        int dst_width = dst_shape.w;
+        int out_w = dstOffset % dst_width; dstOffset /= dst_width;
+        int out_h = dstOffset % dst_height; dstOffset /= dst_height;
+        int out_c = dstOffset % dst_channel;
+        int out_b = dstOffset / dst_channel;
+        int dstOffsetReal = (((out_b + (out_c / 4) * dst_batch) * dst_height + out_h) * dst_width + out_w) * 4 + (out_c % 4);
+#endif
+        out[dstOffsetReal] = in[srcOffsetReal];
+    }
+}
+)metal";
+
 static const char* gFillInt4 = R"metal(
 #include <metal_stdlib>
 #include <simd/simd.h>
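Both raster templates above unpack a logical NCHW offset into (b, c, h, w) and repack it into MNN's NC4HW4 layout, where channels are grouped in fours and the batch is interleaved per channel group. A hedged host-side mirror of that index mapping (function name is illustrative):

    #include <cstdio>

    static int c4nhw4Offset(int linearNchw, int batch, int channel, int height, int width) {
        int w = linearNchw % width;  linearNchw /= width;
        int h = linearNchw % height; linearNchw /= height;
        int c = linearNchw % channel;
        int b = linearNchw / channel;
        // Same formula as srcOffsetReal/dstOffsetReal in the templates above.
        return (((b + (c / 4) * batch) * height + h) * width + w) * 4 + (c % 4);
    }

    int main() {
        // Example: element (b=1, c=5, h=0, w=2) of a 2x8x3x4 tensor.
        int linear = ((1 * 8 + 5) * 3 + 0) * 4 + 2;
        std::printf("NC4HW4 offset = %d\n", c4nhw4Offset(linear, 2, 8, 3, 4));
        return 0;
    }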
@@ -105,32 +249,13 @@ kernel void main0(device int4 *out [[buffer(0)]],
 id<MTLComputePipelineState> MetalRaster::getBlitPipeline(int bytes, Backend* backend, bool multiRegion) {
     auto mtbn = static_cast<MetalBackend*>(backend);
     std::string pipelineName;
-    std::string unitName;
+    std::string unitName = getUnitName(bytes);
     if (multiRegion) {
         pipelineName = "blit_multi";
     } else {
         pipelineName = "blit";
     }
-    switch (bytes) {
-        case 1:
-            unitName = "uchar";
-            break;
-        case 2:
-            unitName = "short";
-            break;
-        case 4:
-            unitName = "int";
-            break;
-        case 8:
-            unitName = "short4";
-            break;
-        case 16:
-            unitName = "int4";
-            break;
-        default:
-            FUNC_PRINT(bytes);
-            break;
-    }
     std::vector<std::string> keys = {
         unitName,
         pipelineName
@@ -159,9 +284,6 @@ MetalRaster::~MetalRaster() {
     if (nil != mZeroCopy) {
         mtbn->returnConstBuffer(mZeroCopy);
     }
-    for (auto b : mShapeTemp) {
-        mtbn->returnConstBuffer(b);
-    }
 }
 struct MemsetInfo {
     int value[4];
@@ -197,9 +319,8 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
             mZeroCopy = mtbn->getConstBuffer(sizeof(MemsetInfo));
         }
     }
-    mTempInput.clear();
     mTempInputCopy.clear();
-    mTempOutput = nullptr;
     mOutputPtr = output;
 #ifndef MNN_METAL_FORBID_RASTER_C4
     if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
@@ -216,7 +337,8 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
             }
         }
         if (fast) {
-            mBlitPipeline = getBlitPipeline(bytes * 4, backend(), true);
+            mBlitPipeline.resize(1);
+            mBlitPipeline[0] = getBlitPipeline(bytes * 4, backend(), true);
             std::map<Tensor*, std::vector<int>> collectForTensor;
             for (int i=0; i< des->regions.size(); ++i) {
                 auto& slice = des->regions[i];
@@ -249,7 +371,7 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
                 }
                 ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[0] = maxSize[0];
                 ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[1] = iter.second.size();
-                auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
+                auto local = [context computeBestGroupAndLocal:mBlitPipeline[0] threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
                 blit.global = local.first;
                 blit.local = local.second;
                 mTempInputCopy.insert(std::make_pair(iter.first, blit));
@@ -258,57 +380,14 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
         }
     }
 #endif
-    for (int i=0; i< des->regions.size(); ++i) {
-        auto& slice = des->regions[i];
-        auto origin = slice.origin;
-        if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
-            continue;
-        }
-        if (mTempInput.find(origin)!=mTempInput.end()) {
-            continue;
-        }
-        std::shared_ptr<Tensor> newTensor(new Tensor);
-        TensorUtils::copyShape(origin, newTensor.get());
-        TensorUtils::getDescribe(newTensor.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW;
-        newTensor->buffer().type = origin->getType();
-        TensorUtils::setLinearLayout(newTensor.get());
-        mTempInput.insert(std::make_pair(origin, newTensor));
-    }
-    if (MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat) {
-        mTempOutput.reset(new Tensor);
-        TensorUtils::setupTensorInfo(output, mTempOutput.get(), MNN_DATA_FORMAT_NCHW);
-    }
-    if (nullptr != mTempOutput) {
-        auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
-        if (!res) {
-            return OUT_OF_MEMORY;
-        }
-        mOutputPtr = mTempOutput.get();
-    }
-    for (auto& iter : mTempInput) {
-        auto res = backend()->onAcquireBuffer(iter.second.get(), Backend::DYNAMIC);
-        if (!res) {
-            return OUT_OF_MEMORY;
-        }
-    }
-    for (auto& iter : mTempInput) {
-        backend()->onReleaseBuffer(iter.second.get(), Backend::DYNAMIC);
-    }
-    if (nullptr != mTempOutput) {
-        backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
-    }
-    mBlitPipeline = getBlitPipeline(bytes, backend(), true);
     std::map<Tensor*, std::vector<int>> collectForTensor;
     for (int i=0; i< des->regions.size(); ++i) {
         auto& slice = des->regions[i];
         if (nullptr == slice.origin) {
             continue;
         }
-        auto iter = mTempInput.find(slice.origin);
         Tensor* t = slice.origin;
-        if (iter != mTempInput.end()) {
-            t = iter->second.get();
-        }
         auto coliter = collectForTensor.find(t);
         if (coliter == collectForTensor.end()) {
             collectForTensor.insert(std::make_pair(t, std::vector<int>{i}));
@@ -316,15 +395,64 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
             coliter->second.emplace_back(i);
         }
     }

+    NSString* input_format;
+    NSString* output_format;
+    if(outputDes->dimensionFormat == MNN_DATA_FORMAT_NCHW) {
+        output_format = @"OUTPUT_FORMAT_NCHW";
+    } else if(outputDes->dimensionFormat == MNN_DATA_FORMAT_NHWC) {
+        output_format = @"OUTPUT_FORMAT_NHWC";
+    } else {
+        output_format = @"OUTPUT_FORMAT_C4NHW4";
+    }
+    std::string unitName = getUnitName(bytes);
+    mBlitPipeline.resize(collectForTensor.size());
+    int index = 0;
     for (auto& iter : collectForTensor) {
+        auto origin = iter.first;
+
+        if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NCHW) {
+            input_format = @"INPUT_FORMAT_NCHW";
+        } else if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NHWC) {
+            input_format = @"INPUT_FORMAT_NHWC";
+        } else {
+            input_format = @"INPUT_FORMAT_C4NHW4";
+        }
+        std::vector<std::string> keys = {
+            std::string([input_format UTF8String]),
+            std::string([output_format UTF8String]),
+            unitName,
+        };
+        if(iter.second.size() == 1) {
+            keys.emplace_back("direct_raster_single");
+        } else {
+            keys.emplace_back("direct_raster_multi");
+        }
+        auto pipeline = mtbn->runtime()->findPipeline(keys);
+
+        if(nullptr == pipeline) {
+            MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
+            options.preprocessorMacros = @{
+                input_format : @"1",
+                output_format : @"1",
+                @"T" : @(unitName.c_str()),
+            };
+            if(iter.second.size() == 1) {
+                pipeline = mtbn->makeComputePipelineWithSourceOption(gSingleRasterTemplate, "main0", options);
+            } else {
+                pipeline = mtbn->makeComputePipelineWithSourceOption(gMultiRasterTemplate, "main0", options);
+            }
+            mtbn->runtime()->insertPipeline(keys, pipeline);
+        }
+        mBlitPipeline[index] = pipeline;
+
         BlitInfo blit;
-        auto memory = bufferAlloc->alloc(sizeof(SamplerInfo) * iter.second.size() + 4 * sizeof(uint32_t));
+        auto memory = bufferAlloc->alloc(sizeof(SamplerInfo) * iter.second.size() + 12 * sizeof(uint32_t));
         blit.blit = std::make_pair(memory.first, memory.second);
         auto buffer = ((MetalRuntimeAllocator::MetalBufferAlloc*)memory.first)->getBuffer();

-        auto infoP = (SamplerInfo*)((uint8_t*)[buffer contents] + 4 * sizeof(uint32_t) + memory.second);
+        auto infoP = (SamplerInfo*)((uint8_t*)[buffer contents] + 12 * sizeof(uint32_t) + memory.second);

-        blit.blit = std::make_pair(memory.first, memory.second);
         uint32_t maxSize[3] = {1, 1, 1};
         for (int v=0; v<iter.second.size(); ++v) {
             auto& slice = des->regions[iter.second[v]];
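About the `12 * sizeof(uint32_t)` header the resize code above reserves in front of the SamplerInfo array: the next hunk fills it as shape[0..9], and the raster kernels read it back as three uint4 values (buf[0] = src NCHW shape, buf[1] = dst NCHW shape, buf[2] = x-limit and region count, buf + 3 = the per-region infos). A hedged C++ mirror of that layout (struct name is illustrative):

    #include <cstdint>

    struct RasterHeader {
        uint32_t srcShape[4];   // shape[0..3]: src n, c, h*w (area), 1
        uint32_t dstShape[4];   // shape[4..7]: dst n, c, h*w (area), 1
        uint32_t limit[4];      // shape[8] = maxSize[0], shape[9] = region count, rest unused
    };
    static_assert(sizeof(RasterHeader) == 12 * sizeof(uint32_t), "matches the alloc above");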
@@ -333,41 +461,42 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
             maxSize[1] = ALIMAX(maxSize[1], slice.size[1]);
             maxSize[2] = ALIMAX(maxSize[2], slice.size[2]);
         }
-        ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[0] = maxSize[0];
-        ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[1] = iter.second.size();
-        auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
+        uint32_t* shape = (uint32_t*)((uint8_t*)[buffer contents] + memory.second);
+        int origin_area = 1;
+        for(int i = 2; i < origin->shape().size(); i++) {
+            origin_area *= origin->shape()[i];
+        }
+        int output_area = 1;
+        for(int i = 2; i < output->shape().size(); i++) {
+            output_area *= output->shape()[i];
+        }
+        shape[0] = ALIMAX(1, origin->shape()[0]);
+        shape[1] = ALIMAX(1, origin->shape()[1]);
+        shape[2] = ALIMAX(1, origin_area);
+        shape[3] = 1;
+        shape[4] = ALIMAX(1, output->shape()[0]);
+        shape[5] = ALIMAX(1, output->shape()[1]);
+        shape[6] = ALIMAX(1, output_area);
+        shape[7] = 1;
+        shape[8] = maxSize[0];
+        shape[9] = iter.second.size();
+
+        auto local = [context computeBestGroupAndLocal:mBlitPipeline[index++] threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
         blit.global = local.first;
         blit.local = local.second;
         mTempInputCopy.insert(std::make_pair(iter.first, blit));
     }
-    for (auto b : mShapeTemp) {
-        mtbn->returnConstBuffer(b);
-    }
-    mShapeTemp.clear();
-    for (int i = 0; i < mTempInput.size(); ++i) {
-        id<MTLBuffer> shape = mtbn->getConstBuffer(0);
-        mShapeTemp.emplace_back(std::move(shape));
-    }
-    if (nullptr != mTempOutput) {
-        mShapeTemp.emplace_back(mtbn->getConstBuffer(0));
-    }
     return NO_ERROR;
 }

 void MetalRaster::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {

     auto backend = static_cast<MetalBackend *>(this->backend());
     auto context = (__bridge MNNMetalContext *)backend->context();
-    int out_offset = TensorUtils::getDescribe(outputs[0])->extra.offset;
-    if (nullptr != mTempOutput) {
-        out_offset = TensorUtils::getDescribe(mTempOutput.get())->extra.offset;
-    }
     if (mNeedZero) {
-        size_t sizeInBytes;
-        if (mTempOutput != nullptr) {
-            sizeInBytes = backend->getTensorSizeInBytes(mTempOutput.get());
-        } else {
-            sizeInBytes = backend->getTensorSizeInBytes(outputs[0]);
-        }
+        size_t sizeInBytes = backend->getTensorSizeInBytes(outputs[0]);
         size_t size = sizeInBytes / (4 * sizeof(int32_t));
         auto ptr = (MemsetInfo*)[mZeroCopy contents];
         ptr->size[0] = (uint32_t)size;
@@ -376,28 +505,33 @@ void MetalRaster::onEncode(const std::vector<Tensor *> &inputs, const std::vecto
         [encoder setBuffer: mZeroCopy offset:0 atIndex: 1];
         [encoder dispatchThreadgroups:MTLSizeMake(UP_DIV(size, 256), 1, 1) threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
     }

+    bool singlePipeline = false;
     int index = 0;
-    for (auto& iter : mTempInput) {
-        backend->onCopyBuffer(iter.first, iter.second.get(), encoder, mShapeTemp[index++]);
+    if(mBlitPipeline.size() == 1) {
+        singlePipeline = true;
+        [encoder setComputePipelineState:mBlitPipeline[0]];
+    } else {
+        MNN_ASSERT(mTempInputCopy.size() == mBlitPipeline.size());
     }

-    [encoder setComputePipelineState:mBlitPipeline];
     for (auto& iter : mTempInputCopy) {
+        if(!singlePipeline) {
+            [encoder setComputePipelineState:mBlitPipeline[index++]];
+        }
         MetalBackend::setTensor(iter.first, encoder, 0);
         MetalBackend::setTensor(mOutputPtr, encoder, 1);
         auto& blit = iter.second;
         auto buffer = ((MetalRuntimeAllocator::MetalBufferAlloc*)blit.blit.first)->getBuffer();
         [encoder setBuffer: buffer offset:blit.blit.second atIndex: 2];

         [encoder dispatchThreadgroups:blit.global threadsPerThreadgroup:blit.local];
     }
-    if (nullptr != mTempOutput) {
-        backend->onCopyBuffer(mTempOutput.get(), outputs[0], encoder, mShapeTemp[index]);
-    }
 }

 class MetalRasterCreator : public MetalBackend::Creator {
 public:
     virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend, const std::vector<Tensor *>& outputs) const {

         return new MetalRaster(backend);
     }
 };
@@ -167,6 +167,65 @@ kernel void conv1x1_g1z4_w4(const device ftype4 *in [[buffer(0)]],
     //if (computeSize > 3) {xy_out[3] = activate(ftype4(result3), cst.activation); }
 }

+kernel void conv1x1_g1z4_m1w4(const device ftype4 *in [[buffer(0)]],
+                              device ftype4 *out [[buffer(1)]],
+                              constant conv1x1_constants& cst [[buffer(2)]],
+                              const device MNN::uchar4x2 *wt [[buffer(3)]],
+                              const device ftype4 *biasTerms [[buffer(4)]],
+                              const device float4 *dequantScale [[buffer(5)]],
+                              uint3 gid[[threadgroup_position_in_grid]],
+                              uint tiisg[[thread_index_in_simdgroup]],
+                              uint sgitg[[simdgroup_index_in_threadgroup]]) {
+    int uz = gid.x * 2 + sgitg;
+
+    int rx = gid.y;
+    auto xy_wt = wt + uz * cst.input_slice;
+    auto xy_in0 = in + (int)gid.z * cst.input_size + rx + 0;
+    auto xy_out = out + (int)gid.z * cst.output_size + uz * cst.output_size * cst.batch + rx;
+    auto biasValue = FLOAT4(biasTerms[uz]);
+    FLOAT4 result0 = FLOAT4(0);
+
+    int block = (cst.input_slice + cst.block_size - 1) / cst.block_size;
+    for (int bi=0; bi<cst.block_size; bi++) {
+        FLOAT4 scale = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 0]);
+        FLOAT4 dequant_bias = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 1]);
+        int zmin = bi * block;
+        int zmax = min(zmin + block, cst.input_slice);
+        for (int z = zmin + tiisg; z < zmax; z+=SIMD_GROUP_WIDTH) {
+            auto in40 = (FLOAT4)*(xy_in0 + z * cst.input_size * cst.batch);
+            MNN::uchar4x2 w_int4 = xy_wt[z];
+
+            FLOAT4x4 w_dequant;
+            for (int i = 0; i < 4; ++i) {
+                FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8);
+                FLOAT4 res = w4 * scale[i] + dequant_bias[i];
+                w_dequant[i] = res;
+            }
+
+            result0 += FLOAT4(in40 * w_dequant);
+
+            // FLOAT4x4 w_dequant;
+            // for (int i = 0; i < 4; ++i) {
+            //     FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8);
+            //     FLOAT4 res = w4 * scale[i] + dequant_bias[i];
+            //     w_dequant[i] = w4;
+            // }
+            //
+            // FLOAT4 temp = FLOAT4(in40 * w_dequant);
+            // result0 += temp * scale + (in40.x + in40.y + in40.z + in40.w) * dequant_bias;
+        }
+    }
+    FLOAT4 res;
+    res.x = simd_sum(result0.x);
+    res.y = simd_sum(result0.y);
+    res.z = simd_sum(result0.z);
+    res.w = simd_sum(result0.w);
+    /* true */
+    if (tiisg == 0) {
+        xy_out[0] = activate(ftype4(res + biasValue), cst.activation);
+    }
+}
+
 kernel void conv1x1_g1z8(const device ftype4 *in [[buffer(0)]],
                          device ftype4 *out [[buffer(1)]],
                          constant conv1x1_constants& cst [[buffer(2)]],
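On the reduction at the end of conv1x1_g1z4_m1w4 above: each lane of the 32-wide SIMD group accumulates a partial dot product over the input slices it visits (z advances by SIMD_GROUP_WIDTH), and simd_sum folds the 32 partials into one value that lane 0 writes out. A CPU sketch of the same pattern, with made-up partials:

    #include <cstdio>

    int main() {
        const int lanes = 32;                               // SIMD_GROUP_WIDTH in the shaders
        float partial[lanes];
        for (int t = 0; t < lanes; ++t) partial[t] = 0.5f;  // stand-in per-lane partial sums
        float sum = 0.0f;                                   // what simd_sum produces on the GPU
        for (int t = 0; t < lanes; ++t) sum += partial[t];
        std::printf("reduced = %f\n", sum);
        return 0;
    }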
@@ -5,6 +5,7 @@ using namespace metal;
 // Macro
 // –––––––––––––––––––––––––––––––––––––––––––––––––––

+#define SIMD_GROUP_WIDTH 32 // setting SIMD group size is 32
 #define UP_DIV(x, y) ( ((x) + (y) - 1) / (y) )
 #define ROUND_UP(x, y) ( ((x) + (y) - 1) / (y) * (y) )

@@ -147,3 +147,46 @@ kernel void layernorm_x4_rms(const device ftype4 *in [[buffer(0)]],
         out_data[gid.x] = (ftype4)(norm);
     }
 }
+
+kernel void layernorm_m1x4_rms(const device ftype4 *in [[buffer(0)]],
+                               device ftype4 *out [[buffer(1)]],
+                               constant layernorm_constants& cst [[buffer(2)]],
+                               const device float4 *gamma [[buffer(3)]],
+                               const device float4 *beta [[buffer(4)]],
+                               uint gid [[threadgroup_position_in_grid]],
+                               uint tiisg[[thread_index_in_simdgroup]],
+                               uint sgitg[[simdgroup_index_in_threadgroup]]) {
+
+    int total_idx = (gid * 4 + sgitg);
+    int in_idx = total_idx % (cst.inside/4);
+    int out_idx = total_idx / (cst.inside/4);
+
+    auto in_data = in + out_idx * cst.inside/4;
+    auto out_data = out + out_idx * cst.inside/4;
+
+    float square_sum = 0.0f;
+
+    for(int i = tiisg; i < cst.inside/4; i+=SIMD_GROUP_WIDTH) {
+        ftype4 data = in_data[i];
+        float dis = data.x;
+        square_sum += dis * dis;
+        dis = data.y;
+        square_sum += dis * dis;
+        dis = data.z;
+        square_sum += dis * dis;
+        dis = data.w;
+        square_sum += dis * dis;
+    }
+    square_sum = simd_sum(square_sum);
+
+    if(tiisg == 0) {
+        float var = 1.0 / sqrt(square_sum / cst.inside + cst.eps);
+
+        float4 norm = var * ((float4)in_data[in_idx]);
+        if(cst.has_gamma_beta) {
+            out_data[in_idx] = (ftype4)(norm * gamma[in_idx] + beta[in_idx]);
+        } else {
+            out_data[in_idx] = (ftype4)(norm);
+        }
+    }
+}
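The layernorm_m1x4_rms kernel above is an RMS norm specialized for a single row: y = x / sqrt(mean(x^2) + eps), optionally scaled and shifted by gamma/beta. A scalar C++ sketch of the formula (no SIMD-group reduction):

    #include <cmath>
    #include <vector>

    // RMS normalization of one row, matching the kernel's math on the CPU.
    static void rmsNorm(std::vector<float>& x, float eps) {
        double squareSum = 0.0;
        for (float v : x) squareSum += double(v) * v;
        float inv = 1.0f / std::sqrt(float(squareSum / x.size()) + eps);
        for (float& v : x) v *= inv;
    }

On the GPU the per-lane partial square sums are combined with simd_sum, and lane 0 applies the inverse root and writes the result.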
@@ -111,7 +111,7 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const
     }
 #endif

-    if (deviceName.find("QUALCOMM Adreno") != std::string::npos) {
+    if (deviceName.find("QUALCOMM Adreno") != std::string::npos || deviceName.find("Qualcomm") != std::string::npos) {
         mGpuType = ADRENO;

         // if device is QUALCOMM's and version is 2.0 , set spacial optimized param
@@ -7,7 +7,8 @@
 //

 #include "backend/opencl/core/runtime/OpenCLWrapper.hpp"
-#ifdef WIN32
+#ifdef _WIN32
+#include <windows.h>
 #include <libloaderapi.h>
 #else
 #include <dlfcn.h>
@@ -94,7 +95,7 @@ bool OpenCLSymbols::LoadOpenCLLibrary() {

 bool OpenCLSymbols::UnLoadOpenCLLibrary() {
     if (handle_ != nullptr) {
-#if defined(WIN32)
+#if defined(_WIN32)
         if (FreeLibrary(handle_) == 0) {
 #else
         if (dlclose(handle_) != 0) {
@@ -129,7 +130,7 @@ bool OpenCLSymbols::isGlError() {


 bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) {
-#if defined(WIN32)
+#if defined(_WIN32)
     handle_ = LoadLibraryA(library_path.c_str());
     if (handle_ == nullptr) {
         return false;