MNN:Sync: Sync Internal 2.9.6

This commit is contained in:
xiaying 2024-10-14 19:26:28 +08:00
parent f830294eef
commit 860fceb3ab
147 changed files with 6036 additions and 2814 deletions

View File

@ -25,6 +25,14 @@ set(CMAKE_MODULE_PATH
${CMAKE_MODULE_PATH}
"${CMAKE_CURRENT_LIST_DIR}/cmake"
)
if(WIN32)
if(NOT MSVC)
set(CMAKE_MSVC_RUNTIME_LIBRARY "")
set(MSVC_RUNTIME_LIBRARY "")
endif()
endif()
# build options
option(MNN_USE_SYSTEM_LIB "For opencl and vulkan, use system lib or use dlopen" OFF)
option(MNN_BUILD_HARD "Build -mfloat-abi=hard or not" OFF)
@ -198,7 +206,7 @@ option(MNN_METAL "Enable Metal" OFF)
option(MNN_OPENCL "Enable OpenCL" OFF)
option(MNN_OPENGL "Enable OpenGL" OFF)
option(MNN_VULKAN "Enable Vulkan" OFF)
option(MNN_ARM82 "Enable ARM82" OFF)
option(MNN_ARM82 "Enable ARMv8.2's FP16 Compute" ON)
option(MNN_ONEDNN "Enable oneDNN" OFF)
option(MNN_AVX512 "Enable AVX512" OFF)
option(MNN_CUDA "Enable CUDA" OFF)
@ -452,6 +460,12 @@ set(MNN_EXTRA_DEPENDS "")
# Add Thread dependency
find_package(Threads)
list(APPEND MNN_EXTRA_DEPENDS ${CMAKE_THREAD_LIBS_INIT})
if(WIN32)
if(NOT MSVC)
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=lld-link -lmsvcrt")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=lld-link -lmsvcrt")
endif()
endif()
if (NOT APPLE)
if(MNN_OPENMP)

36
MNN.sln
View File

@ -1,36 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.5.002.0
MinimumVisualStudioVersion = 10.0.40219.1
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "3rd_party", "3rd_party", "{5CD18987-C4CA-49D5-942F-14B15F46B1ED}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "flatbuffers", "flatbuffers", "{89B04BB7-86CB-4D4F-B65C-C3D0995DBD36}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{797AC14A-1653-469D-A240-76EF0F36E60A}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FlatBuffers.Test", "3rd_party\flatbuffers\tests\FlatBuffers.Test\FlatBuffers.Test.csproj", "{E5A80CC7-62B1-4887-B637-455F34CCC9B3}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{89B04BB7-86CB-4D4F-B65C-C3D0995DBD36} = {5CD18987-C4CA-49D5-942F-14B15F46B1ED}
{797AC14A-1653-469D-A240-76EF0F36E60A} = {89B04BB7-86CB-4D4F-B65C-C3D0995DBD36}
{E5A80CC7-62B1-4887-B637-455F34CCC9B3} = {797AC14A-1653-469D-A240-76EF0F36E60A}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {11D826E1-518B-4BC2-8E45-03F5F48170D6}
EndGlobalSection
EndGlobal

View File

@ -1,77 +0,0 @@
//
// NEON_MNNConvRunForUnitDepthWise_BF16.S
// MNN
//
// Created by MNN on 2021/03/09.
// Copyright © 2018-2021 Alibaba Group Holding Limited
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function NEON_MNNConvRunForUnitDepthWise_BF16
//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
//Auto: r0:dst, r1:src, r2:weight, r3:fw
push {r4-r8, lr}
//Load from sp:
//r5:fh, r6:weight_y_step, r7:dilate_x_step, r8:dilate_y_step
mov r4, r3
ldr r5, [sp, #24]
ldr r6, [sp, #28]
ldr r7, [sp, #32]
ldr r8, [sp, #36]
cmp r4, #0
vmov.i32 q0, #0
beq UnitEnd
cmp r5, #0
beq UnitEnd
mov lr, #2
mul r6, lr, r6 // x6(weight_y_step in byte) = sizeof(int16_t) * weight_y_step
mul r7, lr, r7 // x7(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step
mul r8, lr, r8 // x8(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step
//dilate_y_step -> dilate_y_step - dilate_x_step*fw
mul lr, r4, r7
sub r8, r8, lr
//weight_y_step -> weight_y_step - 4*sizeof(float)*fw
mov lr, #8
mul lr, r4, lr
sub r6, r6, lr
UnitLoopH:
mov lr, r4
UnitLoopW:
vld1.16 {d2}, [r1], r7
vld1.16 {d4}, [r2]!
vshll.s16 q1, d2, #16
vshll.s16 q2, d4, #16
vmla.f32 q0, q1, q2
subs lr, lr, #1
bne UnitLoopW
subs r5, r5, #1
add r1, r1, r8
add r2, r2, r6
bne UnitLoopH
UnitEnd:
vshrn.i32 d0, q0, #16
vst1.16 {d0}, [r0]
pop {r4-r8, pc}
#endif
#endif

View File

@ -1,66 +0,0 @@
//
// NEON_MNNConvRunForUnitDepthWise_BF16.S
// MNN
//
// Created by MNN on 2021/03/09.
// Copyright © 2018-2021 Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function NEON_MNNConvRunForUnitDepthWise_BF16
//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
//Auto: x0:dst, x1:src, x2:weight, x3:fw
//x4:fh, x5:weight_y_step, x6:dilate_x_step, x7:dilate_y_step
cmp x3, #0
movi v0.4s, #0
beq UnitEnd
cmp x4, #0
beq UnitEnd
mov x9, #2
mul x5, x9, x5 // x5(weight_y_step in byte) = sizeof(int16_t) * weight_y_step
mul x6, x9, x6 // x6(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step
mul x7, x9, x7 // x7(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step
//dilate_y_step -> dilate_y_step - dilate_x_step*fw
mul x9, x3, x6
sub x7, x7, x9 // because x1 has already been auto-increased at 'ld1 {v1.4h}, [x1], x6', here we should rewind by x6*fw
//weight_y_step -> weight_y_step - 4*sizeof(int16_t)*fw
mov x9, #8
mul x9, x3, x9
sub x5, x5, x9
UnitLoopH:
mov x9, x3
UnitLoopW:
ld1 {v1.4h}, [x1], x6
ld1 {v2.4h}, [x2], #8 // 4 * sizeof(int16_t)
shll v1.4s, v1.4h, #16
shll v2.4s, v2.4h, #16
fmla v0.4s, v1.4s, v2.4s
subs x9, x9, #1
bne UnitLoopW
subs x4, x4, #1
add x1, x1, x7
add x2, x2, x5
bne UnitLoopH
UnitEnd:
shrn v0.4h, v0.4s, #16
st1 {v0.4h}, [x0]
ret
#endif

View File

@ -76,23 +76,6 @@ static void _MNNLowpToFp32(const int16_t* src, float* dst, size_t size) {
::memcpy(dst, dstTemp, sizeRemain * sizeof(float));
}
}
static void MNNConvRunForUnitDepthWiseBF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
int fx, fy;
BFVec4 dstValue(0.0f);
const int16_t* src_z = (const int16_t*)src;
const int16_t* weight_z = (const int16_t*)weight;
for (fy = 0; fy < fh; ++fy) {
const auto src_y = src_z + fy * dilateY_step;
const auto weight_y = weight_z + fy * weight_y_step;
for (fx = 0; fx < fw; ++fx) {
const auto weight_x = weight_y + 4 * fx;
const auto src_x = src_y + fx * dilateX_step;
dstValue = dstValue + BFVec4::load(src_x) * BFVec4::load(weight_x);
}
}
BFVec4::save((int16_t*)dst, dstValue);
}
static void MNNConvRunForLineDepthwiseBF16(float* dstO, const float* srcO, const float* weightO, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
@ -823,7 +806,6 @@ static CoreFunctions* gInstance = nullptr;
bool BF16Functions::init() {
gInstance = new CoreFunctions;
gInstance->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwiseBF16;
gInstance->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWiseBF16;
gInstance->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnitBF16;
gInstance->MNNFp32ToLowp = _MNNFp32ToLowp;
gInstance->MNNLowpToFp32 = _MNNLowpToFp32;
@ -890,7 +872,6 @@ bool BF16Functions::init() {
gInstance->MNNPackedMatMul = NEON_MNNPackedMatMul_BF16;
gInstance->MNNPackedMatMulRemain = NEON_MNNPackedMatMulRemain_BF16;
gInstance->MNNConvRunForLineDepthwise = NEON_MNNConvRunForLineDepthwise_BF16;
gInstance->MNNConvRunForUnitDepthWise = NEON_MNNConvRunForUnitDepthWise_BF16;
gInstance->MNNAxByClampBroadcastUnit = NEON_MNNAxByClampBroadcastC4_BF16;
#ifdef __aarch64__
cpuinfo_arm_isa gCPUInfo;

View File

@ -38,7 +38,7 @@ MNN builds with CMake; the CMake macro definitions are listed below
| MNN_OPENCL | Whether to build the `OpenCL` backend, default `OFF` |
| MNN_OPENGL | Whether to build the `OpenGL` backend, default `OFF` |
| MNN_VULKAN | Whether to build the `Vulkan` backend, default `OFF` |
| MNN_ARM82 | Whether to build the `Armv8.2` backend, default `OFF` |
| MNN_ARM82 | When compiling for ARM, whether to build the `Armv8.2` backend to support FP16 compute, default `ON` |
| MNN_ONEDNN | Whether to use `oneDNN`, default `OFF` |
| MNN_AVX512 | Whether to build the `avx512` backend, default `OFF` |
| MNN_CUDA | Whether to build the `Cuda` backend, default `OFF` |

View File

@ -22,24 +22,14 @@
```bash
mkdir build && cd build && cmake .. && make -j8
```
## Windows
## Windows (non-ARM)
- Requirements
- Microsoft Visual Studio >= 2017
- cmake >= 3.13
- powershell
- Ninja
- Build options
- Same as `Linux/MacOS`
- Steps
1. opencl/vulkan
- *(Optional)* Download GPU Caps Viewer, a tool for inspecting the local device's details (opencl, opengl, vulkan, etc.)
- Prepare SDKs and drivers
- [opencl sdk](https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases): add the opencl sdk directory to the AMDAPPSDKROOT environment variable
- [vulkan sdk](https://vulkan.lunarg.com/): add the vulkan sdk path to the VULKAN_SDK environment variable so cmake can find it
- [AMD opencl driver](https://www.amd.com/zh-hans/support)
- [NVIDIA opencl driver](https://developer.nvidia.com/opencl)
- [AMD vulkan driver](https://community.amd.com/community/gaming/blog/2016/02/16/radeon-gpus-are-ready-for-the-vulkan-graphics-api)
2. Build
- 64-bit build: find vcvars64.bat ("x64 Native Tools Command Prompt for VS 2017") and open it to enter the VS environment for building x64 binaries
- 32-bit build: find vcvarsamd64_x86.bat ("x64_x86 Cross Tools Command Prompt for VS 2017") and open it to enter the VS environment for cross-compiling x86 binaries
- In that environment, run the following build commands:
@ -53,6 +43,24 @@
- To build the model converter, add -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON to the cmake command
- To build MNN CUDA, MNN_WIN_RUNTIME_MT and MNN_BUILD_SHARED_LIBS must both be set to ON, plus -DMNN_CUDA=ON: cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON -DMNN_WIN_RUNTIME_MT=ON -DMNN_CUDA=ON
- On Windows, destroy MNN objects with Interpreter::destroy, Tensor::destroy, Module::destroy, etc.; do not call delete directly (calling delete directly breaks when -DMNN_WIN_RUNTIME_MT=ON)
## Windows (ARM)
- Requirements
- Microsoft Visual Studio >= 2017
- cmake >= 3.13
- Ninja
- Clang
- For Clang installation see: https://learn.microsoft.com/en-us/cpp/build/clang-support-msbuild?view=msvc-170#install-1
- Build options
- Same as `Linux/MacOS`
- Steps
- Open the VS ARM64 command-line tools
- Enter the MNN root directory
- mkdir build && cd build
- cmake .. -G Ninja -DCMAKE_C_COMPILER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\clang.exe" -DCMAKE_CXX_COMPILER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\clang++.exe"  -DCMAKE_LINKER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\lld.exe" -DCMAKE_BUILD_TYPE=Release
- If your Visual Studio is installed under a different path, adjust these paths accordingly
- ninja -j16
## Android
- Requirements
- cmake >= 3.10

View File

@ -39,8 +39,43 @@ MNN now also offers mnncompress, a model compression tool based on TensorFlow/Pytorch; please refer
| ADMM | Weight quantization using the ADMM method |
## Special notes on parameters for multi-input models (MNN currently only supports multi-input models whose inputs are non-image data)
| input_type | `str` | Type of the input data, "sequence" |
| path | `str` | Directory of input data used to calibrate the feature quantization coefficients | For example, the directory contains two input datasets, the subdirectories input_0 and input_1; each contains the model's input data and an input.json file. input_0 and input_1 are two input/output folders that can be generated with scripts such as testMNNFromOnnx.py; see the correctness-check part of the model conversion docs.
| Parameters that must be set | Value |
|--------------------|------|
| input_type | `str`: type of the input data, "sequence" |
| path | `str`: directory of input data used to calibrate the feature quantization coefficients |
For example, with "path": "/home/data/inputs_dir/" in quant.json, suppose the calibration dataset you built has two samples stored in the subdirectories input_0 and input_1, i.e. "/home/data/inputs_dir/input_0" and "/home/data/inputs_dir/input_1". The GetMNNInfo tool reports the model's input and output names; say the model has three inputs (data0, data1, data2) and two outputs (out1, out2). Then each of input_0 and input_1 holds six files: data0.txt, data1.txt, data2.txt, out1.txt, out2.txt and input.json. The first five file names must match the model's input/output names, and the input.json file describes the input names and their shapes (a small generation sketch follows the JSON example below):
```json
{
"inputs": [
{
"name": "data0",
"shape": [
2,
4,
64,
64
]
},
{
"name": "data1",
"shape": [
1
]
},
{
"name": "data2",
"shape": [
2,
512,
768
]
}
],
"outputs": [
"out1", "out2"
]
}
```
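Below is a minimal, hedged sketch (not part of the original tooling) of how such a calibration directory could be generated for the example above. The tensor names and shapes are copied from the JSON example; the random data, the flat whitespace-separated .txt layout, and the output sizes are assumptions — in practice out1.txt/out2.txt should hold reference outputs produced by running the source model (e.g. via testMNNFromOnnx.py).
```python
# Hypothetical helper, assuming the quant tool accepts flat whitespace-separated
# text files; names and shapes mirror the input.json example above.
import json, os
import numpy as np

root = "/home/data/inputs_dir"
shapes = {"data0": [2, 4, 64, 64], "data1": [1], "data2": [2, 512, 768]}
outputs = ["out1", "out2"]

for sample in ("input_0", "input_1"):
    d = os.path.join(root, sample)
    os.makedirs(d, exist_ok=True)
    for name, shape in shapes.items():
        # One calibration input per file, flattened to one value per line.
        np.savetxt(os.path.join(d, name + ".txt"), np.random.rand(*shape).reshape(-1))
    for name in outputs:
        # Placeholder reference outputs; replace with the source model's real outputs.
        np.savetxt(os.path.join(d, name + ".txt"), np.random.rand(16))
    with open(os.path.join(d, "input.json"), "w") as f:
        json.dump({"inputs": [{"name": n, "shape": s} for n, s in shapes.items()],
                   "outputs": outputs}, f, indent=2)
```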
## Using the quantized model
Used in the same way as the float model; inputs and outputs remain floating point

View File

@ -40,13 +40,16 @@ python llmexport.py \
├── llm.mnn
├── llm.mnn.json
├── llm.mnn.weight
├── llm.onnx
├── onnx/
├──llm.onnx
├──llm.onnx.data
├── llm_config.json
└── tokenizer.txt
```
### Features
- Supports exporting the model as an onnx or mnn model, using `--export onnx` or `--export mnn`
- Convert the model to onnx first (`--export onnx`), then convert the onnx model to mnn with the ./MNNConvert tool (see the sketch after this list): ./MNNConvert --modelFile ../transformers/llm/export/model/onnx/llm.onnx --MNNModel llm.mnn --keepInputFormat --weightQuantBits=4 -f ONNX --transformerFuse=1 --allowCustomOp
- Faster: convert directly to an mnn model with `--export mnn`. Note that you must either install pymnn or point the --mnnconvert option at the MNNConvert tool; at least one of the two is required. If pymnn is not installed and --mnnconvert is not given, llmexport.py looks for the MNNConvert tool under "../../../build/", and the MNNConvert binary must exist there.
- Supports a chat test of the model: `--test $query` returns the llm's reply
- By default onnx-slim is used to optimize the onnx model; skip this step with `--skip_slim`
- Supports merging lora weights before export; specify the lora weight directory with `--lora_path`
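A hedged sketch of the two-step export flow described above, driven from Python; the downloaded model path and working directory are placeholders, and the MNNConvert flags mirror the command in the bullet above rather than a definitive recipe.
```python
# Sketch only: paths and the downloaded model are placeholders; run from the
# transformers/llm/export directory of a local MNN checkout with MNNConvert built.
import subprocess

subprocess.run(["python", "llmexport.py",
                "--path", "/path/to/your/llm/model",   # hypothetical model dir
                "--export", "onnx"], check=True)
subprocess.run(["./MNNConvert",
                "--modelFile", "../transformers/llm/export/model/onnx/llm.onnx",
                "--MNNModel", "llm.mnn", "--keepInputFormat",
                "--weightQuantBits=4", "-f", "ONNX",
                "--transformerFuse=1", "--allowCustomOp"], check=True)
```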

View File

@ -32,80 +32,64 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig&
ScheduleConfig sConfig;
sConfig.type = type;
type = Schedule::getApprociateType(sConfig);
auto creator = MNNGetExtraRuntimeCreator(type);
MNN_ASSERT(nullptr != creator);
Backend::Info info;
info.type = type;
info.mode = Backend::Info::DIRECT;
info.numThread = numberThread;
if(type == MNN_FORWARD_OPENCL || type == MNN_FORWARD_METAL) {
info.numThread = 4;
}
mAttr->firstType = type;
auto firstIter = mRuntimes.find(mAttr->firstType);
if (firstIter == mRuntimes.end()) {
info.user = (BackendConfig*)&config;
std::shared_ptr<Runtime> bn(creator->onCreate(info));
mRuntimes[mAttr->firstType] = bn;
} else {
firstIter->second->onReset(numberThread, &config, true);
}
} else {
auto creator = MNNGetExtraRuntimeCreator(type);
if (nullptr == creator) {
MNN_ERROR("Error to find creator of %d, set CPU default\n", type);
auto rt = _getOrCreateRuntime(type, &config, numberThread);
if (rt == nullptr) {
type = MNN_FORWARD_CPU;
creator = MNNGetExtraRuntimeCreator(type);
numberThread = 1;
rt = _getOrCreateRuntime(type, &config, numberThread);
}
MNN_ASSERT(nullptr != creator);
Backend::Info info;
info.type = type;
MNN_ASSERT(nullptr != rt);
mAttr->firstType = type;
auto firstIter = mRuntimes.find(mAttr->firstType);
if (firstIter == mRuntimes.end()) {
info.mode = Backend::Info::DIRECT;
info.numThread = numberThread;
info.user = (BackendConfig*)&config;
std::shared_ptr<Runtime> bn(creator->onCreate(info));
mRuntimes[mAttr->firstType] = bn;
} else {
firstIter->second->onReset(numberThread, &config, true);
}
}
_refreshRuntime();
}
int Executor::getCurrentRuntimeStatus(RuntimeStatus statusEnum) {
return mRuntimes[mAttr->firstType]->onGetRuntimeStatus(statusEnum);
return mRuntimeInfo.first[mAttr->firstType]->onGetRuntimeStatus(statusEnum);
}
std::shared_ptr<Runtime> Executor::_getOrCreateRuntime(MNNForwardType type, const BackendConfig* config, int numberThread, bool reset) {
auto iter = mRuntimeInfo.first.find(type);
if (iter != mRuntimeInfo.first.end()) {
iter->second->onReset(numberThread, config, reset);
return iter->second;
}
// Create Backend
auto cre = MNNGetExtraRuntimeCreator(type);
if (nullptr == cre) {
return nullptr;
}
Backend::Info info;
info.type = type;
info.mode = Backend::Info::DIRECT;
info.numThread = numberThread;
info.user = (BackendConfig*)config;
std::shared_ptr<Runtime> rt(cre->onCreate(info));
if (nullptr != rt) {
mRuntimeInfo.first.insert(std::make_pair(type, rt));
}
return rt;
}
void Executor::gc(GCFlag flag) {
int level = flag == FULL ? 100 : 0;
for (auto& iter : mRuntimes) {
for (auto& iter : mRuntimeInfo.first) {
iter.second->onGabageCollect(level);
}
}
Executor::Executor(std::shared_ptr<Runtime> backend, MNNForwardType type, int numberThread) {
mRuntimes.insert(std::make_pair(type, backend));
Executor::Executor(std::shared_ptr<Runtime> runtime, MNNForwardType type, int numberThread) {
mRuntimeInfo.first.insert(std::make_pair(type, runtime));
mAttr.reset(new ExecutorAttr);
mAttr->firstType = type;
if (MNN_FORWARD_CPU != type) {
// Create Backup Backend
Backend::Info info;
info.type = MNN_FORWARD_CPU;
auto cre = MNNGetExtraRuntimeCreator(MNN_FORWARD_CPU);
info.mode = Backend::Info::DIRECT;
info.numThread = 1;
std::shared_ptr<Runtime> backupRt(cre->onCreate(info));
mRuntimes.insert(std::make_pair(DEFAULT_BACKUP_RUNTIME_KEY, backupRt));
if (type == MNN_FORWARD_CPU) {
mRuntimeInfo.second = runtime;
} else {
mRuntimeInfo.second = _getOrCreateRuntime(MNN_FORWARD_CPU, nullptr, 1);
}
mDebug.reset(new DebugTools);
BackendConfig defaultConfig;
defaultConfig.flags = 4;
std::shared_ptr<Backend> defaultBackend(mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY]->onCreate(&defaultConfig));
std::shared_ptr<Backend> defaultBackend(mRuntimeInfo.second->onCreate(&defaultConfig));
mAttr->constantBackend = defaultBackend;
_refreshRuntime();
}
Executor::~Executor(){
// Do nothing
@ -176,21 +160,6 @@ std::shared_ptr<Executor> Executor::newExecutor(MNNForwardType type,
auto executor = new Executor(runtime, type, numberThread);
return std::shared_ptr<Executor>(executor);
}
void Executor::_refreshRuntime() {
mRuntimeInfo.first.clear();
mRuntimeInfo.second = mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY];
auto firstIter = mRuntimes.find(getAttr()->firstType);
if (firstIter != mRuntimes.end()) {
mRuntimeInfo.first.insert(std::make_pair(firstIter->first, firstIter->second));
} else {
MNN_ASSERT(false);
}
for (auto& iter : mRuntimes) {
if (iter.first != getAttr()->firstType) {
mRuntimeInfo.first.insert(std::make_pair(iter.first, iter.second));
}
}
}
RuntimeInfo Executor::getRuntime() {
auto glo = ExecutorScope::Current();
@ -297,43 +266,26 @@ Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const S
auto res = new RuntimeManager;
auto glo = ExecutorScope::Current();
std::lock_guard<std::mutex> _l(glo->mMutex);
auto& originRt = glo->mRuntimes;
Backend::Info compute;
compute.type = Schedule::getApprociateType(config);
compute.numThread = config.numThread;
auto& originRt = glo->mRuntimeInfo;
auto type = Schedule::getApprociateType(config);
int numThread = config.numThread;
if(config.type == MNN_FORWARD_AUTO) {
if(compute.type == MNN_FORWARD_OPENCL || compute.type == MNN_FORWARD_METAL) {
if(type == MNN_FORWARD_OPENCL || type == MNN_FORWARD_METAL) {
// AUTO set default gpu-mode MNN_GPU_TUNING_FAST
compute.numThread = 16;
numThread = 16;
}
}
compute.user = config.backendConfig;
auto iter = originRt.find(compute.type);
if (iter == originRt.end()) {
auto creator = MNNGetExtraRuntimeCreator(compute.type);
if (nullptr == creator) {
return nullptr;
}
auto newBn = creator->onCreate(compute);
if (nullptr == newBn) {
MNN_ERROR("Can't create Runtime: %s\n", EnumNameForwardType((ForwardType)compute.type));
return nullptr;
}
originRt.insert(std::make_pair(compute.type, std::shared_ptr<Runtime>(newBn)));
} else {
iter->second->onReset(compute.numThread, compute.user, false);
}
res->mInside->mRuntime.second = originRt[DEFAULT_BACKUP_RUNTIME_KEY];
res->mInside->mRuntime.first.insert(std::make_pair(compute.type, originRt[compute.type]));
res->mInside->mInfo = originRt[compute.type];
res->mInside->mNumberThread = compute.numThread;
auto rt = glo->_getOrCreateRuntime(type, config.backendConfig, numThread, false);
res->mInside->mRuntime.second = originRt.second;
res->mInside->mRuntime.first.insert(std::make_pair(type, rt));
res->mInside->mInfo = rt;
res->mInside->mNumberThread = numThread;
if (nullptr != config.backendConfig) {
res->mInside->mConfig = *config.backendConfig;
res->mInside->mUserConfig = true;
} else {
res->mInside->mUserConfig = false;
}
glo->_refreshRuntime();
return res;
}
ExecutorAttr* Executor::getAttr() const {

View File

@ -379,6 +379,9 @@ static Module* loadInternal(const std::vector<std::string>& inputs, const std::v
if (net->extraInfo() && net->extraInfo()->version()) {
info->version = net->extraInfo()->version()->str();
}
if (net->bizCode()) {
info->bizCode = net->bizCode()->str();
}
auto rtMgr = _rtMgr;
Module::Config defaultConfig;
if (nullptr == config) {

View File

@ -598,6 +598,7 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
mSession->getInfo(Interpreter::FLOPS, &flops);
glo->getDebugTools()->flops += flops;
#endif
return outputs;
}

View File

@ -234,6 +234,8 @@ public:
// size limit of kvcache in memory (for a single layer)
// if the size of kvcache exceeds the limit, it will be moved to disk
KVCACHE_SIZE_LIMIT = 8,
// Op encoder number for commit
OP_ENCODER_NUMBER_FOR_COMMIT = 9,
};
enum ExternalPathType {

View File

@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 9
#define MNN_VERSION_PATCH 5
#define MNN_VERSION_PATCH 6
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */

View File

@ -138,12 +138,10 @@ public:
};
static bool getComputeInfo(EXPRP expr, Interpreter::SessionInfoCode code, void* ptr);
private:
void _refreshRuntime();
std::shared_ptr<Runtime> _getOrCreateRuntime(MNNForwardType type, const BackendConfig* config, int numberThread, bool reset = true);
Executor(std::shared_ptr<Runtime> backend, MNNForwardType type, int numberThread);
void _makeCache(const std::vector<EXPRP>& outputs, bool forceCPU);
// TODO: Remove mRuntimes, only use mRuntimeInfo
std::map<MNNForwardType, std::shared_ptr<Runtime>> mRuntimes;
RuntimeInfo mRuntimeInfo;
std::shared_ptr<DebugTools> mDebug;
std::map<std::string, std::shared_ptr<SubGraph>> mSubGraph;

View File

@ -93,6 +93,8 @@ public:
std::vector<std::string> outputNames;
// The MNNConvert's Version build the module
std::string version;
// The bizCode of MNN model
std::string bizCode;
};
const Info* getInfo() const;
class CloneContext {

View File

@ -158,8 +158,6 @@
4896D37825FE2A6B00717702 /* MNNExpFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37025FE2A6A00717702 /* MNNExpFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
4896D37925FE2A6B00717702 /* MNNPackedMatMulFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
4896D37A25FE2A6B00717702 /* MNNPackedMatMulRemainFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
4896D37E25FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
4896D37F25FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A172550FDC800AD896A /* MetalReduction.hpp */; };
@ -497,7 +495,6 @@
92FF02F223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; };
92FF02F423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; };
92FF02F523AA0B5A00AC97F6 /* MNNInt8ScaleToFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */; };
92FF02F623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */; };
92FF02F723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; };
92FF02F823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; };
92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; };
@ -542,7 +539,6 @@
92FF033223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; };
92FF033423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; };
92FF033523AA0B5A00AC97F6 /* MNNInt8ScaleToFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */; };
92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */; };
92FF033723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; };
92FF033823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; };
92FF033923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; };
@ -603,12 +599,10 @@
92FF03A923AA0B5A00AC97F6 /* ConvolutionGroup.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */; };
92FF03AA23AA0B5A00AC97F6 /* ConvolutionFloatFactory.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */; };
92FF03AC23AA0B5A00AC97F6 /* ResizeFunction.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */; };
92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */; };
92FF03AE23AA0B5A00AC97F6 /* ConvolutionIntFactory.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */; };
92FF03AF23AA0B5A00AC97F6 /* WinogradOptFunction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */; };
92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */; };
92FF03B123AA0B5A00AC97F6 /* ConvolutionFloatFactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */; };
92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */; };
92FF03B423AA0B5A00AC97F6 /* Convolution1x1Strassen.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */; };
92FF03B523AA0B5A00AC97F6 /* ResizeFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */; };
92FF03B623AA0B5A00AC97F6 /* StrassenMatmulComputor.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */; };
@ -790,6 +784,8 @@
CE072A262C91AF0700F190FD /* MNNC3ToYUVFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */; };
CE072A272C91AF0700F190FD /* MNNC3ToC4Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */; };
CE072A282C91AF0700F190FD /* MNNC3ToXYZFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */; };
CE072A2A2CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */; };
CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
CE125CC82A52BF6B003698C9 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */; };
CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */; };
CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; };
@ -1005,8 +1001,6 @@
4896D37025FE2A6A00717702 /* MNNExpFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNExpFP16.S; path = ../../../arm82/asm/arm64/MNNExpFP16.S; sourceTree = "<group>"; };
4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulFP16.S; sourceTree = "<group>"; };
4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulRemainFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulRemainFP16.S; sourceTree = "<group>"; };
4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23MulTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S; sourceTree = "<group>"; };
4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23SourceTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S; sourceTree = "<group>"; };
4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = Arm82MNNPackForMatMul_A.S; path = ../../../arm82/asm/arm64/Arm82MNNPackForMatMul_A.S; sourceTree = "<group>"; };
4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvRunForLineDepthwiseFP16.S; path = ../../../arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S; sourceTree = "<group>"; };
489D7A172550FDC800AD896A /* MetalReduction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalReduction.hpp; sourceTree = "<group>"; };
@ -1353,7 +1347,6 @@
92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = "<group>"; };
92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = "<group>"; };
92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNInt8ScaleToFloat.S; sourceTree = "<group>"; };
92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForUnitDepthWise.S; sourceTree = "<group>"; };
92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = "<group>"; };
92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = "<group>"; };
92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = "<group>"; };
@ -1398,7 +1391,6 @@
92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = "<group>"; };
92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = "<group>"; };
92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNInt8ScaleToFloat.S; sourceTree = "<group>"; };
92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForUnitDepthWise.S; sourceTree = "<group>"; };
92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = "<group>"; };
92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = "<group>"; };
92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = "<group>"; };
@ -1459,12 +1451,10 @@
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionGroup.hpp; sourceTree = "<group>"; };
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ConvolutionFloatFactory.h; sourceTree = "<group>"; };
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ResizeFunction.h; sourceTree = "<group>"; };
92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionDepthwise3x3.cpp; sourceTree = "<group>"; };
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionIntFactory.hpp; sourceTree = "<group>"; };
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WinogradOptFunction.hpp; sourceTree = "<group>"; };
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionGroup.cpp; sourceTree = "<group>"; };
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionFloatFactory.cpp; sourceTree = "<group>"; };
92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionDepthwise3x3.hpp; sourceTree = "<group>"; };
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Convolution1x1Strassen.cpp; sourceTree = "<group>"; };
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ResizeFunction.cpp; sourceTree = "<group>"; };
92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = StrassenMatmulComputor.hpp; sourceTree = "<group>"; };
@ -1647,6 +1637,8 @@
CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToYUVFast.S; path = arm/arm64/MNNC3ToYUVFast.S; sourceTree = "<group>"; };
CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToC4Fast.S; path = arm/arm64/MNNC3ToC4Fast.S; sourceTree = "<group>"; };
CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToXYZFast.S; path = arm/arm64/MNNC3ToXYZFast.S; sourceTree = "<group>"; };
CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNDepthwiseConvFastKernel.S; sourceTree = "<group>"; };
CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNDepthwiseConvFastKernelFP16.S; path = ../../../arm82/asm/arm64/MNNDepthwiseConvFastKernelFP16.S; sourceTree = "<group>"; };
CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = "<group>"; };
CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = "<group>"; };
CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = "<group>"; };
@ -2648,7 +2640,6 @@
92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */,
92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */,
92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */,
92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */,
92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */,
92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */,
92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */,
@ -2659,6 +2650,8 @@
92FF017C23AA0B4E00AC97F6 /* arm64 */ = {
isa = PBXGroup;
children = (
CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */,
CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */,
95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */,
95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */,
4DDD8E0F2B1D70C1005065D1 /* MNNTranspose16Bit8x8.S */,
@ -2688,8 +2681,6 @@
4D6D7FD02656891400F80814 /* MNNPackedSparseMatMulEpx4.S */,
4D6D7FCE2656890C00F80814 /* MNNPackedSparseMatMulEpx1.S */,
4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */,
4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */,
4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */,
4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */,
4896D37025FE2A6A00717702 /* MNNExpFP16.S */,
4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */,
@ -2743,7 +2734,6 @@
92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */,
92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */,
92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */,
92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */,
92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */,
92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */,
92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */,
@ -2795,12 +2785,10 @@
92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */,
92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */,
92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */,
92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */,
92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */,
92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */,
92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */,
92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */,
92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */,
92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */,
92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */,
92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */,
@ -3036,7 +3024,6 @@
4D9A937526255BDA00F9B43C /* CoreMLCommonExecution.hpp in Headers */,
4DF87C522887D3F20003E2D4 /* CPUSvd.hpp in Headers */,
48747D4B245D9D24000B9709 /* RuntimeFactory.hpp in Headers */,
92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */,
CECF8C77299CAD9400D3875B /* log_builder.h in Headers */,
4D9A937226255BDA00F9B43C /* CoreMLConvolution.hpp in Headers */,
92FF038B23AA0B5A00AC97F6 /* CPUUnravelIndex.hpp in Headers */,
@ -3394,14 +3381,12 @@
4A224A1627D0C56E000A9260 /* ConvolutionWinogradBridge.cpp in Sources */,
48747D6A245D9E33000B9709 /* GeometryStridedSlice.cpp in Sources */,
92FF04BE23AA0BFB00AC97F6 /* FileLoader.cpp in Sources */,
92FF02F623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */,
92FF042323AA0B7100AC97F6 /* ShapeScatterNd.cpp in Sources */,
92FF045A23AA0B7100AC97F6 /* ShapeBinaryOp.cpp in Sources */,
92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */,
CE072A192C91AEE700F190FD /* MNNBGRToGRAY.S in Sources */,
EBECA37B24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S in Sources */,
481C2DF525FE2CD6001ED6DF /* Arm82OptFunc.cpp in Sources */,
92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */,
4DF87C502887D3E40003E2D4 /* CPUSvd.cpp in Sources */,
92FF043523AA0B7100AC97F6 /* ShapeConvolution3D.cpp in Sources */,
92FF043923AA0B7100AC97F6 /* ShapeDequantize.cpp in Sources */,
@ -3483,6 +3468,7 @@
92FF043A23AA0B7100AC97F6 /* ShapePermute.cpp in Sources */,
489D7A8E2550FDC900AD896A /* MetalPooling.mm in Sources */,
92FF030823AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */,
CE072A2A2CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S in Sources */,
4DDE2019263809920085AC8F /* CoreMLExecutorWrapper.mm in Sources */,
EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */,
4A224A1327D0C56E000A9260 /* ConvolutionWinogradImpl.cpp in Sources */,
@ -3592,7 +3578,6 @@
4819FB3A24C69E680050BD09 /* GeometryInnerProduct.cpp in Sources */,
92FF037723AA0B5A00AC97F6 /* CPUConvolutionDepthwise.cpp in Sources */,
EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */,
4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */,
92FF02DE23AA0B5A00AC97F6 /* MNNSamplerC4BilinearOpt.S in Sources */,
48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */,
92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */,
@ -3711,6 +3696,7 @@
4D6D7FCB265688F600F80814 /* MNNPackedSparseMatMulEpx4.S in Sources */,
92FF042123AA0B7100AC97F6 /* ShapeDeconvolution.cpp in Sources */,
92FF027F23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.cpp in Sources */,
CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */,
EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */,
92FF04A423AA0BFB00AC97F6 /* Interpreter.cpp in Sources */,
CECF8C5C299CACFD00D3875B /* Log.cpp in Sources */,
@ -3771,7 +3757,6 @@
48887728215B639F0079B12E /* WingoradGenerater.cpp in Sources */,
950B28F429F629A90002F454 /* CPUBinaryInt8.cpp in Sources */,
92FF045423AA0B7100AC97F6 /* ShapeRNNSequenceGRU.cpp in Sources */,
4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */,
EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */,
CE072A1E2C91AEE700F190FD /* MNNRGBToBGR555.S in Sources */,
48C84B87250F711700EE7666 /* WhileModule.cpp in Sources */,
@ -3800,7 +3785,6 @@
92FF041C23AA0B7100AC97F6 /* ShapeNonMaxSuppressionV2.cpp in Sources */,
92FF02CE23AA0B5A00AC97F6 /* MNNPackC4.S in Sources */,
92FF037023AA0B5A00AC97F6 /* CPUPool.cpp in Sources */,
92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */,
92FF031723AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */,
4DD1793A2694076700B0098F /* MNNSoftmax.S in Sources */,
CE072A1D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S in Sources */,

View File

@ -13,7 +13,7 @@ def load_module_from_file(file_name, input_names, output_names, **kwargs):
memory_mode = kwargs.get('memory_mode', _F.MemoryMode.Normal)
power_mode = kwargs.get('power_mode', _F.PowerMode.Normal)
precision_mode = kwargs.get('precision_mode', _F.PrecisionMode.Normal)
thread_num = kwargs.get('thread_num', 4)
thread_num = kwargs.get('thread_num', 1)
module = _nn.load_module_from_file(runtime_manager, input_names, output_names, file_name, dynamic, shape_mutable, rearrange,
backend, memory_mode, power_mode, precision_mode, thread_num)
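The default `thread_num` here drops from 4 to 1. A small usage sketch, assuming the usual `MNN.nn` import path and a placeholder model file, showing how a caller can keep four threads by passing the argument explicitly.
```python
# Assumes pymnn is installed; "model.mnn" and the tensor names are placeholders.
import MNN.nn as nn

# thread_num now defaults to 1 -- request 4 explicitly if the old default mattered.
module = nn.load_module_from_file("model.mnn", ["input"], ["output"], thread_num=4)
```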

View File

@ -13,6 +13,8 @@ try:
except:
mnn_logger = None
def convert(args):
Tools.mnnconvert(args)
def parse_args():
arg_dict = {}
@ -34,7 +36,7 @@ def parse_args():
def main():
""" main funcion """
Tools.mnnconvert(sys.argv)
convert(sys.argv)
arg_dict = parse_args()

View File

@ -17,6 +17,7 @@ sys.argv = [sys.argv[0]] + unknown
IS_WINDOWS = (platform.system() == 'Windows')
IS_DARWIN = (platform.system() == 'Darwin')
IS_LINUX = (platform.system() == 'Linux')
IS_ARM = ('arm' in platform.processor())
BUILD_DIR = 'pymnn_build' # avoid overwrite temporary product when build pymnn
USE_TRT = False
@ -55,8 +56,8 @@ if len(sys.argv) > 1 and sys.argv[1] != None:
USE_OPENMP = True
if "llm" in sys.argv[1]:
USE_LLM = True
if "arm82" in sys.argv[1]:
USE_ARM82 = True
if IS_ARM: USE_ARM82 = True
print ("USE_INTERNAL:", USE_INTERNAL)
print ("USE_TRT:", USE_TRT)
@ -69,7 +70,6 @@ print ("USE_RENDER:", USE_RENDER)
print ("USE_SSE:", USE_SSE)
print ("USE_OPENMP:", USE_OPENMP)
print ("USE_LLM:", USE_LLM)
print ("USE_ARM82:", USE_ARM82)
def build_deps():
""" build depency """
@ -92,6 +92,9 @@ def build_deps():
if USE_ARM82:
extra_opts += ' -DMNN_ARM82=ON'
extra_opts += ' -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON' if USE_OPENMP else ' -DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF'
if IS_DARWIN:
# Mac / iOS System use GCD instead of MNN's thread pool
extra_opts += ' -DMNN_USE_THREAD_POOL=OFF -DMNN_METAL=ON '
if IS_WINDOWS:
os.system('cmake -G "Ninja" ' + extra_opts +' -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TORCH=OFF\

View File

@ -1,3 +1,4 @@
#include <sstream>
#include "llm/llm.hpp"
typedef struct {
@ -38,8 +39,7 @@ static PyObject* PyMNNLLM_response(LLM *self, PyObject *args) {
if (!PyArg_ParseTuple(args, "s|p", &query, &stream)) {
Py_RETURN_NONE;
}
MNN::Transformer::LlmStreamBuffer buffer(nullptr);
std::ostream null_os(&buffer);
std::ostringstream null_os;
auto res = self->llm->response(query, stream ? &std::cout : &null_os);
return string2Object(res);
}
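With this change, a non-streaming call collects the reply in a plain `std::ostringstream` instead of a discard-only `LlmStreamBuffer`, so the returned Python string carries the full response. A heavily hedged usage sketch follows; the `MNN.llm` module path and the `create()`/`load()` calls are assumptions about the pymnn LLM bindings, since only `response(query, stream)` appears in the patch.
```python
# Assumption-heavy sketch: module path, config file and helper methods may differ
# in your pymnn build; only response(query, stream) is shown in the binding above.
import MNN.llm as llm

model = llm.create("./model/config.json")      # hypothetical config path
model.load()
reply = model.response("What is MNN?", False)  # stream=False: reply returned as a str
print(reply)
```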

View File

@ -154,6 +154,7 @@ static PyObject* PyMNN_Module_get_info(PyMNN_Module *self, PyObject *args) {
}
auto res = PyDict_New();
PyDict_SetItemString(res, "version", char2Object(info->version.c_str()));
PyDict_SetItemString(res, "bizCode", char2Object(info->bizCode.c_str()));
{
auto names = PyList_New(info->inputNames.size());
for (int i=0; i<info->inputNames.size(); ++i) {
@ -379,6 +380,7 @@ static PyObject* PyMNNNN_create_runtime_manager(PyObject *self, PyObject *args)
}
for (auto i = 0; i < PySequence_Size(dicts); ++i) {
backendConfig[i].sharedContext = nullptr;
config[i].numThread = 1;
config[i].backendConfig = &backendConfig[i];
bool ret = getScheduleConfig(PySequence_GetItem(dicts, i), config[i]);
if (!ret) {
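Two Python-visible effects of this file's changes: `get_info()` now reports the model's `bizCode` next to `version` and the tensor names, and runtime managers created from a config dict default to a single thread unless a thread count is supplied. A hedged sketch of reading the new field; the model file and tensor names are placeholders.
```python
# Placeholder model/tensor names; assumes the usual MNN.nn import path.
import MNN.nn as nn

module = nn.load_module_from_file("model.mnn", ["input"], ["output"])
info = module.get_info()
print(info["version"], info.get("bizCode"))  # bizCode is newly exposed by this patch
```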

View File

@ -50,10 +50,10 @@ void MNNQuantSumFP16(float* sum, const float* dequant_scale, size_t thread, size
#endif
#if defined(__aarch64__)
void CountMinMaxValue_FP16(float* source, float* minVal, float* maxVal, size_t sizeQuad);
void MNNDepthwiseConvFastKernelFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
#endif
void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, size_t ow);
void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit);
void MNNConvRunForLineDepthwiseFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep);
@ -336,94 +336,6 @@ static void MNNAxByClampBroadcastC8FP16(float* CF, const float* AF, const float*
}
}
void ARM82MultiAndDestTransformCommon(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, int cacheLineSize, int ow, const float* bias, const float* parameters) {
constexpr int pack = 8;
int unit = ow / 2;
auto biasF = Vec::load((const float16_t*)bias);
auto minF = Vec(parameters[2]);
auto maxF = Vec(parameters[3]);
MNN_ASSERT(cacheLineSize >= 1);
for (int x = 0; x < unit; ++x) {
int offset = 4 * pack * x, i = 0;
Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1);
Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
Vec m3 = Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3);
for (i = 1; i < cacheLineSize; ++i) {
m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1);
m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
m3 = m3 + Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3);
}
auto o0 = m0 + m1 + m2 + biasF;
auto o1 = m1 - m2 + m3 + biasF;
o0 = Vec::min(maxF, o0);
o1 = Vec::min(maxF, o1);
o0 = Vec::max(minF, o0);
o1 = Vec::max(minF, o1);
Vec::save(dest + (2 * x + 0) * pack, o0);
Vec::save(dest + (2 * x + 1) * pack, o1);
}
if (unit * 2 < ow) {
int offset = 4 * pack * unit, i = 0;
Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack);
Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
for (i = 1; i < cacheLineSize; ++i) {
m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset);
m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack);
m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2);
}
auto o0 = m0 + m1 + m2 + biasF;
o0 = Vec::min(maxF, o0);
o0 = Vec::max(minF, o0);
Vec::save(dest + 2 * unit * pack, o0);
}
}
// unit: winograd unit (output is w/2)
void ARM82SourceTransformCommon(const FLOAT16 *source, FLOAT16 *dest, int unit, int iw, int pad, int su, int eu) {
constexpr int pack = 8; // float16x8
for (int x = 0; x < su; ++x) {
auto dstX = dest + 4 * pack * x;
auto sx = x * 2 - (int)pad;
auto ex = sx + 4;
auto clampSx = std::max(sx, 0);
auto clampEx = std::min(ex, (int)iw);
Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
for (int i = clampSx; i < clampEx; ++i) {
v[i - sx] = Vec::load(source + pack * i);
}
auto m0 = v[0] - v[2];
auto m1 = v[1] + v[2];
auto m2 = v[2] - v[1];
auto m3 = v[3] - v[1];
Vec::save(dstX + pack * 0, m0);
Vec::save(dstX + pack * 1, m1);
Vec::save(dstX + pack * 2, m2);
Vec::save(dstX + pack * 3, m3);
}
MNNConvDwF23SourceTransUnitFP16(source + pack * (su * 2 - pad), dest + 4 * pack * su, eu - su);
for (int x = eu; x < unit; ++x) {
auto dstX = dest + 4 * pack * x;
auto sx = x * 2 - (int)pad;
auto ex = sx + 4;
auto clampSx = std::max(sx, 0);
auto clampEx = std::min(ex, (int)iw);
Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
for (int i = clampSx; i < clampEx; ++i) {
v[i - sx] = Vec::load(source + pack * i);
}
auto m0 = v[0] - v[2];
auto m1 = v[1] + v[2];
auto m2 = v[2] - v[1];
auto m3 = v[3] - v[1];
Vec::save(dstX + pack * 0, m0);
Vec::save(dstX + pack * 1, m1);
Vec::save(dstX + pack * 2, m2);
Vec::save(dstX + pack * 3, m3);
}
}
void ARM82StrassenMerge(FLOAT16* c11, FLOAT16* c12, FLOAT16* c21, FLOAT16* c22, FLOAT16* xAddr,
size_t cStride, size_t eSub, size_t hSub) {
const int pack = 8;
@ -516,24 +428,6 @@ void MNNPackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, size
}
}
static void MNNConvRunForUnitDepthWiseFP16(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
int fx, fy;
Vec dstValue(0.0f);
auto src_z = (const FLOAT16*)src;
auto weight_z = (const FLOAT16*)weight;
for (fy = 0; fy < fh; ++fy) {
auto src_y = src_z + fy * dilateY_step;
auto weight_y = weight_z + fy * weight_y_step;
for (fx = 0; fx < fw; ++fx) {
auto weight_x = weight_y + 8 * fx;
auto src_x = src_y + fx * dilateX_step;
dstValue = dstValue + Vec::load(src_x) * Vec::load(weight_x);
}
}
Vec::save((FLOAT16*)dst, dstValue);
}
static void _MNNDeconvRunForUnitDepthWise(const FLOAT16* dst, FLOAT16* src, const FLOAT16* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
int fx, fy;
@ -706,12 +600,8 @@ bool Arm82Functions::init() {
FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnit, MNNUnPackC8FP16);
FUNC_PTR_ASSIGN(gInstance->MNNPackCUnitTranspose, MNNPackTransposeInt16C8);
FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnitTranspose, MNNUnpackTransposeInt16C8);
FUNC_PTR_ASSIGN(gInstance->MNNConvRunForUnitDepthWise, MNNConvRunForUnitDepthWiseFP16);
FUNC_PTR_ASSIGN(gInstance->MNNConvRunForLineDepthwise, MNNConvRunForLineDepthwiseFP16);
FUNC_PTR_ASSIGN(gInstance->MNNAxByClampBroadcastUnit, MNNAxByClampBroadcastC8FP16);
FUNC_PTR_ASSIGN(gInstance->MNNConvDwF23MulTransUnit, MNNConvDwF23MulTransUnitFP16);
FUNC_PTR_ASSIGN(gInstance->MNNSourceTransformCommonF23, ARM82SourceTransformCommon);
FUNC_PTR_ASSIGN(gInstance->MNNMultiAndDestTransformCommon23, ARM82MultiAndDestTransformCommon);
FUNC_PTR_ASSIGN(gInstance->MNNMatrixSub, MNNMatrixSubFP16);
FUNC_PTR_ASSIGN(gInstance->MNNMatrixAdd, MNNMatrixAddFP16);
FUNC_PTR_ASSIGN(gInstance->MNNStrassenMergeCFunction, ARM82StrassenMerge);
@ -754,6 +644,7 @@ bool Arm82Functions::init() {
FUNC_PTR_ASSIGN(gInstance->MNNCountMaxMinValue, ARM82CountMinMaxValue);
#endif
FUNC_PTR_ASSIGN(gInstance->MNNSumByAxisLForMatmul_A, origin->MNNSumByAxisLForMatmul_A);
FUNC_PTR_ASSIGN(gInstance->MNNDepthwiseConvFastKernel, MNNDepthwiseConvFastKernelFP16);
#endif
FUNC_PTR_ASSIGN(gInstance->MNNPackC4ForMatMul_A, Arm82MNNPackForMatMul_A);
FUNC_PTR_ASSIGN(gInstance->MNNGetMatMulPackMode, Arm82MNNGetMatMulPackMode);

View File

@ -5,7 +5,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?")
file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm32/*")
add_library(MNN_Arm82 OBJECT ${MNN_ARM82_SRCS} ${MNN_ARM82_SRCS_ASM})
target_compile_options(MNN_Arm82 PRIVATE -march=armv8.2-a+fp16 -mfpu=neon-fp-armv8 -mfloat-abi=softfp -DENABLE_ARMV82)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64")
file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm64/*")
if (MNN_LOW_MEMORY)
file(GLOB MNN_ARM82_SRCS_ASM ${MNN_ARM82_SRCS_ASM} ${CMAKE_CURRENT_LIST_DIR}/asm/arm64/low_memory/*)

View File

@ -1,147 +0,0 @@
//
// MNNConvDwF23MulTransUnitFP16.S
// MNN
//
// Created by MNN on 2019/4/4.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNConvDwF23MulTransUnitFP16
//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weigth, FLOAT16 *dest, size_t ow);
//Auto: r0:cacheLine, r1:weight, r2:dest, r3:ow
push {r4-r11, lr}
ldr r8, [sp, #36] // biasPtr
ldr r9, [sp, #40] // postParameters
ldr r10, [r9, #8] // minF
ldr r11, [r9, #12] // maxF
vpush {q4-q7}
ldr r4, [r0, #0]
ldr r5, [r0, #4]
ldr r6, [r0, #8]
vld1.16 {q4, q5}, [r1]!
vld1.16 {q6, q7}, [r1]!
vld1.16 {q8, q9}, [r1]!
L2:
cmp r3, #2
blt L1
LoopL2:
mov r7, r1
vld1.16 {q12, q13}, [r4]!
vmul.f16 q0, q4, q12
vld1.16 {q14, q15}, [r4]!
vmul.f16 q1, q5, q13
vld1.16 {q10, q11}, [r7]!
vmul.f16 q2, q6, q14
vld1.16 {q12, q13}, [r5]!
vmul.f16 q3, q7, q15
vmla.f16 q0, q8, q12
vld1.16 {q14, q15}, [r5]!
vmla.f16 q1, q9, q13
vmla.f16 q2, q10, q14
vmla.f16 q3, q11, q15
vld1.16 {q10, q11}, [r7]!
vld1.16 {q12, q13}, [r6]!
vmla.f16 q0, q10, q12
vmla.f16 q1, q11, q13
vld1.16 {q10, q11}, [r7]!
vadd.f16 q0, q1, q0
vld1.16 {q14, q15}, [r6]!
vmla.f16 q2, q10, q14
vmla.f16 q3, q11, q15
vadd.f16 q0, q0, q2
vadd.f16 q3, q3, q1
vsub.f16 q1, q3, q2
vld1.32 {q10}, [r8]
vdup.32 q11, r10
vdup.32 q12, r11
vcvt.f16.f32 d22, q11
vcvt.f16.f32 d24, q12
vmov.32 d23, d22
vmov.32 d25, d24
vadd.f16 q0, q10, q0
vadd.f16 q1, q10, q1
vmin.f16 q0, q12, q0
vmin.f16 q1, q12, q1
vmax.f16 q0, q11, q0
vmax.f16 q1, q11, q1
vst1.16 {q0, q1}, [r2]!
sub r3, r3, #2
cmp r3, #2
bge LoopL2
L1:
cmp r3, #0
beq End
mov r7, r1
mov r12, #32
vld1.16 {q12, q13}, [r4]!
vmul.f16 q0, q4, q12
vld1.16 {q14}, [r4]!
vmul.f16 q1, q5, q13
vld1.16 {q10}, [r7], r12
vmul.f16 q2, q6, q14
vld1.16 {q12, q13}, [r5]!
vmla.f16 q0, q8, q12
vld1.16 {q14}, [r5]!
vmla.f16 q1, q9, q13
vmla.f16 q2, q10, q14
vld1.16 {q10, q11}, [r7]!
vld1.16 {q12, q13}, [r6]!
vmla.f16 q0, q10, q12
vmla.f16 q1, q11, q13
vld1.16 {q10}, [r7]
vld1.16 {q14}, [r6]!
vmla.f16 q2, q10, q14
vadd.f16 q0, q1, q0
vadd.f16 q0, q0, q2
vld1.32 {q10}, [r8]
vdup.32 q11, r10
vdup.32 q12, r11
vcvt.f16.f32 d22, q11
vcvt.f16.f32 d24, q12
vmov.32 d23, d22
vmov.32 d25, d24
vadd.f16 q0, q10, q0
vmin.f16 q0, q12, q0
vmax.f16 q0, q11, q0
vst1.16 {q0}, [r2]!
End:
vpop {q4-q7}
pop {r4-r11, pc}
#endif
#endif

View File

@ -1,60 +0,0 @@
//
// MNNConvDwF23SourceTransUnitFP16.S
// MNN
//
// Created by MNN on 2019/4/4.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNConvDwF23SourceTransUnitFP16
// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit);
//Auto:
//r0: source, r1:dest, r2:unit
push {lr}
L1:
cmp r2, #0
beq End
vld1.16 {q8, q9}, [r0]!
vld1.16 {q10, q11}, [r0]!
subs r2, r2, #1
vsub.f16 q0, q8, q10
vadd.f16 q1, q9, q10
beq L1LoopEnd
L1Loop:
vsub.f16 q2, q10, q9
vst1.16 {q0, q1}, [r1]!
vsub.f16 q3, q11, q9
vmov.i32 q8, q10
vst1.16 {q2, q3}, [r1]!
vmov.i32 q9, q11
vld1.16 {q10, q11}, [r0]!
vsub.f16 q0, q8, q10
vadd.f16 q1, q9, q10
subs r2, r2, #1
bne L1Loop
L1LoopEnd:
vsub.f16 q2, q10, q9
vsub.f16 q3, q11, q9
vst1.16 {q0, q1}, [r1]!
vst1.16 {q2, q3}, [r1]!
End:
pop {pc}
#endif
#endif

View File

@ -16,26 +16,35 @@
asm_function MNNConvRunForLineDepthwiseFP16
//void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup,
// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep)
// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep,
// const float* bias, const float* parameters)
//Auto Load:
//r0:dst, r1:src, r2:weight, r3:width
push {r4-r11, lr}
push {r4-r8, r10, r11, lr}
//Load From Sp
//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, r9: height, r10:srcHStep, r11:dstHStep
ldr r4, [sp, #36]
ldr r5, [sp, #40]
ldr r6, [sp, #44]
ldr r7, [sp, #48]
ldr r8, [sp, #52]
ldr r9, [sp, #56]
ldr r10, [sp, #60]
ldr r11, [sp, #64]
//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, lr: height, r10:srcHStep, r11:dstHStep
ldr r4, [sp, #32]
ldr r5, [sp, #36]
ldr r6, [sp, #40]
ldr r7, [sp, #44]
ldr r8, [sp, #48]
ldr lr, [sp, #52]
ldr r10, [sp, #56]
ldr r11, [sp, #60]
ldr r12, [sp, #64] // bias
vld1.32 {q0}, [r12] // bias
ldr r12, [sp, #68] // min,max
vld1.32 {d2[0]}, [r12]!
vld1.32 {d2[1]}, [r12]
vpush {q4-q7}
vmov.f32 q5, q0 // bias
vdup.f32 q4, d2[0] // min
vdup.f32 q6, d2[1] // max
mov r12, #2 // sizeof(FLOAT16)
mul r4, r12, r4
@ -49,7 +58,7 @@ mul r12, r5, r7
sub r8, r8, r12
LoopDY:
push {r0, r1, r3, r9, r10, r11}
push {r0, r1, r3, r10, r11, lr}
L8:
cmp r3, #7
@ -59,18 +68,18 @@ mov r12, #8
mul r12, r4, r12
L8Loop:
vmov.i32 q8, #0
vmov.i32 q9, #0
vmov.i32 q10, #0
vmov.i32 q11, #0
vmov.i32 q12, #0
vmov.i32 q13, #0
vmov.i32 q14, #0
vmov.i32 q15, #0
vmov.f32 q8, q5 // use bias to init
vmov.f32 q9, q5
vmov.f32 q10, q5
vmov.f32 q11, q5
vmov.f32 q12, q5
vmov.f32 q13, q5
vmov.f32 q14, q5
vmov.f32 q15, q5
vmov.i32 d14[0], r1
vmov.i32 d14[1], r2
mov r9, r6
mov lr, r6
L8LoopH:
mov r10, r5
L8LoopW:
@ -98,11 +107,27 @@ L8Loop:
bne L8LoopW
L8LoopWEnd:
subs r9, r9, #1
subs lr, lr, #1
add r1, r1, r8
bne L8LoopH
sub r3, r3, #8
vmax.f32 q8, q8, q4
vmax.f32 q9, q9, q4
vmax.f32 q10, q10, q4
vmax.f32 q11, q11, q4
vmax.f32 q12, q12, q4
vmax.f32 q13, q13, q4
vmax.f32 q14, q14, q4
vmax.f32 q15, q15, q4
vmin.f32 q8, q8, q6
vmin.f32 q9, q9, q6
vmin.f32 q10, q10, q6
vmin.f32 q11, q11, q6
vmin.f32 q12, q12, q6
vmin.f32 q13, q13, q6
vmin.f32 q14, q14, q6
vmin.f32 q15, q15, q6
vst1.16 {q8, q9}, [r0]!
vmov.i32 r1, d14[0]
vmov.i32 r2, d14[1]
@ -121,14 +146,14 @@ mov r12, #4
mul r12, r4, r12
L4Loop:
vmov.i32 q8, #0
vmov.i32 q9, #0
vmov.i32 q10, #0
vmov.i32 q11, #0
vmov.f32 q8, q5
vmov.f32 q9, q5
vmov.f32 q10, q5
vmov.f32 q11, q5
vmov.i32 d8[0], r1
vmov.i32 d9[0], r2
mov r9, r6
vmov.i32 d14[0], r1
vmov.i32 d14[1], r2
mov lr, r6
L4LoopH:
mov r10, r5
L4LoopW:
@ -147,14 +172,22 @@ L4Loop:
add r1, r1, r7
bne L4LoopW
subs r9, r9, #1
subs lr, lr, #1
add r1, r1, r8
bne L4LoopH
vmax.f32 q8, q8, q4
vmax.f32 q9, q9, q4
vmax.f32 q10, q10, q4
vmax.f32 q11, q11, q4
vmin.f32 q8, q8, q6
vmin.f32 q9, q9, q6
vmin.f32 q10, q10, q6
vmin.f32 q11, q11, q6
sub r3, r3, #4
vst1.16 {q8, q9}, [r0]!
vmov.i32 r1, d8[0]
vmov.i32 r2, d9[0]
vmov.i32 r1, d14[0]
vmov.i32 r2, d14[1]
vst1.16 {q10, q11}, [r0]!
add r1, r1, r12
cmp r3, #4
@ -168,8 +201,8 @@ cmp r3, #0
beq End
L1Loop:
vmov.i32 q0, #0
mov r9, r6
vmov.f32 q0, q5
mov lr, r6
mov r11, r1
mov r12, r2
L1LoopH:
@ -180,10 +213,12 @@ L1Loop:
vmla.f16 q0, q1, q2
subs r10, r10, #1
bne L1LoopW
subs r9, r9, #1
subs lr, lr, #1
add r1, r1, r8
bne L1LoopH
vmax.f32 q0, q0, q4
vmin.f32 q0, q0, q6
subs r3, r3, #1
vst1.16 {q0}, [r0]!
mov r2, r12
@ -193,16 +228,15 @@ L1Loop:
End:
pop {r0, r1, r3, r9, r10, r11}
pop {r0, r1, r3, r10, r11, lr}
add r0, r0, r11
subs r9, r9, #1
subs lr, lr, #1
add r1, r1, r10
bne LoopDY
vpop {q4-q7}
pop {r4-r11, pc}
pop {r4-r8, r10, r11, pc}
#endif
#endif
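
The rewritten arm32 FP16 line kernel above now receives the bias and the min/max pair as two extra pointer arguments, initializes its accumulators from the bias instead of zero, and clamps results before the store, so the separate post pass is no longer needed. A hedged C-level reading of the new contract, with FLOAT16 standing for the backend's half type:
// Sketch of the extended prototype; parameters points at the {min, max} clamp pair.
void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight,
                                    size_t width, size_t src_w_setup, size_t fw, size_t fh,
                                    size_t dilateX_step, size_t dilateY_step, size_t height,
                                    size_t srcHStep, size_t dstHStep,
                                    const float* bias, const float* parameters);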

View File

@ -1,122 +0,0 @@
//
// MNNConvDwF23MulTransUnitFP16.S
// MNN
//
// Created by MNN on 2019/4/4.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNConvDwF23MulTransUnitFP16
//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, size_t ow, const float* bias, const float* parameters);
//Auto: x0:cacheLine, x1:weight, x2:dest, x3:ow, x4: bias, x5: parameters
stp d10, d11, [sp, #-32]!
stp d8, d9, [sp, #16]
ld1 {v8.8h}, [x4] // bias
ldr w9, [x5, #8]
ldr w10, [x5, #12]
dup v9.4s, w9 // min
dup v10.4s, w10 // max
fcvtn v9.4h, v9.4s
fcvtn v10.4h, v10.4s
dup v9.8h, v9.h[0]
dup v10.8h, v10.h[0]
ldr x4, [x0, #0]
ldr x5, [x0, #8]
ldr x6, [x0, #16]
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], #64
ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x1]
L2:
cmp x3, #2
blt L1
LoopL2:
ld1 {v20.8h, v21.8h}, [x4], #32
fmul v0.8h, v4.8h, v20.8h
ld1 {v22.8h, v23.8h}, [x4], #32
fmul v1.8h, v5.8h, v21.8h
fmul v2.8h, v6.8h, v22.8h
ld1 {v20.8h, v21.8h}, [x5], #32
fmul v3.8h, v7.8h, v23.8h
fmla v0.8h, v16.8h, v20.8h
ld1 {v22.8h, v23.8h}, [x5], #32
fmla v1.8h, v17.8h, v21.8h
fmla v2.8h, v18.8h, v22.8h
fmla v3.8h, v19.8h, v23.8h
ld1 {v20.8h, v21.8h}, [x6], #32
fmla v0.8h, v28.8h, v20.8h
fmla v1.8h, v29.8h, v21.8h
fadd v0.8h, v1.8h, v0.8h
ld1 {v22.8h, v23.8h}, [x6], #32
fmla v2.8h, v30.8h, v22.8h
fmla v3.8h, v31.8h, v23.8h
fadd v0.8h, v0.8h, v2.8h
fadd v3.8h, v3.8h, v1.8h
fsub v1.8h, v3.8h, v2.8h
fadd v0.8h, v0.8h, v8.8h
fadd v1.8h, v1.8h, v8.8h
fmin v0.8h, v0.8h, v10.8h
fmin v1.8h, v1.8h, v10.8h
fmax v0.8h, v0.8h, v9.8h
fmax v1.8h, v1.8h, v9.8h
st1 {v0.8h, v1.8h}, [x2], #32
sub x3, x3, #2
cmp x3, #2
bge LoopL2
L1:
cmp x3, #0
beq End
ld1 {v20.8h, v21.8h, v22.8h}, [x4]
fmul v0.8h, v4.8h, v20.8h
fmul v1.8h, v5.8h, v21.8h
fmul v2.8h, v6.8h, v22.8h
ld1 {v20.8h, v21.8h, v22.8h}, [x5]
fmla v0.8h, v16.8h, v20.8h
fmla v1.8h, v17.8h, v21.8h
fmla v2.8h, v18.8h, v22.8h
ld1 {v20.8h, v21.8h, v22.8h}, [x6]
fmla v0.8h, v28.8h, v20.8h
fmla v1.8h, v29.8h, v21.8h
fadd v0.8h, v1.8h, v0.8h
fmla v2.8h, v30.8h, v22.8h
fadd v0.8h, v0.8h, v2.8h
fadd v0.8h, v0.8h, v8.8h
fmin v0.8h, v0.8h, v10.8h
fmax v0.8h, v0.8h, v9.8h
st1 {v0.8h}, [x2]
End:
ldp d8, d9, [sp, #16]
ldp d10, d11, [sp], #32
ret
#endif

View File

@ -1,56 +0,0 @@
//
// MNNConvDwF23SourceTransUnitFP16.S
// MNN
//
// Created by MNN on 2019/4/4.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNConvDwF23SourceTransUnitFP16
// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit);
//Auto:
//x0: source, x1:dest, x2:unit
L1:
cmp x2, #0
beq End
ld1 {v16.8h, v17.8h}, [x0], #32
ld1 {v18.8h, v19.8h}, [x0], #32
subs x2, x2, #1
fsub v0.8h, v16.8h, v18.8h
fadd v1.8h, v17.8h, v18.8h
beq L1LoopEnd
L1Loop:
fsub v2.8h, v18.8h, v17.8h
st1 {v0.8h, v1.8h}, [x1], #32
fsub v3.8h, v19.8h, v17.8h
mov v16.16b, v18.16b
st1 {v2.8h, v3.8h}, [x1], #32
mov v17.16b, v19.16b
ld1 {v18.8h, v19.8h}, [x0], #32
fsub v0.8h, v16.8h, v18.8h
fadd v1.8h, v17.8h, v18.8h
subs x2, x2, #1
bne L1Loop
L1LoopEnd:
fsub v2.8h, v18.8h, v17.8h
fsub v3.8h, v19.8h, v17.8h
st1 {v0.8h, v1.8h}, [x1], #32
st1 {v2.8h, v3.8h}, [x1], #32
End:
ret
#endif

View File

@ -15,17 +15,24 @@
asm_function MNNConvRunForLineDepthwiseFP16
//void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup,
// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep)
// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep,
// const float* bias, const float* parameters)
//Auto Load:
//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_setup, x5:fw, x6:fh, x7:dilate_x_step
//Load From sp:
//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep
//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13:parameters
ldr x8, [sp, #0]
ldr x15, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]
ldr x13, [sp, #40]
stp d8, d9, [sp, #(-16 * 3)]!
stp d10, d11, [sp, #(16 * 2)]
stp x19, x20, [sp, #(16 * 1)]
mov x9, #2 // sizeof(FLOAT16)
mul x4, x9, x4
@ -34,15 +41,30 @@ mul x8, x9, x8
mul x10, x9, x10
mul x11, x9, x11
ld1 {v8.8h}, [x12] // bias
ld1r {v10.8h}, [x13], #2 // min
ld1r {v11.8h}, [x13]
//dilate_y_step -> dilate_y_step - fw*dilate_x_step
mul x9, x5, x7
sub x8, x8, x9
.macro zero_vec x0, x1, x2, x3
movi \x0\().8h, #0
movi \x1\().8h, #0
movi \x2\().8h, #0
movi \x3\().8h, #0
.macro assign_bias x0, x1, x2, x3
mov \x0\().16b, v8.16b
mov \x1\().16b, v8.16b
mov \x2\().16b, v8.16b
mov \x3\().16b, v8.16b
.endm
.macro compare_min_max x0, x1, x2, x3, xmin, xmax
fmax \x0\().8h, \x0\().8h, \xmin\().8h
fmax \x1\().8h, \x1\().8h, \xmin\().8h
fmax \x2\().8h, \x2\().8h, \xmin\().8h
fmax \x3\().8h, \x3\().8h, \xmin\().8h
fmin \x0\().8h, \x0\().8h, \xmax\().8h
fmin \x1\().8h, \x1\().8h, \xmax\().8h
fmin \x2\().8h, \x2\().8h, \xmax\().8h
fmin \x3\().8h, \x3\().8h, \xmax\().8h
.endm
LoopDY:
@ -56,16 +78,16 @@ L16:
cmp x3, #16
blt L8
mov x12, #16
mul x12, x4, x12
mov x19, #16
mul x19, x4, x19
L16Loop:
zero_vec v16, v17, v18, v19
zero_vec v20, v21, v22, v23
zero_vec v24, v25, v26, v27
zero_vec v28, v29, v30, v31
assign_bias v16, v17, v18, v19
assign_bias v20, v21, v22, v23
assign_bias v24, v25, v26, v27
assign_bias v28, v29, v30, v31
mov x13, x1
mov x20, x1
mov x14, x2
mov x9, x6
L16LoopH:
@ -106,7 +128,7 @@ L16Loop:
ld1 {v3.8h}, [x1], x4
fmla v30.8h, v7.8h, v2.8h
fmla v31.8h, v7.8h, v3.8h
sub x1, x1, x12
sub x1, x1, x19
add x1, x1, x7
bne L16LoopW
@ -115,8 +137,12 @@ L16Loop:
bne L16LoopH
sub x3, x3, #16
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
compare_min_max v24, v25, v26, v27, v10, v11
compare_min_max v28, v29, v30, v31, v10, v11
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
add x1, x13, x12
add x1, x20, x19
cmp x3, #16
mov x2, x14
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
@ -129,14 +155,14 @@ L8:
cmp x3, #7
ble L4
mov x12, #8
mul x12, x4, x12
mov x19, #8
mul x19, x4, x19
L8Loop:
zero_vec v16, v17, v18, v19
zero_vec v20, v21, v22, v23
assign_bias v16, v17, v18, v19
assign_bias v20, v21, v22, v23
mov x13, x1
mov x20, x1
mov x14, x2
mov x9, x6
L8LoopH:
@ -161,7 +187,7 @@ L8Loop:
ld1 {v1.8h}, [x1], x4
fmla v23.8h, v1.8h, v3.8h
sub x1, x1, x12
sub x1, x1, x19
add x1, x1, x7
bne L8LoopW
@ -169,9 +195,12 @@ L8Loop:
add x1, x1, x8
bne L8LoopH
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
sub x3, x3, #8
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
add x1, x13, x12
add x1, x20, x19
mov x2, x14
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
@ -180,13 +209,13 @@ L4:
cmp x3, #4
ble L1
mov x12, #4
mul x12, x4, x12
mov x19, #4
mul x19, x4, x19
L4Loop:
zero_vec v16, v17, v18, v19
assign_bias v16, v17, v18, v19
mov x13, x1
mov x20, x1
mov x14, x2
mov x9, x6
L4LoopH:
@ -203,7 +232,7 @@ L4Loop:
ld1 {v1.8h}, [x1], x4
fmla v19.8h, v1.8h, v3.8h
sub x1, x1, x12
sub x1, x1, x19
add x1, x1, x7
bne L4LoopW
@ -211,9 +240,10 @@ L4Loop:
add x1, x1, x8
bne L4LoopH
compare_min_max v16, v17, v18, v19, v10, v11
sub x3, x3, #4
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
add x1, x13, x12
add x1, x20, x19
mov x2, x14
L1:
@ -221,10 +251,10 @@ cmp x3, #0
beq End
L1Loop:
movi v0.8h, #0
mov v0.16b, v8.16b
mov x9, x6
mov x11, x1
mov x12, x2
mov x19, x2
L1LoopH:
mov x10, x5
L1LoopW:
@ -238,8 +268,10 @@ L1Loop:
bne L1LoopH
subs x3, x3, #1
fmax v0.8h, v0.8h, v10.8h
fmin v0.8h, v0.8h, v11.8h
st1 {v0.8h}, [x0], #16
mov x2, x12
mov x2, x19
add x1, x11, x4
bne L1Loop
@ -257,7 +289,9 @@ add x0, x0, x11
add x1, x1, x10
bne LoopDY
ldp x19, x20, [sp, #(16 * 1)]
ldp d10, d11, [sp, #(16 * 2)]
ldp d8, d9, [sp], #(16 * 3)
ret
#endif

View File

@ -0,0 +1,290 @@
//
// MNNDepthwiseConvFastKernelFP16.S
// MNN
//
// Created by MNN on 2024/09/18.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNDepthwiseConvFastKernelFP16
// void MNNDepthwiseConvFastKernelFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
// size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
//Auto Load:
//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_step=pack*1, x5:fw, x6:fh, x7:dilate_x_step
//Load From sp:
//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13: minmax
ldr x8, [sp, #0]
ldr x15, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]
ldr x13, [sp, #40]
stp d14, d15, [sp, #(-16 * 9)]!
stp d12, d13, [sp, #(16 * 1)]
stp d10, d11, [sp, #(16 * 2)]
stp d8, d9, [sp, #(16 * 3)]
stp x21, x22, [sp, #(16 * 4)]
stp x19, x20, [sp, #(16 * 5)]
stp x27, x28, [sp, #(16 * 6)]
stp x25, x26, [sp, #(16 * 7)]
stp x23, x24, [sp, #(16 * 8)]
lsl x4, x4, #1 // src_w_step*sizeof(FLOAT16)
lsl x7, x7, #1 // dilate_x_step*sizeof(FLOAT16)
lsl x8, x8, #1 // dilate_y_step*sizeof(FLOAT16)
lsl x23, x10, #1 // srcHStep*sizeof(FLOAT16)
lsl x24, x11, #1 // dstHStep*sizeof(FLOAT16)
mov x20, x12 // bias
mov x26, x13 // min
add x27, x13, #2 // max
//dilate_y_step -> dilate_y_step - fw*dilate_x_step
mul x9, x5, x7
sub x8, x8, x9
mov x25, x3 // width
.macro assign_bias x0, x1, x2, x3, bv
mov \x0\().16b, \bv\().16b
mov \x1\().16b, \bv\().16b
mov \x2\().16b, \bv\().16b
mov \x3\().16b, \bv\().16b
.endm
.macro compare_min_max x0, x1, x2, x3, xmin, xmax
fmax \x0\().8h, \x0\().8h, \xmin\().8h
fmax \x1\().8h, \x1\().8h, \xmin\().8h
fmax \x2\().8h, \x2\().8h, \xmin\().8h
fmax \x3\().8h, \x3\().8h, \xmin\().8h
fmin \x0\().8h, \x0\().8h, \xmax\().8h
fmin \x1\().8h, \x1\().8h, \xmax\().8h
fmin \x2\().8h, \x2\().8h, \xmax\().8h
fmin \x3\().8h, \x3\().8h, \xmax\().8h
.endm
LoopDY:
//mov x23, x10
//mov x24, x11
mov x21, x0
mov x22, x1
L16:
cmp x3, #16
blt L8
mov x12, #-176
mov x19, #256
L16Loop:
ld1 {v8.8h}, [x20] // load bias
assign_bias v16, v17, v18, v19, v8
assign_bias v20, v21, v22, v23, v8
assign_bias v24, v25, v26, v27, v8
assign_bias v28, v29, v30, v31, v8
mov x13, x1
mov x14, x2
mov x9, x6
L16LoopH:
mov x10, x5
L16LoopW:
ld1 {v8.8h}, [x2], #16
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64
ld1 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #64
subs x10, x10, #1
fmla v16.8h, v8.8h, v0.8h
fmla v17.8h, v8.8h, v1.8h
fmla v18.8h, v8.8h, v2.8h
fmla v19.8h, v8.8h, v3.8h
fmla v20.8h, v8.8h, v4.8h
fmla v21.8h, v8.8h, v5.8h
fmla v22.8h, v8.8h, v6.8h
fmla v23.8h, v8.8h, v7.8h
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x12
fmla v24.8h, v8.8h, v9.8h
fmla v25.8h, v8.8h, v10.8h
fmla v26.8h, v8.8h, v11.8h
fmla v27.8h, v8.8h, v12.8h
fmla v28.8h, v8.8h, v0.8h
fmla v29.8h, v8.8h, v1.8h
fmla v30.8h, v8.8h, v2.8h
fmla v31.8h, v8.8h, v3.8h
bne L16LoopW
subs x9, x9, #1
add x1, x1, x8
bne L16LoopH
ld1r {v10.8h}, [x26] // min
ld1r {v11.8h}, [x27] // max
sub x3, x3, #16
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
compare_min_max v24, v25, v26, v27, v10, v11
compare_min_max v28, v29, v30, v31, v10, v11
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
add x1, x13, x19 // 16 * pack * sizeof(FLOAT16)
cmp x3, #16
mov x2, x14
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64
st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64
bge L16Loop
L8:
ld1r {v10.8h}, [x26] // min
ld1r {v11.8h}, [x27] // max
ld1 {v24.8h}, [x20] // load bias
cmp x3, #7
ble L4
mov x12, #-48
mov x19, #128
L8Loop:
assign_bias v16, v17, v18, v19, v24
assign_bias v20, v21, v22, v23, v24
mov x13, x1
mov x14, x2
mov x9, x6
L8LoopH:
mov x10, x5
L8LoopW:
ld1 {v8.8h}, [x2], #16
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], x12
subs x10, x10, #1
fmla v16.8h, v8.8h, v0.8h
fmla v17.8h, v8.8h, v1.8h
fmla v18.8h, v8.8h, v2.8h
fmla v19.8h, v8.8h, v3.8h
fmla v20.8h, v8.8h, v4.8h
fmla v21.8h, v8.8h, v5.8h
fmla v22.8h, v8.8h, v6.8h
fmla v23.8h, v8.8h, v7.8h
bne L8LoopW
subs x9, x9, #1
add x1, x1, x8
bne L8LoopH
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
sub x3, x3, #8
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
add x1, x13, x19 // 8 * pack * sizeof(FLOAT16)
mov x2, x14
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
L4:
cmp x3, #4
ble L1
mov x12, #16
mov x19, #64
L4Loop:
assign_bias v16, v17, v18, v19, v24
mov x13, x1
mov x14, x2
mov x9, x6
L4LoopH:
mov x10, x5
L4LoopW:
ld1 {v8.8h}, [x2], #16
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x12
subs x10, x10, #1
fmla v16.8h, v8.8h, v0.8h
fmla v17.8h, v8.8h, v1.8h
fmla v18.8h, v8.8h, v2.8h
fmla v19.8h, v8.8h, v3.8h
bne L4LoopW
subs x9, x9, #1
add x1, x1, x8
bne L4LoopH
compare_min_max v16, v17, v18, v19, v10, v11
sub x3, x3, #4
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
add x1, x13, x19
mov x2, x14
L1:
cmp x3, #0
beq End
mov x19, #16
L1Loop:
ld1 {v16.8h}, [x20] // assign bias
mov x13, x1
mov x14, x2
mov x9, x6
L1LoopH:
mov x10, x5
L1LoopW:
ld1 {v8.8h}, [x2], #16
ld1 {v0.8h}, [x1], #16
subs x10, x10, #1
fmla v16.8h, v8.8h, v0.8h
bne L1LoopW
subs x9, x9, #1
add x1, x1, x8
bne L1LoopH
subs x3, x3, #1
fmax v16.8h, v16.8h, v10.8h
fmin v16.8h, v16.8h, v11.8h
st1 {v16.8h}, [x0], #16
add x1, x13, x4
mov x2, x14
bne L1Loop
End:
//mov x10, x23
//mov x11, x24
//mov x0, x21
//mov x1, x22
mov x3, x25
subs x15, x15, #1
add x0, x21, x24
add x1, x22, x23
bne LoopDY
ldp x23, x24, [sp, #(16 * 8)]
ldp x25, x26, [sp, #(16 * 7)]
ldp x27, x28, [sp, #(16 * 6)]
ldp x19, x20, [sp, #(16 * 5)]
ldp x21, x22, [sp, #(16 * 4)]
ldp d8, d9, [sp, #(16 * 3)]
ldp d10, d11, [sp, #(16 * 2)]
ldp d12, d13, [sp, #(16 * 1)]
ldp d14, d15, [sp], #(16 * 9)
ret
#endif
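
For orientation, a scalar sketch of the arithmetic the FP16 fast kernel above performs for a single output row, assuming pack = 8 lanes per channel block and the stride-1, dilation-1 preconditions under which it is selected; float is used here in place of the half type purely for readability:
// Hedged reference, not the shipped code path; height/srcHStep/dstHStep looping omitted.
static void DepthwiseFastKernelRowRef(float* dst, const float* src, const float* weight,
                                      size_t width, size_t fw, size_t fh, size_t srcW,
                                      const float* bias, float minV, float maxV) {
    const size_t pack = 8;
    for (size_t x = 0; x < width; ++x) {
        float acc[8];
        for (size_t c = 0; c < pack; ++c) acc[c] = bias[c];              // init from bias
        for (size_t ky = 0; ky < fh; ++ky) {
            for (size_t kx = 0; kx < fw; ++kx) {
                const float* s = src + (ky * srcW + x + kx) * pack;      // stride 1, dilation 1
                const float* w = weight + (ky * fw + kx) * pack;
                for (size_t c = 0; c < pack; ++c) acc[c] += s[c] * w[c];
            }
        }
        for (size_t c = 0; c < pack; ++c) {                              // fused min/max clamp
            float v = acc[c] < minV ? minV : acc[c];
            dst[x * pack + c] = v > maxV ? maxV : v;
        }
    }
}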

View File

@ -108,14 +108,12 @@ stp x23, x24, [sp, #(16 * 8)]
ldr x25, [x6, #40] // xKernelSum
ldr x26, [x6, #48] // weightQuantBias
ldr x23, [x6, #56] // fp32minmax
ldr x27, [x6, #64] // blockNum
//add x24, x23, #4
mov x21, #16 // sizeof(float16_t) * PACK
mul x27, x27, x3
Start:
lsl x15, x27, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
mov x22, #48 // src_steps
ldr x27, [x6, #80] // extra scale
TILE_12:

View File

@ -109,12 +109,10 @@ stp x23, x24, [sp, #(16 * 8)]
ldr x25, [x6, #40] // xKernelSum
ldr x26, [x6, #48] // weightQuantBias
ldr x23, [x6, #56] // fp32minmax
ldr x27, [x6, #64] // blockNum
mov x21, #16 // sizeof(float16_t) * PACK
mul x27, x27, x3
Start:
lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
lsl x15, x3, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
mov x22, #48 // src_steps
ldr x27, [x6, #80] // extra scale
TILE_12:

View File

@ -150,15 +150,13 @@ stp x27, x28, [sp, #(16 * 8)]
// ldr w23, [x6, #24]
ldr x27, [x6, #40] // srcKernelSum
ldr x28, [x6, #48] // weightQuanBias
ldr x23, [x6, #64] // blockNum
ldr x14, [x6, #56] // fp32minmax
mul x23, x23, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80
mov x21, #16 // sizeof(float16_t) * UNIT
Start:
lsl x15, x23, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int8_t) = src_depth_quad * 64 = src_depth_quad << 6
lsl x15, x3, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int8_t) = src_depth_quad * 64 = src_depth_quad << 6
ldr x23, [x6, #80] // extra scale
TILE_10:
cmp x7, #10

View File

@ -130,15 +130,13 @@ stp x27, x28, [sp, #(16 * 8)]
// ldr w23, [x6, #24]
ldr x27, [x6, #40] // srcKernelSum
ldr x28, [x6, #48] // weightQuanBias
ldr x23, [x6, #64] // blockNum
ldr x14, [x6, #56] // fp32minmax
mul x23, x23, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80
mov x21, #16 // sizeof(float16_t) * UNIT
Start:
lsl x15, x23, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int4_t) = src_depth_quad * 8 * 8 * 0.5 = src_depth_quad << 5
lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int4_t) = src_depth_quad * 8 * 8 * 0.5 = src_depth_quad << 5
ldr x23, [x6, #80] // extra scale
TILE_10:
cmp x7, #10

View File

@ -42,9 +42,11 @@ ENDIF()
# ARM82 Assemblies
IF(MNN_ARM82)
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64")
target_compile_options(MNNCPU PRIVATE -DENABLE_ARMV82)
include(${CMAKE_CURRENT_LIST_DIR}/../arm82/CMakeLists.txt)
list(APPEND MNN_TARGETS MNN_Arm82)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_Arm82>)
ENDIF()
ENDIF()

View File

@ -48,7 +48,7 @@ ErrorCode CastWrapExecution::onExecute(const std::vector<Tensor*>& inputs, const
CPUCastCreator::cast(inputs[0], outputs[0], cpuBackend, convertType);
return NO_ERROR;
}
void CPURuntime::computeDivideSizes(int size, int* dst) const {
void CPUBackend::computeDivideSizes(int size, int* dst) const {
if (mGroupWithComputeRate.size() <= 1) {
// Avg divide
int length = UP_DIV(size, mThreadNumber);
@ -132,40 +132,6 @@ void CPURuntime::_bindCPUCore() const {
#endif
}
void CPURuntime::_resetGroupCompute() const {
if (mPastDecreaseHint == hint().cpuDecreaseRate) {
return;
}
mGroupWithComputeRate.clear();
if (mThreadNumber <= 1 || mPower == BackendConfig::Power_Low) {
return;
}
mPastDecreaseHint = hint().cpuDecreaseRate;
auto cpuInfo = MNNGetCPUInfo();
if (cpuInfo->groups.size() < 2) {
return;
}
float decreaseRate = (float)(hint().cpuDecreaseRate) / 100.0f;
int validCpuSize = (int)(cpuInfo->groups[cpuInfo->groups.size()-1].ids.size());
int groupIndex = (int)cpuInfo->groups.size()-2;
float maxFreq = (float)cpuInfo->groups[cpuInfo->groups.size()-1].maxFreq;
validCpuSize = ALIMIN(validCpuSize, mThreadNumber);
float totalComputeRate = 1.0f * validCpuSize;
mGroupWithComputeRate.emplace_back(std::make_pair(totalComputeRate, validCpuSize));
float currentRate = 1.0f;
while (validCpuSize < mThreadNumber && groupIndex >= 0) {
auto& group = cpuInfo->groups[groupIndex];
int selectSize = ALIMIN(mThreadNumber - validCpuSize, (int)group.ids.size());
validCpuSize += group.ids.size();
currentRate *= decreaseRate;
totalComputeRate += currentRate * selectSize;
mGroupWithComputeRate.emplace_back(std::make_pair(currentRate * selectSize, selectSize));
}
for (auto& g : mGroupWithComputeRate) {
g.first = g.first / totalComputeRate;
}
}
void CPURuntime::_resetThreadPool() {
mThreadNumber = std::max(1, mThreadNumber);
mThreadNumber = std::min(mThreadNumber, MAX_THREAD_NUMBER);
@ -179,7 +145,6 @@ void CPURuntime::_resetThreadPool() {
}
mThreadNumber = ALIMIN(ThreadPool::init(systemThreadNumber), mThreadNumber);
}
mGroupWithComputeRate.clear();
if (mThreadNumber > 1) {
mTaskIndex = ThreadPool::acquireWorkIndex();
if (-1 == mTaskIndex) {
@ -204,8 +169,6 @@ void CPURuntime::onReset(int numberThread, const BackendConfig* config, bool ful
}
mThreadNumber = numberThread;
_resetThreadPool();
// Mask Group Compute reset
mPastDecreaseHint = -1;
}
CPURuntime::CPURuntime(const Backend::Info& info) {
@ -280,7 +243,6 @@ Backend* CPURuntime::onCreate(const BackendConfig* config, Backend* origin) cons
auto cpuBn = static_cast<CPUBackend*>(origin);
mSharedDmaInfo = cpuBn->mDmaInfo;
}
_resetGroupCompute();
if (nullptr != config) {
precision = config->precision;
flags = config->flags;
@ -403,6 +365,41 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
#endif
mMemory = memory;
mRuntime = const_cast<CPURuntime*>(runtime);
mThreadNumber = mRuntime->mThreadNumber;
// Compute Group Rate
do {
if (mThreadNumber <= 1 || mRuntime->mPower == BackendConfig::Power_Low) {
break;
}
auto rate = mRuntime->hint().cpuDecreaseRate;
if (rate >= 100 || rate <= 0) {
break;
}
auto cpuInfo = MNNGetCPUInfo();
if (cpuInfo->groups.size() < 2) {
break;
}
mGroupWithComputeRate.clear();
float decreaseRate = (float)(rate) / 100.0f;
int validCpuSize = (int)(cpuInfo->groups[cpuInfo->groups.size()-1].ids.size());
int groupIndex = (int)cpuInfo->groups.size()-2;
float maxFreq = (float)cpuInfo->groups[cpuInfo->groups.size()-1].maxFreq;
validCpuSize = ALIMIN(validCpuSize, mThreadNumber);
float totalComputeRate = 1.0f * validCpuSize;
mGroupWithComputeRate.emplace_back(std::make_pair(totalComputeRate, validCpuSize));
float currentRate = 1.0f;
while (validCpuSize < mThreadNumber && groupIndex >= 0) {
auto& group = cpuInfo->groups[groupIndex];
int selectSize = ALIMIN(mThreadNumber - validCpuSize, (int)group.ids.size());
validCpuSize += group.ids.size();
currentRate *= decreaseRate;
totalComputeRate += currentRate * selectSize;
mGroupWithComputeRate.emplace_back(std::make_pair(currentRate * selectSize, selectSize));
}
for (auto& g : mGroupWithComputeRate) {
g.first = g.first / totalComputeRate;
}
} while (false);
auto dynamicAlloc = mRuntime->mSharedDmaInfo;
if (nullptr == dynamicAlloc.get()) {
mDmaInfo.reset(new CPURuntime::DynamicAllocator);
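To make the group-rate computation above concrete, a hedged worked example: with cpuDecreaseRate = 50, a 4-core big cluster, a 4-core little cluster and mThreadNumber = 8, the big cluster contributes a rate of 4.0 and the little cluster 0.5 * 4 = 2.0, so after normalization mGroupWithComputeRate holds roughly {0.67, 4} and {0.33, 4}, and computeDivideSizes hands about two thirds of the work to threads bound to the faster cores. The normalization step in isolation:
// Hedged illustration of the normalization only; numbers follow the example above.
#include <cstdio>
#include <utility>
#include <vector>
int main() {
    float decreaseRate = 50 / 100.0f;                                  // hint().cpuDecreaseRate = 50
    std::vector<std::pair<float, int>> groups = {{4.0f, 4}, {decreaseRate * 4.0f, 4}};
    float total = groups[0].first + groups[1].first;                   // 6.0
    for (auto& g : groups) g.first /= total;                           // {0.67, 4}, {0.33, 4}
    printf("%.2f %.2f\n", groups[0].first, groups[1].first);
    return 0;
}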

View File

@ -40,9 +40,6 @@ public:
void onConcurrencyEnd() const;
virtual bool onCheckInfo(Backend::Info& info) const override;
// dividedSize's length should be larger than threadNumber
void computeDivideSizes(int size, int* dst) const;
#ifdef MNN_USE_THREAD_POOL
inline bool multiThreadValid() const {
return mThreadOpen;
@ -60,9 +57,6 @@ private:
mutable int mTaskIndex = -1;
mutable bool mThreadOpen = false;
#endif
void _resetGroupCompute() const;
mutable std::vector<std::pair<float, int>> mGroupWithComputeRate;
mutable int mPastDecreaseHint = -1;
BackendConfig::MemoryMode mMemory;
BackendConfig::PowerMode mPower;
BackendConfig::PrecisionMode mPrecision;
@ -108,6 +102,8 @@ public:
// Return sizeDivide, scheduleNumber aligned memory
std::pair<int, int> multiThreadDivide(int size) const;
virtual bool onSelectDynamicAllocator(int index, int maxIndex) override;
// dividedSize's length should be larger than threadNumber
void computeDivideSizes(int size, int* dst) const;
public:
virtual MemObj* onAcquire(const Tensor* nativeTensor, StorageType storageType) override;
@ -145,7 +141,7 @@ public:
static bool addCreator(OpType t, Creator* c);
inline int threadNumber() const {
return mRuntime->mThreadNumber;
return mThreadNumber;
}
#ifdef MNN_USE_THREAD_POOL
inline bool threadOpen() const {
@ -182,6 +178,9 @@ protected:
CoreFunctions* mCoreFunctions;
CoreInt8Functions* mInt8CoreFunctions;
private:
int mThreadNumber;
std::vector<std::pair<float, int>> mGroupWithComputeRate;
std::shared_ptr<CPURuntime::DynamicAllocator> mDmaInfo;
std::shared_ptr<EagerBufferAllocator> mStaticAllocator;
CPURuntime* mRuntime;
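With the thread count and group rates now owned by CPUBackend, operators split work through the backend instead of the runtime. A hedged usage sketch, assuming multiThreadDivide keeps its documented meaning of (sizeDivide, scheduleNumber) and that computeDivideSizes writes cumulative per-thread end indices as in the depthwise code below:
// Hypothetical call site inside an Execution::onResize; names mirror the header above.
auto cpuBackend = static_cast<CPUBackend*>(backend());
std::vector<int> divides(cpuBackend->threadNumber() + 1, 0);
cpuBackend->computeDivideSizes(totalWork, divides.data() + 1);   // thread tId handles [divides[tId], divides[tId+1])
auto sizeAndSchedule = cpuBackend->multiThreadDivide(totalWork); // pair<int,int>: sizeDivide, scheduleNumber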

View File

@ -14,7 +14,6 @@
#include "core/TensorUtils.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "backend/cpu/compute/ConvOpt.h"
#include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp"
namespace MNN {
CPUConvolutionDepthwise::FloatExecution::FloatExecution(const Convolution2DCommon* common, Backend* b,
@ -129,8 +128,7 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
auto core = static_cast<CPUBackend*>(backend())->functions();
int bytes = core->bytes;
int unit = core->pack;
auto unitFunc = core->MNNConvRunForUnitDepthWise;
auto lineFunc = core->MNNConvRunForLineDepthwise;
auto kernelFunc = core->MNNConvRunForLineDepthwise;
auto postFunc = core->MNNAxByClampBroadcastUnit;
auto inputTensor = inputs[0];
auto outputTensor = outputs[0];
@ -169,72 +167,60 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
int weight_z_step = kernel_height * kernel_width * unit;
int dilateY_step = dilateY * src_width * unit;
int dilateX_step = dilateX * unit;
// Compute Mid Rect
int l = 0, t = 0, r = dst_width, b = dst_height;
for (; l * strideX - padX < 0 && l < dst_width; l++) {
// do nothing
}
for (; t * strideY - padY < 0 && t < dst_height; t++) {
// do nothing
}
for (; (r - 1) * strideX - padX + (kernel_width - 1) * dilateX >= src_width && r > l; r--) {
// do nothing
}
for (; (b - 1) * strideY - padY + (kernel_height - 1) * dilateY >= src_height && b > t; b--) {
// do nothing
}
auto postData = getPostParameters();
auto batch = inputs[0]->batch();
int total = batch * dst_depth_quad;
int numberThread = ((CPUBackend*)backend())->threadNumber();
auto rt = static_cast<const CPURuntime*>(backend()->getRuntime());
auto runBasic = [=](uint8_t* dst_z, const uint8_t* src_z, const uint8_t* weight_dz, int L, int T, int R, int B) {
for (int dy = T; dy < B; ++dy) {
auto dst_y = dst_z + dy * dst_y_step * bytes;
int srcStartY = dy * strideY - padY;
const auto src_dy = src_z + srcStartY * src_y_step * bytes;
int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY)));
int efy = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY));
for (int dx = L; dx < R; ++dx) {
auto dst_x = dst_y + unit * dx * bytes;
int srcStartX = dx * strideX - padX;
const auto src_dx = src_dy + srcStartX * unit * bytes;
int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX)));
int efx = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX));
unitFunc((float*)dst_x, (const float*)(src_dx + (sfx * dilateX + sfy * dilateY * src_width) * unit * bytes),
(const float*)(weight_dz + unit * (kernel_width * sfy + sfx) * bytes), efx - sfx, efy - sfy,
unit * kernel_width, dilateX_step, dilateY_step);
}
}
};
std::vector<int> divides(numberThread+1);
divides[0] = 0;
rt->computeDivideSizes(total, divides.data()+1);
mExecutor = [=](const uint8_t* srcOrigin, uint8_t* dstOrigin, int tId) {
static_cast<CPUBackend *>(backend())->computeDivideSizes(total, divides.data()+1);
mNumber = numberThread;
auto postData = getPostParameters();
if (static_cast<CPUBackend*>(backend())->functions()->bytes < 4) {
static_cast<CPUBackend*>(backend())->functions()->MNNFp32ToLowp(postData.data() + 2, (int16_t*)(postData.data() + 2), 2);
}
mFastKernelApply = (dilateX == 1 && dilateY == 1 && strideX == 1 && strideY == 1 && core->MNNDepthwiseConvFastKernel);
if (mFastKernelApply ) { // Only support ARM kernel
kernelFunc = core->MNNDepthwiseConvFastKernel;
}
auto pads = ConvolutionCommon::convolutionPadFull(inputs[0], outputs[0], mCommon);
int paddedWidth = std::get<0>(pads) + std::get<2>(pads) + src_width;
int paddedHeight = std::get<1>(pads) + std::get<3>(pads) + src_height;
mInputPad.reset(Tensor::createDevice<float>({mNumber, paddedWidth * paddedHeight * unit}));
bool succ = backend()->onAcquireBuffer(mInputPad.get(), Backend::DYNAMIC);
if (!succ) {
return OUT_OF_MEMORY;
}
if (paddedWidth != src_width) {
dilateY_step = dilateY * paddedWidth * unit;
src_y_step = paddedWidth * unit;
}
mExecutor = [=](const uint8_t* inputPtr, uint8_t* outputPtr, int tId) {
const auto inputPadPtr = mInputPad->host<uint8_t>() + mInputPad->stride(0) * tId * bytes;
::memset(inputPadPtr, 0, mInputPad->stride(0) * bytes);
auto biasP = inputs[2]->host<uint8_t>();
auto weightP = inputs[1]->host<uint8_t>();
for (int index = divides[tId]; index < divides[tId+1]; ++index) {
int dz = index / batch;
auto dst_z = dstOrigin + dst_z_step * index * bytes;
const auto src_z = srcOrigin + src_z_step * index * bytes;
auto dstOrigin = outputPtr + dst_z_step * index * bytes;
const auto srcOrigin = inputPtr + src_z_step * index * bytes;
auto bias_z = biasP + unit * dz * bytes;
const auto weight_dz = weightP + dz * weight_z_step * bytes;
runBasic(dst_z, src_z, weight_dz, 0, 0, dst_width, t);
runBasic(dst_z, src_z, weight_dz, 0, b, dst_width, dst_height);
runBasic(dst_z, src_z, weight_dz, 0, t, l, b);
runBasic(dst_z, src_z, weight_dz, r, t, dst_width, b);
if (r > l && b > t) {
lineFunc((float*)(dst_z + (t * dst_y_step + l * unit) * bytes),
(const float*)(src_z + ((t * strideY - padY) * src_y_step + (l * strideX - padX) * unit) * bytes),
(const float*)weight_dz, r - l, strideX * unit, kernel_width, kernel_height, dilateX_step,
dilateY_step, b - t, src_y_step * strideY, dst_y_step);
auto srcPtr = srcOrigin;
// Pad inputs
for (int y = 0; y < src_height; ++y) {
auto src = srcOrigin + y * src_width * unit * bytes;
auto dst = inputPadPtr + ((y + padY) * paddedWidth + padX) * unit * bytes;
::memcpy(dst, src, src_width * unit * bytes);
}
postFunc((float*)dst_z, (float*)dst_z, (const float*)bias_z, dst_width * dst_height, 0, 0, 1, postData.data());
// Compute
kernelFunc((float*)dstOrigin, (const float*)(inputPadPtr), (const float*)weight_dz, dst_width, strideX * unit, kernel_width, kernel_height, dilateX_step, dilateY_step, dst_height, src_y_step * strideY, dst_y_step, (const float*)bias_z, postData.data() + 2);
}
};
mNumber = numberThread;
backend()->onReleaseBuffer(mInputPad.get(), Backend::DYNAMIC);
return NO_ERROR;
}
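The rewritten onResize above replaces the old centre/border split with a per-thread padded input plane: each thread zero-fills its slice of mInputPad, copies every source row into the padded rectangle, then runs the line kernel (or the fast kernel when stride and dilation are 1) over the full output with no border checks. A hedged sketch of the buffer sizing, assuming the pads tuple is ordered (left, top, right, bottom) as the std::get indices suggest:
// Sketch only; names follow the diff above.
int paddedWidth  = padLeft + padRight  + src_width;    // std::get<0>(pads) + std::get<2>(pads) + src_width
int paddedHeight = padTop  + padBottom + src_height;   // std::get<1>(pads) + std::get<3>(pads) + src_height
// mInputPad is created as {numberThread, paddedWidth * paddedHeight * unit} and released after resize.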
@ -281,11 +267,6 @@ public:
if (inputs.empty()) {
return new CPUConvolutionDepthwise::FloatExecution(conv2d->common(), backend, originWeight, originWeightSize, originBias, originBiasSize);
}
auto core = static_cast<CPUBackend*>(backend)->functions();
if (conv->dilateX() == 1 && conv->dilateY() == 1 && conv->strideX() == 1 && conv->strideY() == 1 &&
conv->kernelX() == 3 && conv->kernelY() == 3 && outputs[0]->width() >= 2 && outputs[0]->height() >= 2 && core->MNNMultiAndDestTransformCommon23 != nullptr) {
return new ConvolutionDepthwise3x3(conv, backend, originWeight, originWeightSize, originBias, originBiasSize);
}
return new CPUConvolutionDepthwise::FloatExecution(conv2d->common(), backend, originWeight, originWeightSize, originBias, originBiasSize);
}
};

View File

@ -26,7 +26,12 @@ public:
private:
std::function<void(const uint8_t *, uint8_t *, int)> mExecutor;
std::function<void(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep)> mFastKernel;
int mNumber = 1;
std::shared_ptr<Tensor> mInputPad;
bool mFastKernelApply = false;
};
class MultiInputFloatExecution : public BasicFloatExecution {
public:

View File

@ -142,7 +142,7 @@ ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector<Tensor*>& inputs, con
int size_ = mMutableResource.mBiasInt32->length(0);
if (core->ConvDepthwise3x3LineInt8_ARM82) {
if (kernel_width == 3 && kernel_height == 3 && strideX == 1 && strideY == 1 && dilateX == 1 && dilateY == 1 && gcore->MNNMultiAndDestTransformCommon23 != nullptr && dst_width >= 2 && dst_height >= 2) {
if (kernel_width == 3 && kernel_height == 3 && strideX == 1 && strideY == 1 && dilateX == 1 && dilateY == 1 && dst_width >= 2 && dst_height >= 2) {
mUse3x3Kernel = true;
mThreadFunction = core->ConvDepthwise3x3LineInt8_ARM82;
UNIT = 4;
@ -247,7 +247,7 @@ public:
if (core->ConvDepthwise3x3LineInt8_ARM82) {
if (common->kernelX() == 3 && common->kernelY() == 3 && common->strideX() == 1 && common->strideY() == 1 && common->dilateX() == 1
&& common->dilateY() == 1 && gcore->MNNMultiAndDestTransformCommon23 != nullptr && outputs[0]->width() >= 2 && outputs[0]->height() >= 2) {
&& common->dilateY() == 1 && outputs[0]->width() >= 2 && outputs[0]->height() >= 2) {
use3x3kernel = true;
UNIT = 4;
}

View File

@ -98,8 +98,8 @@ ErrorCode CPUGridSample::onExecute(const std::vector<Tensor *> &inputs, const st
auto outW = outputTensor->buffer().dim[4].extent;
auto threadCount = static_cast<CPUBackend*>(backend())->threadNumber();
auto tileCount = outD;
auto inOffset = batches * inH * inW * core->pack;
auto outOffset = batches * outH * outW * core->pack;
auto inOffset = batches * inD * inH * inW * core->pack;
auto outOffset = batches * outD * outH * outW * core->pack;
auto cordPtr = mTempCordBuffer->host<uint8_t>();
for (auto b = 0; b < batches; ++b) {
auto _inputPtr = inputPtr + b * inD * inH * inW * core->pack * core->bytes;
@ -109,10 +109,9 @@ ErrorCode CPUGridSample::onExecute(const std::vector<Tensor *> &inputs, const st
// Compute cord
MNN_CONCURRENCY_BEGIN(tId, threadCount) {
for (int index=tId; index < tileCount; index += threadCount) {
auto c = index / outD;
auto d = index % outD;
auto inputC = _inputPtr + c * inD * inW * inH * batches * core->pack * core->bytes;
auto outputC = _outputPtr + c * outD * outW * outH * batches * core->pack * core->bytes;
auto d = index;
auto inputC = _inputPtr;
auto outputC = _outputPtr;
auto cordD = cordPtr + d * outH * outW * 3 * core->bytes;
auto outputD = outputC + d * outH * outW * core->pack * core->bytes;
for (int h = 0; h < outH; h++) {

View File

@ -1373,6 +1373,9 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
}
group.ids = _readNumber((const char*)buffer.get(), buffer.size());
}
if (group.ids.empty()) {
continue;
}
std::string minfreq = policyName + "/cpuinfo_min_freq";
{
MNN::AutoStorage<uint8_t> buffer;
@ -1439,6 +1442,11 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) {
_getInfoApple(cpuinfo_isa);
#endif
#if defined(__aarch64__) && defined(_WIN32)
cpuinfo_isa->fp16arith = true;
cpuinfo_isa->dot = true;
#endif
MNN_PRINT("The device supports: i8sdot:%d, fp16:%d, i8mm: %d, sve2: %d\n", cpuinfo_isa->dot, cpuinfo_isa->fp16arith, cpuinfo_isa->i8mm, cpuinfo_isa->sve2);
return;
}

View File

@ -138,7 +138,7 @@ static int MNNGridSampleComputeOffset3D(int d, int h, int w, int depth, int heig
h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h);
w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w);
}
return ((d * height + h) * width + w) * 4;
return ((d * height + h) * width + w) * PACK;
}
static void MNNGridSampleInterp3D(FLOAT* outputPtr, const FLOAT* inputPtr, const FLOAT* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
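The two grid-sample fixes above make the 3D offsets honour the backend pack size instead of a hard-coded 4 and fold the depth dimension into the per-batch strides. A hedged scalar view of the corrected packed offset:
// PACK is the backend pack size (4 for FP32 NEON, 8 for FP16); clamp/padding branches omitted.
static inline int gridSampleOffset3D(int d, int h, int w, int height, int width, int PACK) {
    return ((d * height + h) * width + w) * PACK;   // element offset of one packed channel block
}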

View File

@ -30,7 +30,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?")
if (MNN_SUPPORT_BF16)
target_compile_options(MNNARM32 PRIVATE -DMNN_SUPPORT_BF16)
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64")
message(STATUS "Enabling AArch64 Assemblies")
add_library(MNNARM64 OBJECT ${MNN_AArch64_SRC} ${MNN_NEON_SRC})
target_include_directories(MNNARM64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/)
@ -42,11 +42,6 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64")
target_compile_options(MNNARM64 PRIVATE -DMNN_SUPPORT_BF16)
endif()
if(MNN_ARM82)
message(STATUS "Enable INT8 SDOT")
target_compile_options(MNNARM64 PRIVATE -DENABLE_ARMV82)
endif()
else()
# Building fat binary requires multiple separate builds and lipo-by-hand under CMake's design
endif()

View File

@ -34,9 +34,6 @@ void NEON_MNNPackedMatMul_BF16(float* C, const float* A, const float* B, const s
const float* postParameters, const float* bias, const float* k, const float* b);
void NEON_MNNPackedMatMulRemain_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter,
const float* postParameters, const float* bias, const float* k, const float* b);
void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
void NEON_MNNConvRunForLineDepthwise_BF16(float* dst, const float* src, const float* weight, size_t width,
size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step,
size_t height, size_t srcHStep, size_t dstHStep);

View File

@ -34,8 +34,16 @@ ldr r8, [sp, #48]
ldr lr, [sp, #52]
ldr r10, [sp, #56]
ldr r11, [sp, #60]
ldr r12, [sp, #64] // bias
vld1.32 {q0}, [r12] // bias
ldr r12, [sp, #68] // min,max
vld1.32 {d2[0]}, [r12]!
vld1.32 {d2[1]}, [r12]
vpush {q4-q7}
vmov.f32 q5, q0 // bias
vdup.f32 q4, d2[0] // min
vdup.f32 q6, d2[1] // max
mov r12, #4
mul r4, r12, r4
@ -59,14 +67,14 @@ mov r12, #8
mul r12, r4, r12
L8Loop:
vmov.i32 q8, #0
vmov.i32 q9, #0
vmov.i32 q10, #0
vmov.i32 q11, #0
vmov.i32 q12, #0
vmov.i32 q13, #0
vmov.i32 q14, #0
vmov.i32 q15, #0
vmov.f32 q8, q5 // use bias to init
vmov.f32 q9, q5
vmov.f32 q10, q5
vmov.f32 q11, q5
vmov.f32 q12, q5
vmov.f32 q13, q5
vmov.f32 q14, q5
vmov.f32 q15, q5
vmov.i32 d14[0], r1
vmov.i32 d14[1], r2
@ -103,6 +111,22 @@ L8Loop:
bne L8LoopH
sub r3, r3, #8
vmax.f32 q8, q8, q4
vmax.f32 q9, q9, q4
vmax.f32 q10, q10, q4
vmax.f32 q11, q11, q4
vmax.f32 q12, q12, q4
vmax.f32 q13, q13, q4
vmax.f32 q14, q14, q4
vmax.f32 q15, q15, q4
vmin.f32 q8, q8, q6
vmin.f32 q9, q9, q6
vmin.f32 q10, q10, q6
vmin.f32 q11, q11, q6
vmin.f32 q12, q12, q6
vmin.f32 q13, q13, q6
vmin.f32 q14, q14, q6
vmin.f32 q15, q15, q6
vst1.32 {q8, q9}, [r0]!
vmov.i32 r1, d14[0]
vmov.i32 r2, d14[1]
@ -121,13 +145,13 @@ mov r12, #4
mul r12, r4, r12
L4Loop:
vmov.i32 q8, #0
vmov.i32 q9, #0
vmov.i32 q10, #0
vmov.i32 q11, #0
vmov.f32 q8, q5
vmov.f32 q9, q5
vmov.f32 q10, q5
vmov.f32 q11, q5
vmov.i32 d8[0], r1
vmov.i32 d9[0], r2
vmov.i32 d14[0], r1
vmov.i32 d14[1], r2
mov lr, r6
L4LoopH:
mov r10, r5
@ -151,10 +175,18 @@ L4Loop:
add r1, r1, r8
bne L4LoopH
vmax.f32 q8, q8, q4
vmax.f32 q9, q9, q4
vmax.f32 q10, q10, q4
vmax.f32 q11, q11, q4
vmin.f32 q8, q8, q6
vmin.f32 q9, q9, q6
vmin.f32 q10, q10, q6
vmin.f32 q11, q11, q6
sub r3, r3, #4
vst1.32 {q8, q9}, [r0]!
vmov.i32 r1, d8[0]
vmov.i32 r2, d9[0]
vmov.i32 r1, d14[0]
vmov.i32 r2, d14[1]
vst1.32 {q10, q11}, [r0]!
add r1, r1, r12
cmp r3, #4
@ -168,7 +200,7 @@ cmp r3, #0
beq End
L1Loop:
vmov.i32 q0, #0
vmov.f32 q0, q5
mov lr, r6
mov r11, r1
mov r12, r2
@ -184,6 +216,8 @@ L1Loop:
add r1, r1, r8
bne L1LoopH
vmax.f32 q0, q0, q4
vmin.f32 q0, q0, q6
subs r3, r3, #1
vst1.32 {q0}, [r0]!
mov r2, r12
@ -203,6 +237,5 @@ bne LoopDY
vpop {q4-q7}
pop {r4-r8, r10, r11, pc}
#endif
#endif

View File

@ -1,74 +0,0 @@
//
// MNNConvRunForUnitDepthWise.S
// MNN
//
// Created by MNN on 2019/02/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNConvRunForUnitDepthWise
//void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
//Auto: r0:dst, r1:src, r2:weight, r3:fw
push {r4-r8, lr}
//Load from sp:
//r5:fh, r6:weight_y_step, r7:dilate_x_step, r8:dilate_y_step
mov r4, r3
ldr r5, [sp, #24]
ldr r6, [sp, #28]
ldr r7, [sp, #32]
ldr r8, [sp, #36]
cmp r4, #0
vmov.i32 q0, #0
beq UnitEnd
cmp r5, #0
beq UnitEnd
mov lr, #4
mul r6, lr, r6
mul r7, lr, r7
mul r8, lr, r8
//dilate_y_step -> dilate_y_step - dilate_x_step*fw
mul lr, r4, r7
sub r8, r8, lr
//weight_y_step -> weight_y_step - 4*sizeof(float)*fw
mov lr, #16
mul lr, r4, lr
sub r6, r6, lr
UnitLoopH:
mov lr, r4
UnitLoopW:
vld1.32 {q1}, [r1], r7
vld1.32 {q2}, [r2]!
vmla.f32 q0, q1, q2
subs lr, lr, #1
bne UnitLoopW
subs r5, r5, #1
add r1, r1, r8
add r2, r2, r6
bne UnitLoopH
UnitEnd:
vst1.32 {q0}, [r0]
pop {r4-r8, pc}
#endif
#endif

View File

@ -0,0 +1,221 @@
//
// MNNDepthwiseConvFastKernel.S
// MNN
//
// Created by MNN on 2019/02/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __arm__
#ifndef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNDepthwiseConvFastKernel
//void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
//                                size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep,
//                                const float* bias, const float* parameters)
//Auto Load:
//r0:dst, r1:src, r2:weight, r3:width
push {r4-r8, r10, r11, lr}
//Load From Sp
//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, lr: height, r10:srcHStep, r11:dstHStep
ldr r4, [sp, #32]
ldr r5, [sp, #36]
ldr r6, [sp, #40]
ldr r7, [sp, #44]
ldr r8, [sp, #48]
ldr lr, [sp, #52]
ldr r10, [sp, #56]
ldr r11, [sp, #60]
ldr r12, [sp, #64] // bias
vld1.32 {q0}, [r12] // bias
ldr r12, [sp, #68] // min,max
vld1.32 {d2[0]}, [r12]!
vld1.32 {d2[1]}, [r12]
vpush {q4-q7}
vmov.f32 q5, q0 // bias
vdup.f32 q4, d2[0] // min
vdup.f32 q6, d2[1] // max
mov r12, #4
mul r4, r12, r4
mul r7, r12, r7
mul r8, r12, r8
mul r10, r12, r10
mul r11, r12, r11
//dilate_y_step -> dilate_y_step - fw*dilate_x_step
mul r12, r5, r7
sub r8, r8, r12
LoopDY:
push {r0, r1, r3, r10, r11, lr}
L8:
cmp r3, #7
ble L4
L8Loop:
vmov.f32 q8, q5 // use bias to init
vmov.f32 q9, q5
vmov.f32 q10, q5
vmov.f32 q11, q5
vmov.f32 q12, q5
vmov.f32 q13, q5
vmov.f32 q14, q5
vmov.f32 q15, q5
mov r12, r1
mov r4, r2
mov lr, r6
L8LoopH:
mov r10, r5
L8LoopW:
vld1.32 {q7}, [r2]!
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!
subs r10, r10, #1
vmla.f32 q8, q0, q7
vmla.f32 q9, q1, q7
vmla.f32 q10, q2, q7
vmla.f32 q11, q3, q7
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]
vmla.f32 q12, q0, q7
vmla.f32 q13, q1, q7
vmla.f32 q14, q2, q7
vmla.f32 q15, q3, q7
sub r1, r1, #80
bne L8LoopW
L8LoopWEnd:
subs lr, lr, #1
add r1, r1, r8
bne L8LoopH
sub r3, r3, #8
vmax.f32 q8, q8, q4
vmax.f32 q9, q9, q4
vmax.f32 q10, q10, q4
vmax.f32 q11, q11, q4
vmax.f32 q12, q12, q4
vmax.f32 q13, q13, q4
vmax.f32 q14, q14, q4
vmax.f32 q15, q15, q4
vmin.f32 q8, q8, q6
vmin.f32 q9, q9, q6
vmin.f32 q10, q10, q6
vmin.f32 q11, q11, q6
vmin.f32 q12, q12, q6
vmin.f32 q13, q13, q6
vmin.f32 q14, q14, q6
vmin.f32 q15, q15, q6
vst1.32 {q8, q9}, [r0]!
mov r1, r12
mov r2, r4
vst1.32 {q10, q11}, [r0]!
vst1.32 {q12, q13}, [r0]!
vst1.32 {q14, q15}, [r0]!
add r1, r1, #128
cmp r3, #8
bge L8Loop
L4:
cmp r3, #3
ble L1
L4Loop:
vmov.f32 q8, q5
vmov.f32 q9, q5
vmov.f32 q10, q5
vmov.f32 q11, q5
mov r12, r1
mov r4, r2
mov lr, r6
L4LoopH:
mov r10, r5
L4LoopW:
vld1.32 {q12}, [r2]!
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]
sub r1, r1, #16
subs r10, r10, #1
vmla.f32 q8, q12, q0
vmla.f32 q9, q12, q1
vmla.f32 q10, q12, q2
vmla.f32 q11, q12, q3
bne L4LoopW
subs lr, lr, #1
add r1, r1, r8
bne L4LoopH
vmax.f32 q8, q8, q4
vmax.f32 q9, q9, q4
vmax.f32 q10, q10, q4
vmax.f32 q11, q11, q4
vmin.f32 q8, q8, q6
vmin.f32 q9, q9, q6
vmin.f32 q10, q10, q6
vmin.f32 q11, q11, q6
sub r3, r3, #4
vst1.32 {q8, q9}, [r0]!
mov r1, r12
mov r2, r4
vst1.32 {q10, q11}, [r0]!
add r1, r1, #64
cmp r3, #4
bge L4Loop
L1:
cmp r3, #0
beq End
L1Loop:
vmov.f32 q0, q5
mov lr, r6
mov r11, r1
mov r12, r2
L1LoopH:
mov r10, r5
L1LoopW:
vld1.32 {q1}, [r1]!
vld1.32 {q2}, [r2]!
vmla.f32 q0, q1, q2
subs r10, r10, #1
bne L1LoopW
subs lr, lr, #1
add r1, r1, r8
bne L1LoopH
vmax.f32 q0, q0, q4
vmin.f32 q0, q0, q6
subs r3, r3, #1
vst1.32 {q0}, [r0]!
mov r2, r12
add r1, r11, #16
bne L1Loop
End:
pop {r0, r1, r3, r10, r11, lr}
add r0, r0, r11
subs lr, lr, #1
add r1, r1, r10
bne LoopDY
vpop {q4-q7}
pop {r4-r8, r10, r11, pc}
#endif
#endif

View File

@ -65,9 +65,7 @@ ldr r12, [r6, #8] // int8 max
str r12, [sp, #16]
ldr r12, [r6, #12] // int8 min
str r12, [sp, #20]
ldr r12, [r6, #40] // blockNum
mul r12, r12, r3 // src_depth_quad=src_depth_quad*blockNum
lsl r12, r12, #6 // weight_stride = src_depth_quad*LP*HP
lsl r12, r3, #6 // weight_stride = src_depth_quad*LP*HP
str r12, [sp, #24]
ldr r12, [r6, #48] // extraScale
str r12, [sp, #28]

View File

@ -65,9 +65,7 @@ ldr r12, [r6, #32] // weightBias
str r12, [sp, #8]
ldr r12, [r6, #36] // f32minmax
str r12, [sp, #12]
ldr r12, [r6, #40] // blockNum
mul r12, r12, r3 // src_depth_quad=src_depth_quad*blockNum
lsl r12, r12, #5 // weight_stride = src_depth_quad*LP*HP
lsl r12, r3, #5 // weight_stride = src_depth_quad*LP*HP
str r12, [sp, #16]
ldr r12, [r6, #48] // extraScale
str r12, [sp, #20]
@ -82,12 +80,14 @@ L2LoopDz:
subs r12, r3, #1
// first four output
vld1.8 {q2}, [r1]!
vld1.8 {q4}, [r2]! // weight, d8,d9,d10,d11
vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
// int4->int8
vmov.i8 q5, #15
vand.i8 q5, q5, q4
vmov.i8 q6, #15
vmov.i8 q7, #15
vand.i8 q6, q6, q4
vand.i8 q7, q7, q5
vshr.u8 q4, q4, #4
vzip.8 q4, q5
vshr.u8 q5, q5, #4
vmull.s8 q0, d4, d8
vmull.s8 q1, d4, d10
@ -95,12 +95,6 @@ L2LoopDz:
vmlal.s8 q1, d5, d11
vpaddl.s16 q8, q0
vpaddl.s16 q9, q1
vld1.8 {q6}, [r2]! // weight,d12,d13,d14,d15
// int4->int8
vmov.i8 q7, #15
vand.i8 q7, q7, q6
vshr.u8 q6, q6, #4
vzip.8 q6, q7
vmull.s8 q0, d4, d12
vmull.s8 q1, d4, d14
@ -129,22 +123,18 @@ L2LoopDz:
L2LoopSz:
// first four output
vld1.8 {q2}, [r1]!
vld1.8 {q4}, [r2]!
vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
// int4->int8
vmov.i8 q5, #15
vand.i8 q5, q5, q4
vmov.i8 q6, #15
vmov.i8 q7, #15
vand.i8 q6, q6, q4
vand.i8 q7, q7, q5
vshr.u8 q4, q4, #4
vzip.8 q4, q5
vshr.u8 q5, q5, #4
vmull.s8 q0, d4, d8
vmull.s8 q1, d4, d10
vmlal.s8 q0, d5, d9
vmlal.s8 q1, d5, d11
vld1.8 {q6}, [r2]!
// int4->int8
vmov.i8 q7, #15
vand.i8 q7, q7, q6
vshr.u8 q6, q6, #4
vzip.8 q6, q7
vpadal.s16 q8, q0
vpadal.s16 q9, q1
@ -269,12 +259,14 @@ L1LoopDz:
subs r12, r3, #1
// first four output
vld1.8 {q2}, [r1]!
vld1.8 {q4}, [r2]!
vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
// int4->int8
vmov.i8 q5, #15
vand.i8 q5, q5, q4
vmov.i8 q6, #15
vmov.i8 q7, #15
vand.i8 q6, q6, q4
vand.i8 q7, q7, q5
vshr.u8 q4, q4, #4
vzip.8 q4, q5
vshr.u8 q5, q5, #4
vmull.s8 q0, d4, d8
vmull.s8 q1, d4, d10
@ -282,12 +274,6 @@ L1LoopDz:
vmlal.s8 q1, d5, d11
vpaddl.s16 q8, q0
vpaddl.s16 q9, q1
vld1.8 {q6}, [r2]!
// int4->int8
vmov.i8 q7, #15
vand.i8 q7, q7, q6
vshr.u8 q6, q6, #4
vzip.8 q6, q7
vmull.s8 q0, d4, d12
vmull.s8 q1, d4, d14
@ -302,22 +288,18 @@ L1LoopDz:
L1LoopSz:
// first four output
vld1.8 {q2}, [r1]!
vld1.8 {q4}, [r2]!
vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11
// int4->int8
vmov.i8 q5, #15
vand.i8 q5, q5, q4
vmov.i8 q6, #15
vmov.i8 q7, #15
vand.i8 q6, q6, q4
vand.i8 q7, q7, q5
vshr.u8 q4, q4, #4
vzip.8 q4, q5
vshr.u8 q5, q5, #4
vmull.s8 q0, d4, d8
vmull.s8 q1, d4, d10
vmlal.s8 q0, d5, d9
vmlal.s8 q1, d5, d11
vld1.8 {q6}, [r2]!
// int4->int8
vmov.i8 q7, #15
vand.i8 q7, q7, q6
vshr.u8 q6, q6, #4
vzip.8 q6, q7
vpadal.s16 q8, q0
vpadal.s16 q9, q1
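The int4 weight decode above now loads two q registers of packed weights at once and splits nibbles with a mask and a shift instead of the earlier single-register load plus vzip interleave. A hedged scalar equivalent of the nibble split; any zero-point or weight-bias correction is applied elsewhere in the kernel:
// Each packed byte carries two 4-bit weights.
#include <cstdint>
static inline void unpackInt4(uint8_t packed, uint8_t* lo, uint8_t* hi) {
    *lo = packed & 0x0F;   // vand.i8 with the #15 mask
    *hi = packed >> 4;     // vshr.u8 by #4
}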

View File

@ -26,6 +26,12 @@ ldr x8, [sp, #0]
ldr x15, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]
ldr x13, [sp, #40]
stp d8, d9, [sp, #(-16 * 3)]!
stp d10, d11, [sp, #(16 * 2)]
stp x19, x20, [sp, #(16 * 1)]
mov x9, #4
mul x4, x9, x4
@ -34,10 +40,32 @@ mul x8, x9, x8
mul x10, x9, x10
mul x11, x9, x11
ld1 {v8.4s}, [x12] // bias
ld1r {v10.4s}, [x13], #4 // min
ld1r {v11.4s}, [x13]
//dilate_y_step -> dilate_y_step - fw*dilate_x_step
mul x9, x5, x7
sub x8, x8, x9
.macro assign_bias x0, x1, x2, x3
mov \x0\().16b, v8.16b
mov \x1\().16b, v8.16b
mov \x2\().16b, v8.16b
mov \x3\().16b, v8.16b
.endm
.macro compare_min_max x0, x1, x2, x3, xmin, xmax
fmax \x0\().4s, \x0\().4s, \xmin\().4s
fmax \x1\().4s, \x1\().4s, \xmin\().4s
fmax \x2\().4s, \x2\().4s, \xmin\().4s
fmax \x3\().4s, \x3\().4s, \xmin\().4s
fmin \x0\().4s, \x0\().4s, \xmax\().4s
fmin \x1\().4s, \x1\().4s, \xmax\().4s
fmin \x2\().4s, \x2\().4s, \xmax\().4s
fmin \x3\().4s, \x3\().4s, \xmax\().4s
.endm
LoopDY:
mov v4.d[0], x10
mov v4.d[1], x11
@ -53,22 +81,10 @@ mov x12, #16
mul x12, x4, x12
L16Loop:
movi v16.4s, #0
movi v17.4s, #0
movi v18.4s, #0
movi v19.4s, #0
movi v20.4s, #0
movi v21.4s, #0
movi v22.4s, #0
movi v23.4s, #0
movi v24.4s, #0
movi v25.4s, #0
movi v26.4s, #0
movi v27.4s, #0
movi v28.4s, #0
movi v29.4s, #0
movi v30.4s, #0
movi v31.4s, #0
assign_bias v16, v17, v18, v19
assign_bias v20, v21, v22, v23
assign_bias v24, v25, v26, v27
assign_bias v28, v29, v30, v31
mov x13, x1
mov x14, x2
@ -120,6 +136,10 @@ L16Loop:
bne L16LoopH
sub x3, x3, #16
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
compare_min_max v24, v25, v26, v27, v10, v11
compare_min_max v28, v29, v30, v31, v10, v11
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
add x1, x13, x12
cmp x3, #16
@ -138,14 +158,8 @@ mov x12, #8
mul x12, x4, x12
L8Loop:
movi v16.4s, #0
movi v17.4s, #0
movi v18.4s, #0
movi v19.4s, #0
movi v20.4s, #0
movi v21.4s, #0
movi v22.4s, #0
movi v23.4s, #0
assign_bias v16, v17, v18, v19
assign_bias v20, v21, v22, v23
mov x13, x1
mov x14, x2
@ -180,6 +194,8 @@ L8Loop:
add x1, x1, x8
bne L8LoopH
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
sub x3, x3, #8
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
add x1, x13, x12
@ -195,10 +211,7 @@ mov x12, #4
mul x12, x4, x12
L4Loop:
movi v16.4s, #0
movi v17.4s, #0
movi v18.4s, #0
movi v19.4s, #0
assign_bias v16, v17, v18, v19
mov x13, x1
mov x14, x2
@ -225,6 +238,7 @@ L4Loop:
add x1, x1, x8
bne L4LoopH
compare_min_max v16, v17, v18, v19, v10, v11
sub x3, x3, #4
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
add x1, x13, x12
@ -235,7 +249,7 @@ cmp x3, #0
beq End
L1Loop:
movi v0.4s, #0
mov v0.16b, v8.16b
mov x9, x6
mov x11, x1
mov x12, x2
@ -252,6 +266,8 @@ L1Loop:
bne L1LoopH
subs x3, x3, #1
fmax v0.4s, v0.4s, v10.4s
fmin v0.4s, v0.4s, v11.4s
st1 {v0.4s}, [x0], #16
mov x2, x12
add x1, x11, x4
@ -271,7 +287,9 @@ add x0, x0, x11
add x1, x1, x10
bne LoopDY
ldp x19, x20, [sp, #(16 * 1)]
ldp d10, d11, [sp, #(16 * 2)]
ldp d8, d9, [sp], #(16 * 3)
ret
//MNNConvRunForLineDepthwise End

View File

@ -1,63 +0,0 @@
//
// MNNConvRunForUnitDepthWise.S
// MNN
//
// Created by MNN on 2019/02/04.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNConvRunForUnitDepthWise
//void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step)
//Auto: x0:dst, x1:src, x2:weight, x3:fw
//x4:fh, x5:weight_y_step, x6:dilate_x_step, x7:dilate_y_step
cmp x3, #0
movi v0.4s, #0
beq UnitEnd
cmp x4, #0
beq UnitEnd
mov x9, #4
mul x5, x9, x5
mul x6, x9, x6
mul x7, x9, x7
//dilate_y_step -> dilate_y_step - dilate_x_step*fw
mul x9, x3, x6
sub x7, x7, x9
//weight_y_step -> weight_y_step - 4*sizeof(float)*fw
mov x9, #16
mul x9, x3, x9
sub x5, x5, x9
UnitLoopH:
mov x9, x3
UnitLoopW:
ld1 {v1.4s}, [x1], x6
ld1 {v2.4s}, [x2], #16
fmla v0.4s, v1.4s, v2.4s
subs x9, x9, #1
bne UnitLoopW
subs x4, x4, #1
add x1, x1, x7
add x2, x2, x5
bne UnitLoopH
UnitEnd:
st1 {v0.4s}, [x0]
ret
#endif

View File

@ -0,0 +1,292 @@
//
// MNNDepthwiseConvFastKernel.S
// MNN
//
// Created by MNN on 2024/09/18.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifdef __aarch64__
#include "MNNAsmGlobal.h"
.text
.align 5
asm_function MNNDepthwiseConvFastKernel
// void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
//                                 size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
//                                 size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
//Auto Load:
//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_step=pack*1, x5:fw, x6:fh, x7:dilate_x_step
//Load From sp:
//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13: minmax
ldr x8, [sp, #0]
ldr x15, [sp, #8]
ldr x10, [sp, #16]
ldr x11, [sp, #24]
ldr x12, [sp, #32]
ldr x13, [sp, #40]
stp d14, d15, [sp, #(-16 * 9)]!
stp d12, d13, [sp, #(16 * 1)]
stp d10, d11, [sp, #(16 * 2)]
stp d8, d9, [sp, #(16 * 3)]
stp x21, x22, [sp, #(16 * 4)]
stp x19, x20, [sp, #(16 * 5)]
stp x27, x28, [sp, #(16 * 6)]
stp x25, x26, [sp, #(16 * 7)]
stp x23, x24, [sp, #(16 * 8)]
lsl x4, x4, #2 // src_w_step*sizeof(float)
lsl x7, x7, #2 // dilate_x_step*sizeof(float)
lsl x8, x8, #2 // dilate_y_step*sizeof(float)
lsl x23, x10, #2 // srcHStep*sizeof(float)
lsl x24, x11, #2 // dstHStep*sizeof(float)
mov x20, x12 // bias
mov x26, x13 // min
add x27, x13, #4 // max
//dilate_y_step -> dilate_y_step - fw*dilate_x_step
mul x9, x5, x7
sub x8, x8, x9
mov x25, x3 // width
.macro assign_bias x0, x1, x2, x3, bv
mov \x0\().16b, \bv\().16b
mov \x1\().16b, \bv\().16b
mov \x2\().16b, \bv\().16b
mov \x3\().16b, \bv\().16b
.endm
.macro compare_min_max x0, x1, x2, x3, xmin, xmax
fmax \x0\().4s, \x0\().4s, \xmin\().4s
fmax \x1\().4s, \x1\().4s, \xmin\().4s
fmax \x2\().4s, \x2\().4s, \xmin\().4s
fmax \x3\().4s, \x3\().4s, \xmin\().4s
fmin \x0\().4s, \x0\().4s, \xmax\().4s
fmin \x1\().4s, \x1\().4s, \xmax\().4s
fmin \x2\().4s, \x2\().4s, \xmax\().4s
fmin \x3\().4s, \x3\().4s, \xmax\().4s
.endm
LoopDY:
//mov x23, x10
//mov x24, x11
mov x21, x0
mov x22, x1
L16:
cmp x3, #16
blt L8
mov x12, #-176
mov x19, #256
L16Loop:
ld1 {v8.4s}, [x20] // load bias
assign_bias v16, v17, v18, v19, v8
assign_bias v20, v21, v22, v23, v8
assign_bias v24, v25, v26, v27, v8
assign_bias v28, v29, v30, v31, v8
mov x13, x1
mov x14, x2
mov x9, x6
L16LoopH:
mov x10, x5
L16LoopW:
ld1 {v8.4s}, [x2], #16
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64
ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [x1], #64
subs x10, x10, #1
fmla v16.4s, v8.4s, v0.4s
fmla v17.4s, v8.4s, v1.4s
fmla v18.4s, v8.4s, v2.4s
fmla v19.4s, v8.4s, v3.4s
fmla v20.4s, v8.4s, v4.4s
fmla v21.4s, v8.4s, v5.4s
fmla v22.4s, v8.4s, v6.4s
fmla v23.4s, v8.4s, v7.4s
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], x12
fmla v24.4s, v8.4s, v9.4s
fmla v25.4s, v8.4s, v10.4s
fmla v26.4s, v8.4s, v11.4s
fmla v27.4s, v8.4s, v12.4s
fmla v28.4s, v8.4s, v0.4s
fmla v29.4s, v8.4s, v1.4s
fmla v30.4s, v8.4s, v2.4s
fmla v31.4s, v8.4s, v3.4s
bne L16LoopW
subs x9, x9, #1
add x1, x1, x8
bne L16LoopH
ld1r {v10.4s}, [x26] // min
ld1r {v11.4s}, [x27] // max
sub x3, x3, #16
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
compare_min_max v24, v25, v26, v27, v10, v11
compare_min_max v28, v29, v30, v31, v10, v11
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
add x1, x13, x19 // 16 * pack * sizeof(float)
cmp x3, #16
mov x2, x14
st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64
st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64
bge L16Loop
L8:
ld1r {v10.4s}, [x26] // min
ld1r {v11.4s}, [x27] // max
ld1 {v24.4s}, [x20] // load bias
cmp x3, #7
ble L4
mov x12, #-48
mov x19, #128
L8Loop:
assign_bias v16, v17, v18, v19, v24
assign_bias v20, v21, v22, v23, v24
mov x13, x1
mov x14, x2
mov x9, x6
L8LoopH:
mov x10, x5
L8LoopW:
ld1 {v8.4s}, [x2], #16
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], x12
subs x10, x10, #1
fmla v16.4s, v8.4s, v0.4s
fmla v17.4s, v8.4s, v1.4s
fmla v18.4s, v8.4s, v2.4s
fmla v19.4s, v8.4s, v3.4s
fmla v20.4s, v8.4s, v4.4s
fmla v21.4s, v8.4s, v5.4s
fmla v22.4s, v8.4s, v6.4s
fmla v23.4s, v8.4s, v7.4s
bne L8LoopW
subs x9, x9, #1
add x1, x1, x8
bne L8LoopH
compare_min_max v16, v17, v18, v19, v10, v11
compare_min_max v20, v21, v22, v23, v10, v11
sub x3, x3, #8
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
add x1, x13, x19 // 8 * pack * sizeof(float)
mov x2, x14
st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
L4:
cmp x3, #4
ble L1
mov x12, #16
mov x19, #64
L4Loop:
assign_bias v16, v17, v18, v19, v24
mov x13, x1
mov x14, x2
mov x9, x6
L4LoopH:
mov x10, x5
L4LoopW:
ld1 {v8.4s}, [x2], #16
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], x12
subs x10, x10, #1
fmla v16.4s, v8.4s, v0.4s
fmla v17.4s, v8.4s, v1.4s
fmla v18.4s, v8.4s, v2.4s
fmla v19.4s, v8.4s, v3.4s
bne L4LoopW
subs x9, x9, #1
add x1, x1, x8
bne L4LoopH
compare_min_max v16, v17, v18, v19, v10, v11
sub x3, x3, #4
st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64
add x1, x13, x19
mov x2, x14
L1:
cmp x3, #0
beq End
mov x19, #16
L1Loop:
ld1 {v16.4s}, [x20] // assign bias
mov x13, x1
mov x14, x2
mov x9, x6
L1LoopH:
mov x10, x5
L1LoopW:
ld1 {v8.4s}, [x2], #16
ld1 {v0.4s}, [x1], #16
subs x10, x10, #1
fmla v16.4s, v8.4s, v0.4s
bne L1LoopW
subs x9, x9, #1
add x1, x1, x8
bne L1LoopH
subs x3, x3, #1
fmax v16.4s, v16.4s, v10.4s
fmin v16.4s, v16.4s, v11.4s
st1 {v16.4s}, [x0], #16
add x1, x13, x4
mov x2, x14
bne L1Loop
End:
//mov x10, x23
//mov x11, x24
//mov x0, x21
//mov x1, x22
mov x3, x25
subs x15, x15, #1
add x0, x21, x24
add x1, x22, x23
bne LoopDY
ldp x23, x24, [sp, #(16 * 8)]
ldp x25, x26, [sp, #(16 * 7)]
ldp x27, x28, [sp, #(16 * 6)]
ldp x19, x20, [sp, #(16 * 5)]
ldp x21, x22, [sp, #(16 * 4)]
ldp d8, d9, [sp, #(16 * 3)]
ldp d10, d11, [sp, #(16 * 2)]
ldp d12, d13, [sp, #(16 * 1)]
ldp d14, d15, [sp], #(16 * 9)
ret
//MNNDepthwiseConvFastKernel End
#endif
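For reference, the per-output arithmetic of the fast kernel above matches the plain C fallback MNNConvRunForLineDepthwise updated later in this commit; the assembly merely unrolls it 16/8/4/1 outputs at a time and pre-reduces dilate_y_step by fw*dilate_x_step so a single add per kernel row suffices. A minimal scalar sketch of that contract, assuming pack = 4 floats, element-count strides, and parameters[0]/parameters[1] carrying the fused min/max (function and variable names here are illustrative):
void DepthwiseConvFastRef(float* dst, const float* src, const float* weight,
                          size_t width, size_t src_w_step, size_t fw, size_t fh,
                          size_t dilateX_step, size_t dilateY_step, size_t height,
                          size_t srcHStep, size_t dstHStep,
                          const float* bias, const float* parameters) {
    const int pack = 4;                     // NC4HW4: 4 floats per channel group
    const float minF = parameters[0];
    const float maxF = parameters[1];
    for (size_t y = 0; y < height; ++y) {
        const float* srcY = src + y * srcHStep;
        float* dstY = dst + y * dstHStep;
        for (size_t x = 0; x < width; ++x) {
            float acc[4] = {bias[0], bias[1], bias[2], bias[3]}; // accumulators start from bias
            const float* srcX = srcY + x * src_w_step;
            for (size_t ky = 0; ky < fh; ++ky) {
                for (size_t kx = 0; kx < fw; ++kx) {
                    const float* s = srcX + ky * dilateY_step + kx * dilateX_step;
                    const float* w = weight + (ky * fw + kx) * pack;
                    for (int c = 0; c < pack; ++c) {
                        acc[c] += s[c] * w[c];                   // the fmla v16.4s, v8.4s, v0.4s step
                    }
                }
            }
            for (int c = 0; c < pack; ++c) {                     // fused min/max post-op
                float v = acc[c] > maxF ? maxF : acc[c];
                dstY[x * pack + c] = v < minF ? minF : v;
            }
        }
    }
}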

View File

@ -118,8 +118,7 @@ stp x23, x24, [sp, #(16 * 6)]
ldr x19, [x15, #56] // fp32 min max
ldr x21, [x15, #64] // blockNum
ldr x23, [x15, #80] // extraScale
mul x21, x21, x3 // blockNum * src_depth_quad_perblock
lsl x21, x21, #6 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int8_t)
lsl x21, x3, #6 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int8_t)
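// x21 now holds only the per-block weight stride: src_depth_quad * SRC_UNIT * UNIT * sizeof(int8_t)
// (e.g. 16 * 4 * 1 = 64 bytes per src_depth_quad, hence the shift by 6; the int4 kernels below use half of that).
// Stepping across quantization blocks is handled by the caller, which offsets the weight pointer per block.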
add x20, x19, #4
Start:

View File

@ -125,9 +125,7 @@ stp x27, x28, [sp, #(16 * 6)]
stp x25, x26, [sp, #(16 * 7)]
stp x23, x24, [sp, #(16 * 8)]
ldr x27, [x6, #64] // blockNum
mul x27, x27, x3 // blockNum * src_depth_quad_perblock
lsl x15, x27, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT
ldr w28, [x6, #24] // useInt8
ldr x25, [x6, #40] // xKernelSum

View File

@ -138,9 +138,7 @@ ldr w23, [x6, #24]
ldr x27, [x6, #40] // srcKernelSum
ldr x28, [x6, #48] // weightQuanBias
ldr x22, [x6, #64] // blockNum
mul x22, x22, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
lsl x15, x22, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 = src_depth_quad << 6
lsl x15, x3, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 = src_depth_quad << 6
ldr x10, [x6, #80] // extra scale
mov x21, #4 // sizeof(int8_t) * pack

View File

@ -55,8 +55,7 @@ mov x9, x6 // blockNum
cbnz x12, TILE10_BLOCK_NUM
ld1 {v5.4s, v6.4s}, [x2], #32
ld1 {v7.d}[0], [x2]
sub x2, x2, #32
ld1 {v7.d}[0], [x2], #8
TILE10_BLOCK_NUM:
cbz x9, TILE10_END

View File

@ -113,10 +113,8 @@ stp x21, x22, [sp, #(16 * 5)]
stp x23, x24, [sp, #(16 * 6)]
ldr x19, [x15, #56] // fp32 min max
ldr x21, [x15, #64] // blockNum
ldr x23, [x15, #80] // extraScale
mul x21, x21, x3 // blockNum * src_depth_quad_perblock
lsl x21, x21, #5 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int4_t)
lsl x21, x3, #5 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int4_t)
add x20, x19, #4
Start:

View File

@ -124,9 +124,7 @@ stp x27, x28, [sp, #(16 * 6)]
stp x25, x26, [sp, #(16 * 7)]
stp x23, x24, [sp, #(16 * 8)]
ldr x27, [x6, #64] // blockNum
mul x27, x27, x3 // blockNum * src_depth_quad_perblock
lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
lsl x15, x3, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t)
ldr x25, [x6, #40] // xKernelSum
ldr x26, [x6, #48] // weightQuantBias

View File

@ -116,9 +116,7 @@ stp x27, x28, [sp, #(16 * 8)]
ldr x27, [x6, #40] // srcKernelSum
ldr x28, [x6, #48] // weightQuanBias
ldr x22, [x6, #64] // blockNum
mul x22, x22, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block
lsl x15, x22, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 * (sizeof(int4)) = src_depth_quad << 4
lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 * (sizeof(int4)) = src_depth_quad << 5
mov x21, #16 // sizeof(float) * pack
ldr x14, [x6, #56] // float32 maxmin ptr

View File

@ -3028,203 +3028,6 @@ void MNNSigmoidLowp(float* dst, const float* src, size_t dataSize) {
#endif
}
void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameters) {
int unit = ow / 2;
MNN_ASSERT(cacheLineSize >= 1);
auto biasF = Vec4::load(bias);
auto minF = Vec4(parameters[2]);
auto maxF = Vec4(parameters[3]);
for (int x = 0; x < unit; ++x) {
auto offset = 4 * 4 * x;
int i = 0;
Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
Vec4 m3 = Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3);
for (i = 1; i < cacheLineSize; ++i) {
m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
m3 = m3 + Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3);
}
auto o0 = m0 + m1 + m2 + biasF;
auto o1 = m1 - m2 + m3 + biasF;
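// F(2,3) output transform: [o0 o1]^T = A^T * [m0 m1 m2 m3]^T + bias, with A^T = {{1, 1, 1, 0}, {0, 1, -1, 1}}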
o0 = Vec4::min(maxF, o0);
o1 = Vec4::min(maxF, o1);
o0 = Vec4::max(minF, o0);
o1 = Vec4::max(minF, o1);
Vec4::save(dest + 8 * x + 0 * 4, o0);
Vec4::save(dest + 8 * x + 1 * 4, o1);
}
if (unit * 2 < ow) {
auto offset = 4 * 4 * unit;
int i = 0;
Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
for (i = 1; i < cacheLineSize; ++i) {
m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0);
m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1);
m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2);
}
auto o0 = m0 + m1 + m2 + biasF;
o0 = Vec4::min(maxF, o0);
o0 = Vec4::max(minF, o0);
Vec4::save(dest + 8 * unit + 0 * 4, o0);
}
}
extern "C" {
void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit);
}
void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu) {
for (int x = 0; x < su; ++x) {
auto dstX = dest + 4 * 4 * x;
auto sx = x * 2 - (int)pad;
auto ex = sx + 4;
auto clampSx = std::max(sx, 0);
auto clampEx = std::min(ex, (int)iw);
Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
for (int i = clampSx; i < clampEx; ++i) {
v[i - sx] = Vec4::load(source + 4 * i);
}
auto m0 = v[0] - v[2];
auto m1 = v[1] + v[2];
auto m2 = v[2] - v[1];
auto m3 = v[3] - v[1];
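// F(2,3) input transform: rows of B^T = {{1, 0, -1, 0}, {0, 1, 1, 0}, {0, -1, 1, 0}, {0, -1, 0, 1}} applied to [v0 v1 v2 v3]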
Vec4::save(dstX + 4 * 0, m0);
Vec4::save(dstX + 4 * 1, m1);
Vec4::save(dstX + 4 * 2, m2);
Vec4::save(dstX + 4 * 3, m3);
}
MNNConvDwF23SourceTransUnit(source + 4 * (su * 2 - pad), dest + 4 * 4 * su, eu - su);
for (int x = eu; x < unit; ++x) {
auto dstX = dest + 4 * 4 * x;
auto sx = x * 2 - (int)pad;
auto ex = sx + 4;
auto clampSx = std::max(sx, 0);
auto clampEx = std::min(ex, (int)iw);
Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f};
for (int i = clampSx; i < clampEx; ++i) {
v[i - sx] = Vec4::load(source + 4 * i);
}
auto m0 = v[0] - v[2];
auto m1 = v[1] + v[2];
auto m2 = v[2] - v[1];
auto m3 = v[3] - v[1];
Vec4::save(dstX + 4 * 0, m0);
Vec4::save(dstX + 4 * 1, m1);
Vec4::save(dstX + 4 * 2, m2);
Vec4::save(dstX + 4 * 3, m3);
}
}
#ifndef MNN_USE_NEON
void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* parameters) {
int unit = ow / 2;
auto w00 = Vec4::load(weigth + 0 * 16 + 4 * 0);
auto w01 = Vec4::load(weigth + 0 * 16 + 4 * 1);
auto w02 = Vec4::load(weigth + 0 * 16 + 4 * 2);
auto w03 = Vec4::load(weigth + 0 * 16 + 4 * 3);
auto w10 = Vec4::load(weigth + 1 * 16 + 4 * 0);
auto w11 = Vec4::load(weigth + 1 * 16 + 4 * 1);
auto w12 = Vec4::load(weigth + 1 * 16 + 4 * 2);
auto w13 = Vec4::load(weigth + 1 * 16 + 4 * 3);
auto w20 = Vec4::load(weigth + 2 * 16 + 4 * 0);
auto w21 = Vec4::load(weigth + 2 * 16 + 4 * 1);
auto w22 = Vec4::load(weigth + 2 * 16 + 4 * 2);
auto w23 = Vec4::load(weigth + 2 * 16 + 4 * 3);
auto biasF = Vec4::load(bias);
auto minF = Vec4(parameters[2]);
auto maxF = Vec4(parameters[3]);
for (int x = 0; x < unit; ++x) {
auto offset = 4 * 4 * x;
int i = 0;
Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0);
Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1);
Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2);
Vec4 m3 = w03 * Vec4::load(cacheLine[0] + offset + 4 * 3);
m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0);
m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1);
m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2);
m3 = m3 + w13 * Vec4::load(cacheLine[1] + offset + 4 * 3);
m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0);
m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1);
m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2);
m3 = m3 + w23 * Vec4::load(cacheLine[2] + offset + 4 * 3);
auto o0 = m0 + m1 + m2 + biasF;
auto o1 = m1 - m2 + m3 + biasF;
o0 = Vec4::min(maxF, o0);
o1 = Vec4::min(maxF, o1);
o0 = Vec4::max(minF, o0);
o1 = Vec4::max(minF, o1);
Vec4::save(dest + 8 * x + 0 * 4, o0);
Vec4::save(dest + 8 * x + 1 * 4, o1);
}
if (unit * 2 < ow) {
auto offset = 4 * 4 * unit;
Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0);
Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1);
Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2);
m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0);
m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1);
m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2);
m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0);
m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1);
m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2);
auto o0 = m0 + m1 + m2 + biasF;
o0 = Vec4::min(maxF, o0);
o0 = Vec4::max(minF, o0);
Vec4::save(dest + 8 * unit + 0 * 4, o0);
}
}
void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit) {
if (unit <= 0) {
return;
}
Vec4 v0 = Vec4::load(source + 4 * 0);
Vec4 v1 = Vec4::load(source + 4 * 1);
Vec4 v2;
Vec4 v3;
source += 8;
for (int x = 0; x < unit; ++x) {
v2 = Vec4::load(source + 0 * 4);
v3 = Vec4::load(source + 1 * 4);
auto m0 = v0 - v2;
auto m1 = v1 + v2;
auto m2 = v2 - v1;
auto m3 = v3 - v1;
Vec4::save(dest + 4 * 0, m0);
Vec4::save(dest + 4 * 1, m1);
Vec4::save(dest + 4 * 2, m2);
Vec4::save(dest + 4 * 3, m3);
source += 8;
dest += 16;
v0 = v2;
v1 = v3;
}
}
#endif
static void _MNNAdjustOptimalSparseKernel(int& sparseBlockOC, MNN::CoreFunctions::MNNPackedSparseMatMul& packedSparseMatMul) {
if(sparseBlockOC == 4) {
packedSparseMatMul = MNNPackedSparseMatMulEpx4;
@ -3365,10 +3168,6 @@ void MNNCoreFunctionInit() {
gCoreFunction->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnit;
gCoreFunction->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwise;
gCoreFunction->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWise;
gCoreFunction->MNNSourceTransformCommonF23 = MNNSourceTransformCommonF23;
gCoreFunction->MNNConvDwF23MulTransUnit = MNNConvDwF23MulTransUnit;
gCoreFunction->MNNMultiAndDestTransformCommon23 = MNNMultiAndDestTransformCommon23;
gCoreFunction->MNNMatrixAdd = MNNMatrixAdd;
gCoreFunction->MNNMatrixSub = MNNMatrixSub;
gCoreFunction->MNNStrassenMergeCFunction = MNNStrassenMergeCFunction;
@ -3390,6 +3189,9 @@ void MNNCoreFunctionInit() {
gCoreFunction->chooseWinoDestUnrollTransform = WinogradFunction::chooseWinoDestUnrollTransform;
gCoreFunction->MNNDeconvRunForLineDepthwise = MNNDeconvRunForLineDepthwise;
gCoreFunction->MNNDeconvRunForUnitDepthWise = MNNDeconvRunForUnitDepthWise;
#ifdef MNN_USE_NEON
gCoreFunction->MNNDepthwiseConvFastKernel = MNNDepthwiseConvFastKernel;
#endif
gCoreFunction->MNNSelectBinaryFunctionForFloat = CPUBinary::selectForFloat;
gCoreFunction->MNNSelectUnaryFunctionForFloat = CPUUnary::selectForFloat;
gCoreFunction->MNNSelectUnaryFunctionForInt8 = CPUUnary::selectForInt8;

View File

@ -170,9 +170,6 @@ struct MatMulParam {
void MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId);
void MNNCopyC4Int16WithStride(const float* sourceF, float* destF, size_t srcStride, size_t dstStride, size_t count);
void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu);
void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* postParameter);
void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow);
void MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count);
struct SumByAxisParams {
@ -267,15 +264,10 @@ struct CoreFunctions {
void(*MNNUnpackCUnitTranspose)(float* dst, const float* src, size_t area, size_t depth, int* areaOffset);
// NC4HW4's compute function
void(*MNNConvRunForUnitDepthWise)(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
void(*MNNConvRunForLineDepthwise)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep);
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
void(*MNNAxByClampBroadcastUnit)(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters);
void(*MNNMultiAndDestTransformCommon23)(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* post);
void(*MNNSourceTransformCommonF23)(const float *source, float *dest, int unit, int iw, int pad, int su, int eu);
void(*MNNConvDwF23MulTransUnit)(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* post);
void(*MNNMatrixAdd)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
size_t bStride, size_t height);
void(*MNNMatrixSub)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
@ -309,6 +301,9 @@ struct CoreFunctions {
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
void(*MNNDeconvRunForLineDepthwise)(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step);
void(*MNNDepthwiseConvFastKernel)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) = nullptr;
void(*MNNReluWithSlopeChannel)(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad);
void(*MNNPoolingAvg)(const void* channelInput, int inputWidth, int inputHeight, void *channelOutput,
int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth,

View File

@ -44,10 +44,14 @@ ErrorCode ConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inputs, co
return NO_ERROR;
}
void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack) {
void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack, int blockNum) {
auto weightDst = weight->host<uint8_t>();
memset(weightDst, 0, weight->size());
if (SRC_UNIT > pack) {
int kernelCountUnit = weight->shape()[1];
int blockL = kernelCountUnit / blockNum;
int strideOutside = ROUND_UP(oc, UNIT) * SRC_UNIT * blockL;
int strideInside = weight->stride(0) / blockNum;
if (SRC_UNIT > pack) { // shape = {blockNum, UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack) / blockNum, UNIT, SRC_UNIT};
auto icDivU = UP_DIV(ic, pack);
for (int k = 0; k < kernelCount; ++k) {
const auto srcK = weightSrc + k;
@ -58,18 +62,21 @@ void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightS
const int ySubOutSide = yIndex / (SRC_UNIT / pack);
const int ySubInSide = yIndex % (SRC_UNIT / pack);
auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * pack + yInSide;
int blockId = ySubOutSide / blockL;
int blockInsideId = ySubOutSide % blockL;
auto dstY = weightDst + blockId * strideOutside + blockInsideId * weight->stride(1) + ySubInSide * pack + yInSide;
const auto srcY = srcK + y * kernelCount;
for (int x = 0; x < oc; ++x) {
const int xOutSide = x / UNIT;
const int xInSide = x % UNIT;
const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT;
const int dstIndex = xOutSide * strideInside + xInSide * SRC_UNIT;
const int srcIndex = x * kernelCount * ic;
dstY[dstIndex] = srcY[srcIndex];
}
}
}
} else {
} else { // shape = {blockNum, UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount / blockNum, UNIT, SRC_UNIT};
for (int k = 0; k < kernelCount; ++k) {
auto icDivU = UP_DIV(ic, SRC_UNIT);
const auto srcK = weightSrc + k;
@ -77,12 +84,15 @@ void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightS
const int yOutSide = y / SRC_UNIT;
const int yInSide = y % SRC_UNIT;
auto dstY = weightDst + (yOutSide + k * icDivU) * weight->stride(1) + yInSide;
int blockId = (yOutSide + k * icDivU) / blockL;
int blockInsideId = (yOutSide + k * icDivU) % blockL;
auto dstY = weightDst + blockId * strideOutside + blockInsideId * weight->stride(1) + yInSide;
const auto srcY = srcK + y * kernelCount;
for (int x = 0; x < oc; ++x) {
const int xOutSide = x / UNIT;
const int xInSide = x % UNIT;
const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT;
const int dstIndex = xOutSide * strideInside + xInSide * SRC_UNIT;
const int srcIndex = x * kernelCount * ic;
dstY[dstIndex] = srcY[srcIndex];
}
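Both branches above compute the destination address the same way; a minimal flat-offset sketch of the new {blockNum, UP_DIV(oc, UNIT), blockL, UNIT, SRC_UNIT} ordering, with hypothetical names and strides mirroring strideOutside/strideInside in the code above:
// Sketch only: ocUnitCount = UP_DIV(oc, UNIT), blockL = kernelCountUnit / blockNum.
static inline size_t blockedWeightOffset(int blockId, int ocOuter, int lInner, int ocInner, int srcInner,
                                         int ocUnitCount, int blockL, int UNIT, int SRC_UNIT) {
    const size_t strideOutside = (size_t)ocUnitCount * blockL * UNIT * SRC_UNIT; // one full quantization block
    const size_t strideInside  = (size_t)blockL * UNIT * SRC_UNIT;               // one oc tile within a block
    return (size_t)blockId * strideOutside + (size_t)ocOuter * strideInside
         + (size_t)lInner * UNIT * SRC_UNIT + (size_t)ocInner * SRC_UNIT + (size_t)srcInner;
}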
@ -93,7 +103,8 @@ void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightS
static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
const std::shared_ptr<Tensor>& weightOrigin,
std::shared_ptr<Tensor>& weight) {
std::shared_ptr<Tensor>& weight, int blockNum) {
MNN_ASSERT(blockNum > 0);
auto core = static_cast<CPUBackend*>(bn)->int8Functions();
auto gcore = static_cast<CPUBackend*>(bn)->functions();
int UNIT, SRC_UNIT, DST_XUNIT;
@ -119,11 +130,11 @@ static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common,
MNN_ERROR("Memory not enough");
return false;
}
ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host<uint8_t>(), SRC_UNIT, UNIT, ic, oc, kernelCount, pack);
ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host<uint8_t>(), SRC_UNIT, UNIT, ic, oc, kernelCount, pack, blockNum);
return true;
}
static void GetResourceInt8(std::shared_ptr<CPUConvolution::ResourceInt8> resource, std::shared_ptr<ConvolutionCommon::Int8Common> quantCommon, const Convolution2D* conv2d, Backend* backend) {
static void GetResourceInt8(std::shared_ptr<CPUConvolution::ResourceInt8> resource, std::shared_ptr<ConvolutionCommon::Int8Common> quantCommon, const Convolution2D* conv2d, Backend* backend, int32_t* blocknumPtr) {
// common parameters
int outputCount = conv2d->common()->outputCount();
auto core = static_cast<CPUBackend*>(backend)->functions();
@ -135,6 +146,7 @@ static void GetResourceInt8(std::shared_ptr<CPUConvolution::ResourceInt8> resour
dequantCnt /= 2;
}
int blockNum = dequantCnt / outputCount;
blocknumPtr[0] = blockNum;
int scaleSize = blockNum * ocUp4; // pack size.
int blockSize = LSize / blockNum;
int originOffset = 0;
@ -244,7 +256,9 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
auto gcore = static_cast<CPUBackend*>(backend)->functions();
mResourceInt8.reset(new CPUConvolution::ResourceInt8);
mResourceInt8->mDynamicQuant = true;
GetResourceInt8(mResourceInt8, quanCommon, convOp, backend);
int blockNum = 1;
GetResourceInt8(mResourceInt8, quanCommon, convOp, backend, &blockNum);
mBlockNum = blockNum;
// dynamic quant
int UNIT, SRC_UNIT, DST_XUNIT;
core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT);
@ -285,10 +299,15 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
// Pack two int4-weight to one int8-weight.
int cnt = lP * hP / 4;
int L = lU * lP;
int blockL = lU / blockNum;
int stride0 = (lP * hP) * hU * blockL;
int stride1 = (lP * hP) * blockL;
for (int i = 0; i < hU; ++i) {
for (int j = 0; j < lU; ++j) {
int blockId = j / blockL;
int blockkInsideId = j % blockL;
for (int k = 0; k < cnt; ++k) {
int dstIndx0 = (i * lU * lP * hP + j * lP * hP) / 2 + (2 * k);
int dstIndx0 = (blockId * stride0 + i * stride1 + blockkInsideId * lP * hP) / 2 + (2 * k);
int hpId0 = (2 * k + 1) / lP;
int lpId0 = (2 * k) % lP;
@ -322,7 +341,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
tmpWeight[2 * i + 1] = s1;
}
std::shared_ptr<Tensor> srcWeight(Tensor::create<uint8_t>({weightLength * 2}, (void*)tmpWeight.data()));
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8);
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8, blockNum);
if(!mValid) {
return;
}
@ -349,7 +368,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O
mResourceInt8->mWeightInt8 = weightLow;
} else {
std::shared_ptr<Tensor> srcWeight(Tensor::create<uint8_t>({weightLength}, (void*)quanCommon->weight.get()));
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8);
mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8, blockNum);
if(!mValid) {
return;
}
@ -429,7 +448,7 @@ static void _computeAlphaScale(Backend* backend, const Convolution2D* conv2d, st
DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr<ResourceInt8> res) : ConvInt8TiledExecutor(backend, op, res) {
std::shared_ptr<Tensor> weightOrigin = mResourceInt8->mWeightInt8;
auto convOp = op->main_as_Convolution2D();
mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8);
mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8, mBlockNum);
if(!mValid) {
return;
}
@ -559,7 +578,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
mDivides.resize(threads+1);
mDivides[0] = 0;
static_cast<const CPURuntime*>(backend()->getRuntime())->computeDivideSizes(totalWork, mDivides.data() + 1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(totalWork, mDivides.data() + 1);
for (int i = 0; i < mDivides.size(); ++i) {
mDivides[i] *= part;
}
@ -572,7 +591,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
mThreadNums = ALIMIN(threads, mTileCount);
mDivides.resize(threads+1);
mDivides[0] = 0;
static_cast<const CPURuntime*>(backend()->getRuntime())->computeDivideSizes(mTileCount, mDivides.data() + 1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(mTileCount, mDivides.data() + 1);
}
int ocUp4 = ROUND_UP(outC, gcore->pack);
// int alphaSize = mResource->mDequantize.mScaleBias->size() / (sizeof(float) * 2);
@ -663,6 +682,9 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
auto inputDataPtr = input->host<int8_t>();
auto im2colPtr = mTempIm2ColBuffer->host<int8_t>();
if (SRC_UNIT > PackUnit) {
memset(im2colPtr, 0, mTempIm2ColBuffer->size());
}
const auto weightDataPtr = mResourceInt8->mWeightInt8->host<int8_t>();
auto srcKernelSumPtr = mTempSrcSum.data();
auto weightDequantBias = mResourceInt8->mOriginScale->host<uint8_t>() + alphaSize * 4;
@ -736,7 +758,6 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
dequantscale = range / 255.0f;
zeropoint = roundf(-minVal * 255.f / range) - 128.0f;
}
std::vector<float>qsVec(PackUnit, quantscale);
auto sizeDiv = UP_DIV(inputsize, PackUnit);
int inputPlane = input->batch() * mIm2ColParamter.iw * mIm2ColParamter.ih;
if (gcore->bytes == 2 && gcore->pack == 8 && inputPlane > 1) { // C8->C4
@ -867,7 +888,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
const auto biasFloatTid = reinterpret_cast<float*>(biasPtr + ocIndex * 4);
const auto scaleFloatTid = reinterpret_cast<float*>(scalePtr + ocIndex * 4);
const auto weightDequanBiasTid = reinterpret_cast<float*>(weightDequantBias + ocIndex * 4);
const auto weightPtrTid = weightDataPtr + static_cast<int32_t>(ocIndex * kernelCountUnitDouble * SRC_UNIT * weightBytes);
const auto weightPtrTid = weightDataPtr + static_cast<int32_t>(ocIndex * blockL * SRC_UNIT * weightBytes);
if (mBlockNum == 1) {
quanParam.biasFloat = biasFloatTid;
quanParam.scale = scaleFloatTid;
@ -941,7 +962,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
quanParam.weightQuanBias = weightDequanBiasTid + k * ocUp4;
quanParam.scale = (float*)(scaleFloatTid + k * ocUp4);
mGemmKernel(outputInTilePtr, colAddrTemp + k * blockL * src_step_Y, weightPtrTid + k * blockL * weight_step_Y, blockL, dstZStep * dstBytes, ocDivThread, &quanParam, step);
mGemmKernel(outputInTilePtr, colAddrTemp + k * blockL * src_step_Y, weightPtrTid + k * blockL * weight_step_Y * UP_DIV(output->channel(), UNIT__), blockL, dstZStep * dstBytes, ocDivThread, &quanParam, step);
}
ptrX += (step * mBlockNum);
realDstCount-=step;

View File

@ -24,7 +24,7 @@ public:
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
virtual void getPackParameter(int* Unit, int* SrcUnit, int* DestUnit, const CoreInt8Functions* core) = 0;
static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack);
static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack, int blockNum = 1);
protected:
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
@ -74,7 +74,7 @@ private:
std::vector<int32_t> mDivides;
int mThreadNums;
int mBlockNum;
int mBlockNum = 1;
int mOcPerThread;
bool mSplitByOc;
bool mUseBatchQuan;

View File

@ -39,14 +39,17 @@ void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size
void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep) {
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
int dx, fx, fy;
auto biasValue = Vec4::load(bias);
auto minF = Vec4(parameters[0]);
auto maxF = Vec4(parameters[1]);
for (int y = 0; y < height; ++y) {
auto srcY = src + y * srcHStep;
auto dstY = dst + y * dstHStep;
for (dx = 0; dx < width; ++dx) {
float* dst_x = dstY + dx * 4;
Vec4 dstValue(0.0f);
auto dstValue = biasValue;
const float* src_z = srcY + src_w_setup * dx;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
@ -58,29 +61,13 @@ void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weigh
dstValue = dstValue + Vec4::load(src_x) * Vec4::load(weight_x);
}
}
dstValue = Vec4::min(dstValue, maxF);
dstValue = Vec4::max(dstValue, minF);
Vec4::save(dst_x, dstValue);
}
}
}
void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
int fx, fy;
Vec4 dstValue(0.0f);
const float* src_z = src;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = src_z + fy * dilateY_step;
const float* weight_y = weight_z + fy * weight_y_step;
for (fx = 0; fx < fw; ++fx) {
const float* weight_x = weight_y + 4 * fx;
const float* src_x = src_y + fx * dilateX_step;
dstValue = dstValue + Vec4::load(src_x) * Vec4::load(weight_x);
}
}
Vec4::save(dst, dstValue);
}
void MNNConvRunForUnitint8_t(float* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad,
size_t src_depth_step, size_t fw, size_t fh, size_t weight_y_step, size_t weight_z_step,
size_t dilateX_step, size_t dilateY_step, float* alpha) {

View File

@ -16,17 +16,19 @@
extern "C" {
#endif
void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep);
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
void MNNDeconvRunForUnitDepthWise(const float* dst, float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
void MNNDeconvRunForLineDepthwise(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step);
void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
size_t bStride, size_t height);
void MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,

View File

@ -133,11 +133,10 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
}
#endif
mWeightBytes = static_cast<float>(dequantBits) / 8.0f;
auto rt = static_cast<const CPURuntime*>(backend()->getRuntime());
if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) {
std::vector<int> divides(numberThread+1);
divides[0] = 0;
rt->computeDivideSizes(matrixSizeE, divides.data()+1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(matrixSizeE, divides.data()+1);
mUnits.resize(numberThread);
for (int i = 0; i < numberThread; ++i) {
int planeStart = divides[i];
@ -177,7 +176,7 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
auto ocDiv = UP_DIV(ocC4, hDiv);
std::vector<int> divides(numberThread+1);
divides[0] = 0;
rt->computeDivideSizes(ocDiv, divides.data()+1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(ocDiv, divides.data()+1);
mUnits.resize(numberThread);
for (int i = 0; i < numberThread; ++i) {
int ocStart = divides[i] * hDiv;

View File

@ -1,221 +0,0 @@
//
// ConvolutionDepthwise3x3.cpp
// MNN
//
// Created by MNN on 2019/4/3.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "CommonOptFunction.h"
#include "core/Concurrency.h"
#include "core/Macro.h"
namespace MNN {
ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(std::shared_ptr<CPUConvolution::Resource> resource, const Convolution2DCommon* common, Backend* b) : CPUConvolution(common, b) {
mResource = resource;
}
ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b,
const float *originWeight, size_t originWeightSize, const float *bias,
size_t biasSize)
: CPUConvolution(common, b) {
MNN_ASSERT(3 == common->kernelX() && 3 == common->kernelY());
MNN_ASSERT(1 == common->strideX() && 1 == common->strideY());
MNN_ASSERT(1 == common->dilateX() && 1 == common->dilateY());
mResource.reset(new Resource);
mResource->backend = b;
auto core = static_cast<CPUBackend*>(b)->functions();
auto pack = core->pack;
auto bytes = core->bytes;
auto success = mResource->copyBiasAlign(bias, biasSize);
if (!success) {
mValid = false;
return;
}
auto channel = common->outputCount();
auto channelC4 = UP_DIV(channel, pack);
auto unitSize = channelC4 * pack * 3 * 4;
mResource->mWeight.reset(Tensor::createDevice<uint8_t>({unitSize * bytes}));
mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC);
if (!mValid) {
return;
}
AutoStorage<float> tempWeightStorge;
auto weightHost = mResource->mWeight->host<float>();
if (bytes < 4) {
// Lowp need extra float storage for transform
tempWeightStorge.reset(unitSize);
if (nullptr == tempWeightStorge.get()) {
mValid = false;
return;
}
weightHost = tempWeightStorge.get();
}
::memset(weightHost, 0, unitSize * sizeof(float));
/* 1D-Winograd F(2,3) and tiling */
for (int c = 0; c < channel; ++c) {
auto cIndex = c / pack;
auto cRemain = c % pack;
auto weightDstZ = weightHost + cIndex * pack * 4 * 3 + cRemain;
auto weightSrcZ = originWeight + c * 9;
for (int y = 0; y < 3; ++y) {
auto k0 = weightSrcZ[3 * y + 0];
auto k1 = weightSrcZ[3 * y + 1];
auto k2 = weightSrcZ[3 * y + 2];
auto m0 = k0;
auto m1 = 0.5f * (k0 + k1 + k2);
auto m2 = 0.5f * (k0 - k1 + k2);
auto m3 = k2;
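// F(2,3) weight transform: [m0 m1 m2 m3]^T = G * [k0 k1 k2]^T, with G = {{1, 0, 0}, {0.5, 0.5, 0.5}, {0.5, -0.5, 0.5}, {0, 0, 1}}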
weightDstZ[(y * 4 + 0) * pack] = m0;
weightDstZ[(y * 4 + 1) * pack] = m1;
weightDstZ[(y * 4 + 2) * pack] = m2;
weightDstZ[(y * 4 + 3) * pack] = m3;
}
}
if (bytes < 4) {
core->MNNFp32ToLowp(weightHost, mResource->mWeight->host<int16_t>(), unitSize);
}
}
ConvolutionDepthwise3x3::~ConvolutionDepthwise3x3() {
// Do nothing
}
bool ConvolutionDepthwise3x3::onClone(Backend* bn, const Op* op, Execution** dst) {
if (nullptr == dst) {
return true;
}
auto dstExe = new ConvolutionDepthwise3x3(mResource, op->main_as_Convolution2D()->common(), bn);
*dst = dstExe;
return true;
}
ErrorCode ConvolutionDepthwise3x3::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
CPUConvolution::onResize(inputs, outputs);
const int numberThread = ((CPUBackend *)backend())->threadNumber();
auto output = outputs[0];
auto owUnit = UP_DIV(output->width(), 2);
auto core = static_cast<CPUBackend*>(backend())->functions();
// 3 cacheline
mCacheLine.reset(Tensor::createDevice<uint8_t>({numberThread, 3 * 4 * owUnit * core->pack * core->bytes}));
auto valid = backend()->onAcquireBuffer(mCacheLine.get(), Backend::DYNAMIC);
if (!valid) {
return OUT_OF_MEMORY;
}
backend()->onReleaseBuffer(mCacheLine.get(), Backend::DYNAMIC);
auto iw = inputs[0]->width();
mSourceStartX = UP_DIV(mPadX, 2);
mSourceEndX = std::max((iw + mPadX - 4) / 2, mSourceStartX);
mPostParameters = getPostParameters();
// auto rate = (float)(mSourceEndX-mSourceStartX) / (float)owUnit;
// FUNC_PRINT_ALL(rate, f);
int channelC4 = UP_DIV(inputs[0]->channel(), core->pack);
int batch = inputs[0]->batch();
auto total = channelC4 * batch;
mDivides.resize(numberThread+1);
mDivides[0] = 0;
static_cast<const CPURuntime*>(backend()->getRuntime())->computeDivideSizes(total, mDivides.data() + 1);
return NO_ERROR;
}
ErrorCode ConvolutionDepthwise3x3::onExecute(const std::vector<Tensor *> &inputs,
const std::vector<Tensor *> &outputs) {
auto input = inputs[0];
auto output = outputs[0];
auto core = static_cast<CPUBackend*>(backend())->functions();
int channelC4 = UP_DIV(input->channel(), core->pack);
int initSize = std::min(input->height(), 2);
int batch = input->batch();
int ow = output->width();
int oh = output->height();
int owUnit = UP_DIV(ow, 2);
auto iw = input->width();
auto ih = input->height();
auto kernelOrigin = mResource->mWeight->host<uint8_t>();
/*oy-mPadY>=0*/
int middelYStart = mPadY;
/*oy-mPadY+3-1 < ih*/
int middelYEnd = std::max(ih - 2 + mPadY, middelYStart);
int threadNumber = ((CPUBackend *)backend())->threadNumber();
auto maxKernelH = std::min(mPadY + ih, 3);
auto inputOrigin = input->host<uint8_t>();
auto outputOrigin = output->host<uint8_t>();
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
auto cacheLineStart = mCacheLine->host<uint8_t>() + tId * mCacheLine->stride(0);
for (int index = mDivides[tId]; index < mDivides[tId+1]; ++index) {
int z = index / batch;
auto biasPtr = (const float*)(mResource->mBias->host<uint8_t>() + core->bytes * core->pack * z);
auto inputZ = inputOrigin + core->pack * index * iw * ih * core->bytes;
auto outputZ = outputOrigin + core->pack * index * ow * oh * core->bytes;
auto kernelZ = kernelOrigin + z * core->pack * core->bytes * 4 * 3;
auto cacheLine0 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 0;
auto cacheLine1 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 1;
auto cacheLine2 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 2;
float *cacheLine[3] = {(float*)cacheLine0, (float*)cacheLine1, (float*)cacheLine2};
// Init
for (int i = 0; i < initSize; ++i) {
core->MNNSourceTransformCommonF23((const float*)(inputZ + i * iw * core->bytes * core->pack), cacheLine[i], owUnit, iw, mPadX, mSourceStartX,
mSourceEndX);
}
// Compute Top
for (int y = 0; y < middelYStart; ++y) {
auto outputY = outputZ + y * core->bytes * core->pack * ow;
int cacheLineSize = y - mPadY + maxKernelH;
if (cacheLineSize <= 0) {
::memset(outputY, 0, core->bytes * ow * core->pack);
core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data());
continue;
}
auto kernelPtr = kernelZ + (maxKernelH - cacheLineSize) * 4 * core->pack * core->bytes;
cacheLineSize = std::min(cacheLineSize, ih);
core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelPtr, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data());
}
// Compute Mid
for (int y = middelYStart; y < middelYEnd; ++y) {
auto outputY = outputZ + y * core->bytes * core->pack * ow;
auto iy = y - mPadY + 2;
core->MNNSourceTransformCommonF23((float*)(inputZ + core->bytes * core->pack * iy * iw), cacheLine[2], owUnit, iw, mPadX, mSourceStartX,
mSourceEndX);
// FUNC_PRINT(ow);
core->MNNConvDwF23MulTransUnit(cacheLine, (float*)kernelZ, (float*)outputY, ow, biasPtr, mPostParameters.data());
auto temp = cacheLine[0];
cacheLine[0] = cacheLine[1];
cacheLine[1] = cacheLine[2];
cacheLine[2] = temp;
}
// Compute Bottom
for (int y = middelYEnd; y < oh; ++y) {
auto outputY = outputZ + y * core->bytes * core->pack * ow;
int cacheLineSize = (ih - y + mPadY);
if (cacheLineSize <= 0) {
::memset(outputY, 0, ow * core->bytes * core->pack);
core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data());
continue;
}
core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelZ, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data());
cacheLine[0] = cacheLine[1];
cacheLine[1] = cacheLine[2];
}
}
} MNN_CONCURRENCY_END();
return NO_ERROR;
}
} // namespace MNN

View File

@ -1,37 +0,0 @@
//
// ConvolutionDepthwise3x3.hpp
// MNN
//
// Created by MNN on 2019/4/3.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef ConvolutionDepthwise3x3_hpp
#define ConvolutionDepthwise3x3_hpp
#include "backend/cpu/CPUConvolution.hpp"
namespace MNN {
class ConvolutionDepthwise3x3 : public CPUConvolution {
public:
ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b, const float *originWeight,
size_t originWeightSize, const float *bias, size_t biasSize);
virtual ~ConvolutionDepthwise3x3();
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
private:
ConvolutionDepthwise3x3(std::shared_ptr<Resource> resource, const Convolution2DCommon* common, Backend* b);
std::shared_ptr<Resource> mResource;
std::unique_ptr<Tensor> mCacheLine;
int mSourceStartX = 0;
int mSourceEndX = 0;
std::vector<float> mPostParameters;
std::vector<int> mDivides;
};
} // namespace MNN
#endif /* ConvolutionDepthwise3x3_hpp */

View File

@ -262,7 +262,7 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector<Tensor *> &inputs,
// MNN_PRINT("ow=%d, oh=%d\n", ow, oh);
std::vector<int> divides(threadNumber+1);
static_cast<const CPURuntime*>( static_cast<CPUBackend*>(backend())->getRuntime())->computeDivideSizes(totalCount, divides.data()+1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(totalCount, divides.data()+1);
divides[0] = 0;
auto midBuffer0Bytes = srcUnit2 * pack * bytes;
bool allow_x86_bf16_winograd = true;
@ -542,7 +542,7 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector<Tensor *> &inputs,
}
};
std::vector<int> postDivides(threadNumber+1);
static_cast<const CPURuntime*>( static_cast<CPUBackend*>(backend())->getRuntime())->computeDivideSizes(dc_4, postDivides.data()+1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(dc_4, postDivides.data()+1);
postDivides[0] = 0;
mPostFunction.first = threadNumber;

View File

@ -541,7 +541,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
auto rt = static_cast<const CPURuntime*>(backend()->getRuntime());
std::vector<int> ocC4ParralSize(threadNumber + 1);
ocC4ParralSize[0] = 0;
rt->computeDivideSizes(oC4, ocC4ParralSize.data()+1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(oC4, ocC4ParralSize.data()+1);
mFunction.second = [=](int placeholder) {
const float* biasPtr = bias ? bias->host<float>() : nullptr;
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * 0;
@ -583,7 +583,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
}
info[0] = 1;
int hw4Stride = info[1] * unit * bytes;
rt->computeDivideSizes(number * icC4, im2colParallelSize.data() + 1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(number * icC4, im2colParallelSize.data() + 1);
im2colParallelSize[0] = 0;
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
int threadEL[4];
@ -672,7 +672,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
std::vector<int> divides(threadNumber + 1);
divides[0] = 0;
static_cast<const CPURuntime*>(static_cast<CPUBackend*>(backend())->getRuntime())->computeDivideSizes(tileCount, divides.data() + 1);
static_cast<CPUBackend *>(backend())->computeDivideSizes(tileCount, divides.data() + 1);
mFunction.second = [=](int tId) {
const float* biasPtr = bias ? bias->host<float>() : nullptr;

View File

@ -1416,12 +1416,7 @@ static void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, co
size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount) {
const int bytes = ((post->useInt8 == 1) ? 1 : 4);
float fp32min = 0, fp32max = 0;
// if (0 == post->useInt8) {
// fp32min = (post->fp32minmax)[0];
// fp32max = (post->fp32minmax)[1];
// }
auto blockNum = post->blockNum;
int weight_step_Z = (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
int weight_step_Z = src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
int weight_step_Y = (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
const auto srcSumPtr = post->srcKernelSum;
if (0 == post->useInt8 && post->fp32minmax) {
@ -1486,7 +1481,7 @@ static void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src,
uint32_t c = 0xf;
const int bytes = 4;
float fp32min = 0, fp32max = 0;
int weight_step_Z = 0.5 * (post->blockNum * src_depth_quad) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
int weight_step_Z = 0.5 * (src_depth_quad) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
int weight_step_Y = 0.5 * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
MNN_ASSERT(post->useInt8==0);
if (post->fp32minmax) {
@ -1495,7 +1490,6 @@ static void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src,
}
float* biasPtr = (float*)post->biasFloat;
int blockNum = post->blockNum;
const auto srcSumPtr = post->srcKernelSum;
for (int dz = 0; dz < dst_depth_quad; ++dz) {

View File

@ -68,13 +68,12 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const
fp32min = _mm256_set1_ps((post->fp32minmax)[0]);
fp32max = _mm256_set1_ps((post->fp32minmax)[1]);
}
int blockNum = post->blockNum;
const float* biasPtr = nullptr;
if (post->biasFloat) {
biasPtr = post->biasFloat;
}
int weight_step_Z = 0.5 * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
int weight_step_Z = 0.5 * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
int weight_step_Y = 0.5 * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const __m128i mask = _mm_set1_epi8(0xf);
@ -506,7 +505,6 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
fp32min = _mm256_set1_ps((post->fp32minmax)[0]);
fp32max = _mm256_set1_ps((post->fp32minmax)[1]);
}
int blockNum = post->blockNum;
const float* biasPtr = nullptr;
if (post->biasFloat) {
biasPtr = post->biasFloat;
@ -554,7 +552,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
//printf("e=%d, sz=%d, dz=%d\n", realDst, src_depth_quad, dst_depth_quad);
if (GEMMINT8_AVX2_E == realDst) {
for (int dz = 0; dz < dst_depth_quad; ++dz) {
const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
auto dst_z = dst + dz * dst_step_tmp;
@ -683,7 +681,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
}
if (3 == realDst) {
for (int dz = 0; dz < dst_depth_quad; ++dz) {
const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
auto dst_z = dst + dz * dst_step_tmp;
@ -791,7 +789,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
}
if (2 == realDst) {
for (int dz = 0; dz < dst_depth_quad; ++dz) {
const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
auto dst_z = dst + dz * dst_step_tmp;
@ -879,7 +877,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
}
if (1 == realDst) {
for (int dz = 0; dz < dst_depth_quad; ++dz) {
const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H);
const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8;
const float* scale_dz = post->scale + dz * AVX2_PACKINT8;
auto dst_z = dst + dz * dst_step_tmp;

View File

@ -35,8 +35,6 @@ void _AVX_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int
void _AVX_MNNRoiAlignMax(float* dst, const float* src, const std::vector<std::vector<int>> &vecPos, const std::vector<std::vector<float>> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth);
void _AVX_MNNRoiAlignAvg(float* dst, const float* src, const std::vector<std::vector<int>> &vecPos, const std::vector<std::vector<float>> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth);
void _AVX_MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, size_t eSub, size_t hSub);
void _AVX_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step);
void _AVX_MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameter);
void _AVX_MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu);
void _AVX_MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* parameter);
@ -48,7 +46,7 @@ void _AVX_MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c
size_t length, size_t hSub);
void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep);
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
void _AVX_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters);
}
@ -108,40 +106,25 @@ void _AVX_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, si
}
}
void _AVX_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
int fx, fy;
__m256 dstValue = _mm256_setzero_ps();
const float* src_z = src;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = src_z + fy * dilateY_step;
const float* weight_y = weight_z + fy * weight_y_step;
for (fx = 0; fx < fw; ++fx) {
const float* weight_x = weight_y + PACK_UNIT * fx;
const float* src_x = src_y + fx * dilateX_step;
dstValue = _mm256_add_ps(dstValue, _mm256_mul_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x)));
}
}
_mm256_storeu_ps(dst, dstValue);
}
void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep) {
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
int dx, fx, fy;
const int unit = 4;
int widthUnit = width / unit;
int widthRemain = width - widthUnit * unit;
const float* weight_z = weight;
auto minF = _mm256_broadcast_ss(parameters + 0);
auto maxF = _mm256_broadcast_ss(parameters + 1);
auto bv = _mm256_loadu_ps(bias);
for (int y = 0; y < height; ++y) {
auto srcY = src + y * srcHStep;
auto dstY = dst + y * dstHStep;
for (dx = 0; dx < widthUnit; ++dx) {
auto dstValue0 = _mm256_setzero_ps();
auto dstValue1 = _mm256_setzero_ps();
auto dstValue2 = _mm256_setzero_ps();
auto dstValue3 = _mm256_setzero_ps();
auto dstValue0 = bv;
auto dstValue1 = bv;
auto dstValue2 = bv;
auto dstValue3 = bv;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = srcY + fy * dilateY_step;
const float* weight_y = weight_z + fy * fw * PACK_UNIT;
@ -155,6 +138,14 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
dstValue3 = _mm256_add_ps(dstValue3, _mm256_mul_ps(_mm256_loadu_ps(src_x + 3 * src_w_setup), weightValue));
}
}
dstValue0 = _mm256_min_ps(dstValue0, maxF);
dstValue1 = _mm256_min_ps(dstValue1, maxF);
dstValue2 = _mm256_min_ps(dstValue2, maxF);
dstValue3 = _mm256_min_ps(dstValue3, maxF);
dstValue0 = _mm256_max_ps(dstValue0, minF);
dstValue1 = _mm256_max_ps(dstValue1, minF);
dstValue2 = _mm256_max_ps(dstValue2, minF);
dstValue3 = _mm256_max_ps(dstValue3, minF);
_mm256_storeu_ps(dstY + PACK_UNIT * 0, dstValue0);
_mm256_storeu_ps(dstY + PACK_UNIT * 1, dstValue1);
_mm256_storeu_ps(dstY + PACK_UNIT * 2, dstValue2);
@ -164,7 +155,7 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
}
for (dx = 0; dx < widthRemain; ++dx) {
float* dst_x = dstY + dx * PACK_UNIT;
auto dstValue = _mm256_setzero_ps();
auto dstValue = bv;
const float* src_z = srcY + src_w_setup * dx;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
@ -176,6 +167,8 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
dstValue = _mm256_add_ps(dstValue, _mm256_mul_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x)));
}
}
dstValue = _mm256_min_ps(dstValue, maxF);
dstValue = _mm256_max_ps(dstValue, minF);
_mm256_storeu_ps(dst_x, dstValue);
}
}
@ -316,68 +309,6 @@ void _AVX_MNNGridSampleComputeCord(float* dst, const float* src, size_t inH, siz
}
}
static size_t _AVX_MNNGridSampleComputeOffset(int h, int w, int height, int width, bool padMode) {
if (padMode == true) { //padMode == BorderMode_ZEROS
if (h < 0 || h >= height || w < 0 || w >= width) {
return -1;
}
} else {
// Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER
// For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1),
// the leftover reflections degrade to GridSamplePaddingMode_BORDER
h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h);
w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w);
}
return h * width * PACK_UNIT + w * PACK_UNIT;
}
void _AVX_MNNGridSampleInterp(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
for (auto ow = 0; ow < outW; ++ow) {
auto w = cordPtr[2 * ow + 0];
auto h = cordPtr[2 * ow + 1];
__m256 interp;
if (sampleMode == true) { //sampleMode == SampleMode_NEAREST
int nh = ::floor(h + 0.5f);
int nw = ::floor(w + 0.5f);
size_t ns = _AVX_MNNGridSampleComputeOffset(nh, nw, inH, inW, padMode);
for (int k = 0; k < channelCUnit; ++k) {
interp = ns == -1 ? _mm256_set1_ps(0.f) : _mm256_loadu_ps(inputPtr + k * inOffset + ns);
_mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
}
} else { //sampleMode == GridSampleMode_BILINEAR
int w0_h = ::floor(h);
int w0_w = ::floor(w);
int w1_h = ::ceil(h);
int w1_w = ::ceil(w);
auto oneV = _mm256_set1_ps(1.0f);
auto f0 = _mm256_set1_ps((float)w1_w - w);
auto f1 = _mm256_sub_ps(oneV, f0);
auto h0 = _mm256_set1_ps((float)w1_h - h);
auto h1 = _mm256_sub_ps(oneV, h0);
size_t s00 = _AVX_MNNGridSampleComputeOffset(w0_h, w0_w, inH, inW, padMode);
size_t s01 = _AVX_MNNGridSampleComputeOffset(w0_h, w1_w, inH, inW, padMode);
size_t s10 = _AVX_MNNGridSampleComputeOffset(w1_h, w0_w, inH, inW, padMode);
size_t s11 = _AVX_MNNGridSampleComputeOffset(w1_h, w1_w, inH, inW, padMode);
for (int k = 0; k < channelCUnit; ++k) {
__m256 i00 = s00 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s00);
__m256 i01 = s01 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s01);
__m256 i10 = s10 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s10);
__m256 i11 = s11 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s11);
__m256 i0 = _mm256_add_ps(_mm256_mul_ps(i00, f0), _mm256_mul_ps(i01, f1));
__m256 i1 = _mm256_add_ps(_mm256_mul_ps(i10, f0), _mm256_mul_ps(i11, f1));
interp = _mm256_add_ps(_mm256_mul_ps(i0, h0), _mm256_mul_ps(i1, h1));
_mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
}
}
}
}
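Editor's note: the bilinear branch above blends the four neighbours with weights derived from the fractional position; a scalar sketch of that blend for one channel (names illustrative), mirroring f0/f1/h0/h1 in the vector code.

#include <cmath>

// Bilinear blend of the four neighbours of (h, w); i00..i11 are the samples
// at (floor(h), floor(w)) .. (ceil(h), ceil(w)).
static float BilinearRef(float h, float w, float i00, float i01, float i10, float i11) {
    float f0 = std::ceil(w) - w, f1 = 1.0f - f0;   // weights along width
    float h0 = std::ceil(h) - h, h1 = 1.0f - h0;   // weights along height
    float i0 = i00 * f0 + i01 * f1;                // row at floor(h)
    float i1 = i10 * f0 + i11 * f1;                // row at ceil(h)
    return i0 * h0 + i1 * h1;
}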
void _AVX_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int iw) {
Vec8 max = Vec8(-FLT_MAX);
for (int h = 0; h < hLen; h++, src += iw * PACK_UNIT) {
@ -524,70 +455,6 @@ static size_t _AVX_MNNGridSampleComputeOffset3D(int d, int h, int w, int depth,
return ((d * height + h) * width + w) * PACK_UNIT;
}
void _AVX_MNNGridSampleInterp3D(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
for (auto ow = 0; ow < outW; ++ow) {
auto w = cordPtr[3 * ow + 0];
auto h = cordPtr[3 * ow + 1];
auto d = cordPtr[3 * ow + 2];
__m256 interp;
if (sampleMode == true) { //sampleMode == SampleMode_NEAREST
int nd = ::floor(d + 0.5f);
int nh = ::floor(h + 0.5f);
int nw = ::floor(w + 0.5f);
size_t ns = _AVX_MNNGridSampleComputeOffset3D(nd, nh, nw, inD, inH, inW, padMode);
for (int k = 0; k < channelCUnit; ++k) {
interp = ns == -1 ? _mm256_set1_ps(0.f) : _mm256_loadu_ps(inputPtr + k * inOffset + ns);
_mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
}
} else { //sampleMode == GridSampleMode_BILINEAR
int w0_d = ::floor(d);
int w0_h = ::floor(h);
int w0_w = ::floor(w);
int w1_d = ::ceil(d);
int w1_h = ::ceil(h);
int w1_w = ::ceil(w);
auto oneV = _mm256_set1_ps(1.0f);
auto f0 = _mm256_set1_ps((float)w1_w - w);
auto f1 = _mm256_sub_ps(oneV, f0);
auto h0 = _mm256_set1_ps((float)w1_h - h);
auto h1 = _mm256_sub_ps(oneV, h0);
auto d0 = _mm256_set1_ps((float)w1_d - d);
auto d1 = _mm256_sub_ps(oneV, d0);
size_t s000 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w0_h, w0_w, inD, inH, inW, padMode);
size_t s001 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w0_h, w1_w, inD, inH, inW, padMode);
size_t s010 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w1_h, w0_w, inD, inH, inW, padMode);
size_t s011 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w1_h, w1_w, inD, inH, inW, padMode);
size_t s100 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w0_h, w0_w, inD, inH, inW, padMode);
size_t s101 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w0_h, w1_w, inD, inH, inW, padMode);
size_t s110 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w1_h, w0_w, inD, inH, inW, padMode);
size_t s111 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w1_h, w1_w, inD, inH, inW, padMode);
for (int k = 0; k < channelCUnit; ++k) {
__m256 i000 = s000 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s000);
__m256 i001 = s001 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s001);
__m256 i010 = s010 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s010);
__m256 i011 = s011 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s011);
__m256 i100 = s100 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s100);
__m256 i101 = s101 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s101);
__m256 i110 = s110 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s110);
__m256 i111 = s111 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s111);
__m256 i00 = _mm256_add_ps(_mm256_mul_ps(i000, f0), _mm256_mul_ps(i001, f1));
__m256 i01 = _mm256_add_ps(_mm256_mul_ps(i010, f0), _mm256_mul_ps(i011, f1));
__m256 i0 = _mm256_add_ps(_mm256_mul_ps(i00, h0), _mm256_mul_ps(i01, h1));
__m256 i10 = _mm256_add_ps(_mm256_mul_ps(i100, f0), _mm256_mul_ps(i101, f1));
__m256 i11 = _mm256_add_ps(_mm256_mul_ps(i110, f0), _mm256_mul_ps(i111, f1));
__m256 i1 = _mm256_add_ps(_mm256_mul_ps(i10, h0), _mm256_mul_ps(i11, h1));
interp = _mm256_add_ps(_mm256_mul_ps(i0, d0), _mm256_mul_ps(i1, d1));
_mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
}
}
}
}
void _AVX_MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride,
size_t bStride, size_t height) {
@ -867,13 +734,9 @@ void _AVX_ExtraInit(void* functions) {
coreFunction->MNNMatrixAdd = _AVX_MNNMatrixAdd;
coreFunction->MNNMatrixSub = _AVX_MNNMatrixSub;
coreFunction->MNNConvRunForUnitDepthWise = _AVX_MNNConvRunForUnitDepthWise;
coreFunction->MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwise;
coreFunction->MNNAxByClampBroadcastUnit = _AVX_MNNAxByClampBroadcastUnit;
coreFunction->MNNStrassenMergeCFunction = _AVX_MNNStrassenMergeCFunction;
coreFunction->MNNMultiAndDestTransformCommon23 = _AVX_MNNMultiAndDestTransformCommon23;
coreFunction->MNNSourceTransformCommonF23 = _AVX_MNNSourceTransformCommonF23;
coreFunction->MNNConvDwF23MulTransUnit = _AVX_MNNConvDwF23MulTransUnit;
coreFunction->MNNReluWithSlopeChannel = _AVX_MNNReluWithSlopeChannel;
coreFunction->MNNDeconvRunForLineDepthwise = _AVX_MNNDeconvRunForLineDepthwise;
coreFunction->MNNDeconvRunForUnitDepthWise = _AVX_MNNDeconvRunForUnitDepthWise;
@ -881,7 +744,7 @@ void _AVX_ExtraInit(void* functions) {
coreFunction->MNNGridSampleInterp = MNNGridSampleInterp;
coreFunction->MNNGridSampleInterpGrad = MNNGridSampleInterpGrad;
coreFunction->MNNGridSampleComputeCord3D = _AVX_MNNGridSampleComputeCord3D;
coreFunction->MNNGridSampleInterp3D = _AVX_MNNGridSampleInterp3D;
coreFunction->MNNGridSampleInterp3D = MNNGridSampleInterp3D;
coreFunction->MNNRoiPoolingMax = _AVX_MNNRoiPoolingMax;
coreFunction->MNNRoiAlignMax = _AVX_MNNRoiAlignMax;
coreFunction->MNNRoiAlignAvg = _AVX_MNNRoiAlignAvg;

View File

@ -115,7 +115,6 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s
fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
}
auto blockNum = post->blockNum;
const float* biasPtr = nullptr;
const float* bias_dz = nullptr;
const float* extraB_dz = nullptr;
@ -162,7 +161,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s
}
}
}
int weightZStride = blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
int weightZStride = src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
if (realDst == GEMMINT8_AVX512_E) {
for (int dz = 0; dz < dzU; ++dz) {
auto weight_dz = weight + dz * weightZStride;
@ -1452,7 +1451,6 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t
fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
}
auto blockNum = post->blockNum;
const float* biasPtr = nullptr;
const float* bias_dz = nullptr;
const float* extraB_dz = nullptr;
@ -1500,7 +1498,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t
}
}
}
int weight_step_Z = static_cast<int32_t>(blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); // sizeof(int4_t)
int weight_step_Z = static_cast<int32_t>(src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); // sizeof(int4_t)
int weight_step_Y = static_cast<int32_t>(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H / 2); // sizeof(int4_t)
if (realDst == GEMMINT8_AVX512_E) {

View File

@ -105,7 +105,6 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s
fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
}
auto blockNum = post->blockNum;
const float* biasPtr = nullptr;
const float* bias_dz = nullptr;
const float* extraB_dz = nullptr;
@ -113,7 +112,7 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s
biasPtr = post->biasFloat;
}
int weightZStride = blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
int weightZStride = src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H);
auto srcKernelSumPtr = post->srcKernelSum;
__m512 kernelSum0 = _mm512_setzero_ps();
@ -1444,7 +1443,6 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight
fp32min = _mm512_set1_ps((post->fp32minmax)[0]);
fp32max = _mm512_set1_ps((post->fp32minmax)[1]);
}
auto blockNum = post->blockNum;
const float* biasPtr = nullptr;
const float* bias_dz = nullptr;
const float* extraB_dz = nullptr;
@ -1458,7 +1456,7 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight
__m512 kernelSum2 = _mm512_setzero_ps();
__m512 kernelSum3 = _mm512_setzero_ps();
int weight_step_Z = static_cast<int32_t>(src_depth_quad * blockNum * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2);
int weight_step_Z = static_cast<int32_t>(src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2);
int weight_step_Y = static_cast<int32_t>(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H / 2);
const __m512i mask = _mm512_set1_epi8(0xf);
if (GEMMINT8_AVX512_E == realDst) {

View File

@ -124,40 +124,25 @@ void _AVX512_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B,
}
}
void _AVX512_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
int fx, fy;
__m512 dstValue = _mm512_setzero_ps();
const float* src_z = src;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = src_z + fy * dilateY_step;
const float* weight_y = weight_z + fy * weight_y_step;
for (fx = 0; fx < fw; ++fx) {
const float* weight_x = weight_y + PACK_UNIT * fx;
const float* src_x = src_y + fx * dilateX_step;
dstValue = _mm512_fmadd_ps(_mm512_loadu_ps(src_x), _mm512_loadu_ps(weight_x), dstValue);
}
}
_mm512_storeu_ps(dst, dstValue);
}
void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep) {
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
int dx, fx, fy;
const int unit = 4;
int widthUnit = width / unit;
int widthRemain = width - widthUnit * unit;
const float* weight_z = weight;
auto minF = _mm512_broadcastss_ps(_mm_load_ss(parameters + 0));
auto maxF = _mm512_broadcastss_ps(_mm_load_ss(parameters + 1));
auto bv = _mm512_loadu_ps(bias);
for (int y = 0; y < height; ++y) {
auto srcY = src + y * srcHStep;
auto dstY = dst + y * dstHStep;
for (dx = 0; dx < widthUnit; ++dx) {
auto dstValue0 = _mm512_setzero_ps();
auto dstValue1 = _mm512_setzero_ps();
auto dstValue2 = _mm512_setzero_ps();
auto dstValue3 = _mm512_setzero_ps();
auto dstValue0 = bv;
auto dstValue1 = bv;
auto dstValue2 = bv;
auto dstValue3 = bv;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = srcY + fy * dilateY_step;
const float* weight_y = weight_z + fy * fw * PACK_UNIT;
@ -171,6 +156,14 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa
dstValue3 = _mm512_fmadd_ps(_mm512_loadu_ps(src_x + 3 * src_w_setup), weightValue, dstValue3);
}
}
dstValue0 = _mm512_min_ps(dstValue0, maxF);
dstValue1 = _mm512_min_ps(dstValue1, maxF);
dstValue2 = _mm512_min_ps(dstValue2, maxF);
dstValue3 = _mm512_min_ps(dstValue3, maxF);
dstValue0 = _mm512_max_ps(dstValue0, minF);
dstValue1 = _mm512_max_ps(dstValue1, minF);
dstValue2 = _mm512_max_ps(dstValue2, minF);
dstValue3 = _mm512_max_ps(dstValue3, minF);
_mm512_storeu_ps(dstY + PACK_UNIT * 0, dstValue0);
_mm512_storeu_ps(dstY + PACK_UNIT * 1, dstValue1);
_mm512_storeu_ps(dstY + PACK_UNIT * 2, dstValue2);
@ -180,7 +173,7 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa
}
for (dx = 0; dx < widthRemain; ++dx) {
float* dst_x = dstY + dx * PACK_UNIT;
auto dstValue = _mm512_setzero_ps();
auto dstValue = bv;
const float* src_z = srcY + src_w_setup * dx;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
@ -192,6 +185,8 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa
dstValue = _mm512_fmadd_ps(_mm512_loadu_ps(src_x), _mm512_loadu_ps(weight_x), dstValue);
}
}
dstValue = _mm512_min_ps(dstValue, maxF);
dstValue = _mm512_max_ps(dstValue, minF);
_mm512_storeu_ps(dst_x, dstValue);
}
}
@ -307,68 +302,6 @@ void _AVX512_MNNGridSampleComputeCord(float* dst, const float* src, size_t inH,
}
}
static size_t _AVX512_MNNGridSampleComputeOffset(int h, int w, int height, int width, bool padMode) {
if (padMode == true) { //padMode == BorderMode_ZEROS
if (h < 0 || h >= height || w < 0 || w >= width) {
return -1;
}
} else {
// Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER
// For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1),
// the leftover reflections degrade to GridSamplePaddingMode_BORDER
h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h);
w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w);
}
return h * width * PACK_UNIT + w * PACK_UNIT;
}
void _AVX512_MNNGridSampleInterp(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) {
for (auto ow = 0; ow < outW; ++ow) {
auto w = cordPtr[2 * ow + 0];
auto h = cordPtr[2 * ow + 1];
__m512 interp;
if (sampleMode == true) { //sampleMode == SampleMode_NEAREST
int nh = ::floor(h + 0.5f);
int nw = ::floor(w + 0.5f);
size_t ns = _AVX512_MNNGridSampleComputeOffset(nh, nw, inH, inW, padMode);
for (int k = 0; k < channelCUnit; ++k) {
interp = ns == -1 ? _mm512_set1_ps(0.f) : _mm512_loadu_ps(inputPtr + k * inOffset + ns);
_mm512_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
}
} else { //sampleMode == GridSampleMode_BILINEAR
int w0_h = ::floor(h);
int w0_w = ::floor(w);
int w1_h = ::ceil(h);
int w1_w = ::ceil(w);
auto oneV = _mm512_set1_ps(1.0f);
auto f0 = _mm512_set1_ps((float)w1_w - w);
auto f1 = _mm512_sub_ps(oneV, f0);
auto h0 = _mm512_set1_ps((float)w1_h - h);
auto h1 = _mm512_sub_ps(oneV, h0);
size_t s00 = _AVX512_MNNGridSampleComputeOffset(w0_h, w0_w, inH, inW, padMode);
size_t s01 = _AVX512_MNNGridSampleComputeOffset(w0_h, w1_w, inH, inW, padMode);
size_t s10 = _AVX512_MNNGridSampleComputeOffset(w1_h, w0_w, inH, inW, padMode);
size_t s11 = _AVX512_MNNGridSampleComputeOffset(w1_h, w1_w, inH, inW, padMode);
for (int k = 0; k < channelCUnit; ++k) {
__m512 i00 = s00 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s00);
__m512 i01 = s01 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s01);
__m512 i10 = s10 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s10);
__m512 i11 = s11 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s11);
__m512 i0 = _mm512_add_ps(_mm512_mul_ps(i00, f0), _mm512_mul_ps(i01, f1));
__m512 i1 = _mm512_add_ps(_mm512_mul_ps(i10, f0), _mm512_mul_ps(i11, f1));
interp = _mm512_add_ps(_mm512_mul_ps(i0, h0), _mm512_mul_ps(i1, h1));
_mm512_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp);
}
}
}
}
void _AVX512_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int iw) {
Vec16 max = Vec16(-FLT_MAX);
for (int h = 0; h < hLen; h++, src += iw * PACK_UNIT) {
@ -752,13 +685,9 @@ void _AVX512_ExtraInit(void* functions) {
coreFunction->MNNCountMaxMinValue = _AVX512_MNNComputeScaleZeroScalar;
coreFunction->MNNAbsMax = _AVX512_MNNAbsMaxFP32;
coreFunction->MNNConvRunForUnitDepthWise = _AVX512_MNNConvRunForUnitDepthWise;
coreFunction->MNNConvRunForLineDepthwise = _AVX512_MNNConvRunForLineDepthwise;
coreFunction->MNNAxByClampBroadcastUnit = _AVX512_MNNAxByClampBroadcastUnit;
coreFunction->MNNStrassenMergeCFunction = _AVX512_MNNStrassenMergeCFunction;
coreFunction->MNNMultiAndDestTransformCommon23 = _AVX512_MNNMultiAndDestTransformCommon23;
coreFunction->MNNSourceTransformCommonF23 = _AVX512_MNNSourceTransformCommonF23;
coreFunction->MNNConvDwF23MulTransUnit = _AVX512_MNNConvDwF23MulTransUnit;
coreFunction->MNNReluWithSlopeChannel = _AVX512_MNNReluWithSlopeChannel;
coreFunction->MNNDeconvRunForLineDepthwise = _AVX512_MNNDeconvRunForLineDepthwise;
coreFunction->MNNDeconvRunForUnitDepthWise = _AVX512_MNNDeconvRunForUnitDepthWise;
@ -767,6 +696,7 @@ void _AVX512_ExtraInit(void* functions) {
coreFunction->MNNRoiAlignMax = _AVX512_MNNRoiAlignMax;
coreFunction->MNNRoiAlignAvg = _AVX512_MNNRoiAlignAvg;
coreFunction->MNNGridSampleInterp = MNNGridSampleInterp;
coreFunction->MNNGridSampleInterp3D = MNNGridSampleInterp3D;
coreFunction->MNNGridSampleInterpGrad = MNNGridSampleInterpGrad;
coreFunction->MNNGetSparseMatMulPackMode = _AVX512_MNNGetSparseMatMulPackMode;

View File

@ -11,40 +11,25 @@
#define PACK_UNIT 8
void _AVX_MNNConvRunForUnitDepthWiseFMA(float* dst, const float* src, const float* weight, size_t fw, size_t fh,
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) {
int fx, fy;
__m256 dstValue = _mm256_setzero_ps();
const float* src_z = src;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = src_z + fy * dilateY_step;
const float* weight_y = weight_z + fy * weight_y_step;
for (fx = 0; fx < fw; ++fx) {
const float* weight_x = weight_y + PACK_UNIT * fx;
const float* src_x = src_y + fx * dilateX_step;
dstValue = _mm256_fmadd_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x), dstValue);
}
}
_mm256_storeu_ps(dst, dstValue);
}
void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep) {
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
int dx, fx, fy;
const int unit = 4;
int widthUnit = width / unit;
int widthRemain = width - widthUnit * unit;
const float* weight_z = weight;
auto minF = _mm256_broadcast_ss(parameters + 0);
auto maxF = _mm256_broadcast_ss(parameters + 1);
auto bv = _mm256_loadu_ps(bias);
for (int y = 0; y < height; ++y) {
auto srcY = src + y * srcHStep;
auto dstY = dst + y * dstHStep;
for (dx = 0; dx < widthUnit; ++dx) {
auto dstValue0 = _mm256_setzero_ps();
auto dstValue1 = _mm256_setzero_ps();
auto dstValue2 = _mm256_setzero_ps();
auto dstValue3 = _mm256_setzero_ps();
auto dstValue0 = bv;
auto dstValue1 = bv;
auto dstValue2 = bv;
auto dstValue3 = bv;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = srcY + fy * dilateY_step;
const float* weight_y = weight_z + fy * fw * PACK_UNIT;
@ -58,6 +43,14 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa
dstValue3 = _mm256_fmadd_ps(_mm256_loadu_ps(src_x + 3 * src_w_setup), weightValue, dstValue3);
}
}
dstValue0 = _mm256_min_ps(dstValue0, maxF);
dstValue1 = _mm256_min_ps(dstValue1, maxF);
dstValue2 = _mm256_min_ps(dstValue2, maxF);
dstValue3 = _mm256_min_ps(dstValue3, maxF);
dstValue0 = _mm256_max_ps(dstValue0, minF);
dstValue1 = _mm256_max_ps(dstValue1, minF);
dstValue2 = _mm256_max_ps(dstValue2, minF);
dstValue3 = _mm256_max_ps(dstValue3, minF);
_mm256_storeu_ps(dstY + PACK_UNIT * 0, dstValue0);
_mm256_storeu_ps(dstY + PACK_UNIT * 1, dstValue1);
_mm256_storeu_ps(dstY + PACK_UNIT * 2, dstValue2);
@ -67,7 +60,7 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa
}
for (dx = 0; dx < widthRemain; ++dx) {
float* dst_x = dstY + dx * PACK_UNIT;
auto dstValue = _mm256_setzero_ps();
auto dstValue = bv;
const float* src_z = srcY + src_w_setup * dx;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
@ -79,6 +72,8 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa
dstValue = _mm256_fmadd_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x), dstValue);
}
}
dstValue = _mm256_min_ps(dstValue, maxF);
dstValue = _mm256_max_ps(dstValue, minF);
_mm256_storeu_ps(dst_x, dstValue);
}
}
@ -173,8 +168,6 @@ static void _AVXFMA_MNNAdjustOptimalSparseKernel(int& sparseBlockOC, MNN::CoreFu
void _AVX_ExtraInitFMA(void* functions) {
auto coreFunction = static_cast<MNN::CoreFunctions*>(functions);
coreFunction->MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwiseFMA;
coreFunction->MNNConvRunForUnitDepthWise = _AVX_MNNConvRunForUnitDepthWiseFMA;
coreFunction->MNNConvDwF23MulTransUnit = _AVX_MNNConvDwF23MulTransUnitFMA;
// sparse conv init
coreFunction->MNNAdjustOptimalSparseKernel = _AVXFMA_MNNAdjustOptimalSparseKernel;

View File

@ -68,7 +68,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const
void _SSE_MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el);
void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep);
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters);
void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step,
size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst);
void _SSE_MNNExpC8(float* dest, const float* source, float* offset, const float* parameters, size_t countC8);

View File

@ -73,9 +73,8 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons
if (post->biasFloat) {
biasPtr = post->biasFloat;
}
auto blockNum = post->blockNum;
for (int dz = 0; dz < dst_depth_quad; ++dz) {
const auto weight_dz = weight + dz * (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
const auto weight_dz = weight + dz * src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
const auto weightBias_dz = post->weightQuanBias + dz * GEMM_INT8_UNIT;
const float* scale_dz = nullptr;
scale_dz = post->scale + dz * GEMM_INT8_UNIT;
@ -324,8 +323,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const
if (post->biasFloat) {
biasPtr = post->biasFloat;
}
int blockNum = post->blockNum;
int weight_step_Z = 0.5 * (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
int weight_step_Z = 0.5 * src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
int weight_step_Y = 0.5 * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT);
auto oneValue = _mm_set1_epi16(1);

View File

@ -65,7 +65,7 @@ void _SSE_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slo
void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup,
size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height,
size_t srcHStep, size_t dstHStep) {
size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) {
int dx, fx, fy;
const int unit = 8;
int widthUnit = width / unit;
@ -75,18 +75,21 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
if (need4) {
widthRemain-=4;
}
auto minF = _mm_set1_ps(parameters[0]);
auto maxF = _mm_set1_ps(parameters[1]);
auto bv = _mm_loadu_ps(bias);
for (int y = 0; y < height; ++y) {
auto srcY = src + y * srcHStep;
auto dstY = dst + y * dstHStep;
for (dx = 0; dx < widthUnit; ++dx) {
auto dstValue0 = _mm_set1_ps(0.0f);
auto dstValue1 = _mm_set1_ps(0.0f);
auto dstValue2 = _mm_set1_ps(0.0f);
auto dstValue3 = _mm_set1_ps(0.0f);
auto dstValue4 = _mm_set1_ps(0.0f);
auto dstValue5 = _mm_set1_ps(0.0f);
auto dstValue6 = _mm_set1_ps(0.0f);
auto dstValue7 = _mm_set1_ps(0.0f);
auto dstValue0 = bv;
auto dstValue1 = bv;
auto dstValue2 = bv;
auto dstValue3 = bv;
auto dstValue4 = bv;
auto dstValue5 = bv;
auto dstValue6 = bv;
auto dstValue7 = bv;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = srcY + fy * dilateY_step;
const float* weight_y = weight_z + fy * fw * 4;
@ -104,6 +107,24 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
dstValue7 = _mm_add_ps(dstValue7, _mm_mul_ps(_mm_loadu_ps(src_x + 7 * src_w_setup), weightValue));
}
}
dstValue0 = _mm_min_ps(dstValue0, maxF);
dstValue1 = _mm_min_ps(dstValue1, maxF);
dstValue2 = _mm_min_ps(dstValue2, maxF);
dstValue3 = _mm_min_ps(dstValue3, maxF);
dstValue4 = _mm_min_ps(dstValue4, maxF);
dstValue5 = _mm_min_ps(dstValue5, maxF);
dstValue6 = _mm_min_ps(dstValue6, maxF);
dstValue7 = _mm_min_ps(dstValue7, maxF);
dstValue0 = _mm_max_ps(dstValue0, minF);
dstValue1 = _mm_max_ps(dstValue1, minF);
dstValue2 = _mm_max_ps(dstValue2, minF);
dstValue3 = _mm_max_ps(dstValue3, minF);
dstValue4 = _mm_max_ps(dstValue4, minF);
dstValue5 = _mm_max_ps(dstValue5, minF);
dstValue6 = _mm_max_ps(dstValue6, minF);
dstValue7 = _mm_max_ps(dstValue7, minF);
_mm_storeu_ps(dstY + 4 * 0, dstValue0);
_mm_storeu_ps(dstY + 4 * 1, dstValue1);
_mm_storeu_ps(dstY + 4 * 2, dstValue2);
@ -116,10 +137,10 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
srcY += unit * src_w_setup;
}
if (need4) {
auto dstValue0 = _mm_set1_ps(0.0f);
auto dstValue1 = _mm_set1_ps(0.0f);
auto dstValue2 = _mm_set1_ps(0.0f);
auto dstValue3 = _mm_set1_ps(0.0f);
auto dstValue0 = bv;
auto dstValue1 = bv;
auto dstValue2 = bv;
auto dstValue3 = bv;
for (fy = 0; fy < fh; ++fy) {
const float* src_y = srcY + fy * dilateY_step;
const float* weight_y = weight_z + fy * fw * 4;
@ -133,6 +154,15 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
dstValue3 = _mm_add_ps(dstValue3, _mm_mul_ps(_mm_loadu_ps(src_x + 3 * src_w_setup), weightValue));
}
}
dstValue0 = _mm_min_ps(dstValue0, maxF);
dstValue1 = _mm_min_ps(dstValue1, maxF);
dstValue2 = _mm_min_ps(dstValue2, maxF);
dstValue3 = _mm_min_ps(dstValue3, maxF);
dstValue0 = _mm_max_ps(dstValue0, minF);
dstValue1 = _mm_max_ps(dstValue1, minF);
dstValue2 = _mm_max_ps(dstValue2, minF);
dstValue3 = _mm_max_ps(dstValue3, minF);
_mm_storeu_ps(dstY + 4 * 0, dstValue0);
_mm_storeu_ps(dstY + 4 * 1, dstValue1);
_mm_storeu_ps(dstY + 4 * 2, dstValue2);
@ -142,7 +172,7 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
}
for (dx = 0; dx < widthRemain; ++dx) {
float* dst_x = dstY + dx * 4;
auto dstValue = _mm_set1_ps(0.0f);
auto dstValue = bv;
const float* src_z = srcY + src_w_setup * dx;
const float* weight_z = weight;
for (fy = 0; fy < fh; ++fy) {
@ -154,6 +184,8 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float*
dstValue = _mm_add_ps(dstValue, _mm_mul_ps(_mm_loadu_ps(src_x), _mm_loadu_ps(weight_x)));
}
}
dstValue = _mm_min_ps(dstValue, maxF);
dstValue = _mm_max_ps(dstValue, minF);
_mm_storeu_ps(dst_x, dstValue);
}
}

View File

@ -792,6 +792,44 @@ const char* shader_MetalLayerNorm_metal =
" out_data[gid.x]=(M4)(norm);\n"
" }\n"
"}\n"
"kernel void layernorm_m1x4_rms(const device M4 *in [[buffer(0)]],\n"
" device M4 *out [[buffer(1)]],\n"
" constant layernorm_constants& cst [[buffer(2)]],\n"
" const device float4 *gamma [[buffer(3)]],\n"
" const device float4 *beta [[buffer(4)]],\n"
" uint gid [[threadgroup_position_in_grid]],\n"
" uint tiisg[[thread_index_in_simdgroup]],\n"
" uint sgitg[[simdgroup_index_in_threadgroup]]) {\n"
" int total_idx=(gid*4+sgitg);\n"
" int in_idx=total_idx % (cst.inside/4);\n"
" int out_idx=total_idx/(cst.inside/4);\n"
" auto in_data=in+out_idx*cst.inside/4;\n"
" auto out_data=out+out_idx*cst.inside/4;\n"
" float square_sum=0.0f;\n"
" for(int i=tiisg; i<cst.inside/4; i+=SIMD_GROUP_WIDTH) {\n"
" M4 data=in_data[i];\n"
" float dis=data.x;\n"
" square_sum += dis*dis;\n"
" dis=data.y;\n"
" square_sum += dis*dis;\n"
" dis=data.z;\n"
" square_sum += dis*dis;\n"
" dis=data.w;\n"
" square_sum += dis*dis;\n"
" }\n"
" square_sum=simd_sum(square_sum);\n"
" \n"
" if(tiisg == 0) {\n"
" float var=1.0/sqrt(square_sum/cst.inside+cst.eps);\n"
" \n"
" float4 norm=var*((float4)in_data[in_idx]);\n"
" if(cst.has_gamma_beta) {\n"
" out_data[in_idx]=(M4)(norm*gamma[in_idx]+beta[in_idx]);\n"
" } else {\n"
" out_data[in_idx]=(M4)(norm);\n"
" }\n"
" }\n"
"}\n"
;
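Editor's note: the new layernorm_m1x4_rms kernel splits the squared sum across a SIMD group and reduces it with simd_sum; numerically it is plain RMS normalisation. A scalar sketch (gamma/beta optional), not tied to the Metal code:

#include <cmath>
#include <cstddef>

// RMS norm over 'inside' elements: y = x / sqrt(mean(x^2) + eps) [* gamma + beta].
static void RMSNormRef(float* out, const float* in, size_t inside, float eps,
                       const float* gamma, const float* beta) {
    float squareSum = 0.0f;
    for (size_t i = 0; i < inside; ++i) {
        squareSum += in[i] * in[i];
    }
    float var = 1.0f / std::sqrt(squareSum / inside + eps);
    for (size_t i = 0; i < inside; ++i) {
        float norm = in[i] * var;
        out[i] = gamma ? norm * gamma[i] + beta[i] : norm;
    }
}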
const char* shader_MetalConvolutionWinograd_metal =
"struct winograd_constants {\n"
@ -1578,6 +1616,60 @@ const char* shader_MetalConvolution1x1_metal =
" //if (computeSize>2) {xy_out[2]=activate(M4(result2),cst.activation); }\n"
" //if (computeSize>3) {xy_out[3]=activate(M4(result3),cst.activation); }\n"
"}\n"
"kernel void conv1x1_g1z4_m1w4(const device M4 *in [[buffer(0)]],\n"
" device M4 *out [[buffer(1)]],\n"
" constant conv1x1_constants& cst [[buffer(2)]],\n"
" const device MNN::uchar4x2 *wt [[buffer(3)]],\n"
" const device M4 *biasTerms [[buffer(4)]],\n"
" const device float4 *dequantScale [[buffer(5)]],\n"
" uint3 gid[[threadgroup_position_in_grid]],\n"
" uint tiisg[[thread_index_in_simdgroup]],\n"
" uint sgitg[[simdgroup_index_in_threadgroup]]) {\n"
" int uz=gid.x*2+sgitg;\n"
" int rx=gid.y;\n"
" auto xy_wt=wt+uz*cst.input_slice;\n"
" auto xy_in0=in+(int)gid.z*cst.input_size+rx+0;\n"
" auto xy_out=out+(int)gid.z*cst.output_size+uz*cst.output_size*cst.batch+rx;\n"
" auto biasValue=FLOAT4(biasTerms[uz]);\n"
" FLOAT4 result0=FLOAT4(0);\n"
" int block=(cst.input_slice+cst.block_size-1)/cst.block_size;\n"
" for (int bi=0; bi<cst.block_size; bi++) {\n"
" FLOAT4 scale=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+0]);\n"
" FLOAT4 dequant_bias=FLOAT4(dequantScale[2*(uz*cst.block_size+bi)+1]);\n"
" int zmin=bi*block;\n"
" int zmax=min(zmin+block,cst.input_slice);\n"
" for (int z=zmin+tiisg; z<zmax; z+=SIMD_GROUP_WIDTH) {\n"
" auto in40=(FLOAT4)*(xy_in0+z*cst.input_size*cst.batch);\n"
" MNN::uchar4x2 w_int4=xy_wt[z];\n"
" FLOAT4x4 w_dequant;\n"
" for (int i=0; i<4; ++i) {\n"
" FLOAT4 w4=FLOAT4((float)(w_int4[i][0] >> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n"
" FLOAT4 res=w4*scale[i]+dequant_bias[i];\n"
" w_dequant[i]=res;\n"
" }\n"
" result0 += FLOAT4(in40*w_dequant);\n"
" \n"
"// FLOAT4x4 w_dequant;\n"
"// for (int i=0; i<4; ++i) {\n"
"// FLOAT4 w4=FLOAT4((float)(w_int4[i][0] >> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n"
"// FLOAT4 res=w4*scale[i]+dequant_bias[i];\n"
"// w_dequant[i]=w4;\n"
"// }\n"
"//\n"
"// FLOAT4 temp=FLOAT4(in40*w_dequant);\n"
"// result0 += temp*scale+(in40.x+in40.y+in40.z+in40.w)*dequant_bias;\n"
" }\n"
" }\n"
" FLOAT4 res;\n"
" res.x=simd_sum(result0.x);\n"
" res.y=simd_sum(result0.y);\n"
" res.z=simd_sum(result0.z);\n"
" res.w=simd_sum(result0.w);\n"
" /* true */\n"
" if (tiisg == 0) {\n"
" xy_out[0]=activate(M4(res+biasValue),cst.activation);\n"
" }\n"
"}\n"
"kernel void conv1x1_g1z8(const device M4 *in [[buffer(0)]],\n"
" device M4 *out [[buffer(1)]],\n"
" constant conv1x1_constants& cst [[buffer(2)]],\n"
@ -1960,6 +2052,7 @@ const char* shader_MetalDefine_metal =
"// \n"
"// Macro\n"
"// \n"
"#define SIMD_GROUP_WIDTH 32 // setting SIMD group size is 32\n"
"#define UP_DIV(x,y) ( ((x)+(y)-1)/(y) )\n"
"#define ROUND_UP(x,y) ( ((x)+(y)-1)/(y)*(y) )\n"
"// whether computer with float32 when store with float16\n"

View File

@ -33,8 +33,8 @@ typedef enum {
/** metal device */
@property (strong, nonatomic, readonly) id<MTLDevice> device;
/** max memory length that could be used in a threadgroup */
@property (assign, nonatomic, readonly) BOOL isCommitEachShader;
@property (assign, nonatomic, readonly) BOOL isIphone;
@property (assign, nonatomic, readonly) BOOL isSimdGroupAvailable;
/**
* @brief alloc temp buffer on device

View File

@ -79,30 +79,17 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
}
}
+ (BOOL)commit_frequent{
struct utsname systemInfo;
uname(&systemInfo);
NSString *deviceString = [NSString stringWithCString:systemInfo.machine encoding:NSASCIIStringEncoding];
if ([deviceString isEqualToString:@"iPhone10,1"]) return YES; //@"iPhone 8 Global";
if ([deviceString isEqualToString:@"iPhone10,2"]) return YES; //@"iPhone 8 Plus Global";
if ([deviceString isEqualToString:@"iPhone10,4"]) return YES; //@"iPhone 8 GSM";
if ([deviceString isEqualToString:@"iPhone10,5"]) return YES; //@"iPhone 8 Plus GSM";
if ([deviceString isEqualToString:@"iPhone10,3"]) return YES; //@"A1865/A1902 iPhone X";
if ([deviceString isEqualToString:@"iPhone10,6"]) return YES; //@"Global/A1901 iPhone X";
if ([deviceString isEqualToString:@"iPhone11,2"]) return YES; //@"iPhone XS";
if ([deviceString isEqualToString:@"iPhone11,4"]) return YES; //@"iPhone XS Max";
if ([deviceString isEqualToString:@"iPhone11,6"]) return YES; //@"iPhone XS Max";
if ([deviceString isEqualToString:@"iPhone11,8"]) return YES; //@"iPhone XR";
if ([deviceString isEqualToString:@"iPhone12,1"]) return YES; //@"iPhone 11";
if ([deviceString isEqualToString:@"iPhone12,3"]) return YES; //@"iPhone 11 Pro";
if ([deviceString isEqualToString:@"iPhone12,5"]) return YES; //@"iPhone 11 Pro Max";
if ([deviceString isEqualToString:@"iPhone12,8"]) return YES; //@"iPhone SE 2";
if ([deviceString isEqualToString:@"iPhone13,1"]) return YES; //@"iPhone 12 mini";
if ([deviceString isEqualToString:@"iPhone13,2"]) return YES; //@"iPhone 12";
if ([deviceString isEqualToString:@"iPhone13,3"]) return YES; //@"iPhone 12 Pro";
if ([deviceString isEqualToString:@"iPhone13,4"]) return YES; //@"iPhone 12 Pro Max";
+ (BOOL)isSimdGroupAvailable{
#if TARGET_OS_IPHONE
if(@available(iOS 14, *)) {
return YES;
}
#endif
#if TARGET_OS_MAC
if(@available(macOS 10.14, *)) {
return YES;
}
#endif
return NO;
}
@ -124,8 +111,8 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
_device = context->device;
_cachesFp16 = [NSMutableDictionary dictionary];
_cachesFp32 = [NSMutableDictionary dictionary];
_isCommitEachShader = self.class.commit_frequent;
_isIphone = self.class.isIphone;
_isSimdGroupAvailable = self.class.isSimdGroupAvailable;
createLibrary(_device, _cachesFp16, true);
createLibrary(_device, _cachesFp32, false);
return nil != _device;

View File

@ -39,7 +39,9 @@ kernel void main0(const device T* input0 [[buffer(0)]],
const device int* mask [[buffer(4)]],
#endif
constant Param& param [[buffer(5)]],
uint3 gid[[thread_position_in_grid]]) {
uint3 gid[[thread_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
const int x = gid.x; // query_seq_len
const int y = gid.y; // head_num
const int z = gid.z; // key_seq_len
@ -102,7 +104,7 @@ kernel void main0(const device T* input0 [[buffer(0)]],
}
}
out *= Vscale;
output[y + z * head_num] = (T)out;
output[y * key_seq_len + z] = (T)out;
#endif
}
@ -158,18 +160,18 @@ kernel void main0(const device T* input0 [[buffer(0)]],
}
output[ x * stride * group + (y * head_dim + z)] = out;
#else
device const T *A_offset = input0 + y;
device const T *A_offset = input0 + y * value_seq_len;
device const T *B_offset = input1 + offset_head;
device T *Pastvalue_offset = past_value + offset_head;
float out = 0;
for(int i = 0; i < value_seq_len - 1; ++i){
float A = (float)A_offset[i * head_num];
float A = (float)A_offset[i];
float B = (float)Pastvalue_offset[i * stride];
out += A * B;
}
out += (float)A_offset[(value_seq_len - 1)*head_num] * (float)B_offset[0];
out += (float)A_offset[(value_seq_len - 1)] * (float)B_offset[0];
if (yr == 0) {
Pastvalue_offset[(value_seq_len - 1)*stride] = B_offset[0];
}
@ -282,6 +284,7 @@ void AttentionBufExecution::reallocKVCache() {
void AttentionBufExecution::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
auto query = inputs[0];
auto key = inputs[1];
auto value = inputs[2];
@ -407,8 +410,8 @@ void AttentionBufExecution::onEncode(const std::vector<Tensor *> &inputs, const
// For softmax parameter
int inside, outside;
if (mIsDecode) {
inside = mNumHead;
outside = 1;
inside = 1;
outside = mNumHead;
} else {
inside = 1;
outside = mCache->mKv_seq_len * mNumHead;
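Editor's note: the decode path now lays the QK scores out as [head, key_seq_len] (hence output[y * key_seq_len + z] above), so the softmax sees inside = 1 and outside = numHead. A generic sketch of that outside/axis/inside softmax layout, assuming the usual [outside, axis, inside] ordering rather than MNN's exact implementation:

#include <algorithm>
#include <cmath>
#include <cstddef>

// Softmax along 'axis' of a tensor laid out as [outside, axis, inside].
// With inside == 1 and outside == numHead this matches the decode layout above.
static void SoftmaxRef(float* data, size_t outside, size_t axis, size_t inside) {
    for (size_t o = 0; o < outside; ++o) {
        for (size_t i = 0; i < inside; ++i) {
            float* p = data + o * axis * inside + i;
            float maxV = p[0];
            for (size_t a = 1; a < axis; ++a) maxV = std::max(maxV, p[a * inside]);
            float sum = 0.0f;
            for (size_t a = 0; a < axis; ++a) {
                p[a * inside] = std::exp(p[a * inside] - maxV);
                sum += p[a * inside];
            }
            for (size_t a = 0; a < axis; ++a) p[a * inside] /= sum;
        }
    }
}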

View File

@ -190,9 +190,6 @@ public:
void flushEncoder() const;
id<MTLComputeCommandEncoder> encoder_for_net() const;
void addOpEncoder(std::function<void(void)> opEncoder);
bool isCommandEncoderSet();
BufferAllocator* getBufferPool() const;
EagerBufferAllocator *getStaticBufferPool() const {
@ -233,11 +230,8 @@ private:
const MetalRuntime* mRuntime;
mutable NSUInteger mEncoderCount = 0;
mutable bool mOpEncoderSet = false;//whether has set encoder
mutable bool mSupportDeferEncode = true;
mutable bool mFrameEncodeCache = false;
std::vector<std::function<void(void)>> mOpEncoders;
mutable id<MTLComputeCommandEncoder> mComputeEncoder = nil;
std::shared_ptr<BufferAllocator> mBufferPool;
std::shared_ptr<BufferAllocator> mBufferPoolShapeImmutable;

View File

@ -229,6 +229,7 @@ Execution *MetalBackend::onCreate(const std::vector<Tensor *> &inputs, const std
}
return NULL;
}
//MNN_PRINT("support type [%s]\n", EnumNameOpType(op->type()));
auto exe = iter->second->onCreate(inputs, op, this, outputs);
if (NULL == exe) {
@ -258,15 +259,8 @@ void MetalBackend::onExecuteBegin() const {
void MetalBackend::onExecuteEnd() const {
flushEncoder();
commit_net();
if(mFrameEncodeCache) {
// Prepare for next execute
for(auto opEncoder : mOpEncoders) {
opEncoder();
}
mOpEncoderSet = true;
}
}
BufferAllocator* MetalBackend::getBufferPool() const {
return mCurrentAllocator;
}
@ -302,18 +296,11 @@ bool MetalBackend::onGetTensorInfo(const Tensor* tensor, void* dstInfo) {
return true;
}
bool MetalBackend::isCommandEncoderSet() {
return mOpEncoderSet;// !isCommitEachShader & mOpFullSupport
}
bool MetalBackend::isCmdBufferCommit() {
auto ctx = (__bridge MNNMetalContext *)context();
if(!ctx.isCommitEachShader) {
return false;
}
//TODO: set magic number
const int magicNum = 2;
const int magicNum = mRuntime->hint().encorderNumForCommit;
mEncoderCount++;
if(mEncoderCount != 0 && mEncoderCount % magicNum == 0) {
return true;
@ -321,12 +308,6 @@ bool MetalBackend::isCmdBufferCommit() {
return false;
}
void MetalBackend::addOpEncoder(std::function<void(void)> opEncoder) {
if(mFrameEncodeCache) {
mOpEncoders.push_back(opEncoder);
}
}
id<MTLBuffer> MetalBackend::getHostBuffer(size_t size) const {
size = UP_DIV(size, METAL_CONST_BUFFER_LIMIT) * METAL_CONST_BUFFER_LIMIT;
// reuse
@ -535,10 +516,6 @@ kernel void main0(const device IType *in [[buffer(0)]], device OType *out [[buff
})metal";
void MetalBackend::onResizeBegin() {
mFrameEncodeCache = false;
mOpEncoderSet = false;
mOpEncoders.clear();
// Abort last inference task if needed
flushEncoder();
_commandBuffer_net = nil;
@ -549,7 +526,6 @@ void MetalBackend::onResizeBegin() {
ErrorCode MetalBackend::onResizeEnd() {
auto ctx = (__bridge MNNMetalContext *)context();
mFrameEncodeCache = (!ctx.isCommitEachShader && mSupportDeferEncode);
return mCurrentAllocator->compute();
}
@ -711,9 +687,8 @@ void MetalBackend::onCopyDeviceToDevice(const Tensor *src, const Tensor *dst,
void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst) const {
flushEncoder();
auto ctx = (__bridge MNNMetalContext *)context();
if(!mFrameEncodeCache) {
commit_net();
}
_resetDynamicMemory();
onCopyBuffer(src, dst, nil, nil);
}
@ -789,9 +764,8 @@ void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst, id<MTLComp
int MetalBackend::onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) {
flushEncoder();
auto ctx = (__bridge MNNMetalContext *)context();
if(!mOpEncoderSet) {
commit_net();
}
if (toCpu) {
wait();
}

View File

@ -87,9 +87,17 @@ ErrorCode MetalConvolution1x1::onResize(const std::vector<Tensor *> &inputs, con
std::string name = "conv1x1_g1z4_w8";
mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w8" fp16:backend->useFp16InsteadFp32()];
if (mDequantBits == 4) {
if(context.isSimdGroupAvailable && ob * ow * oh == 1) {
mPipeline = [context pipelineWithName:@"conv1x1_g1z4_m1w4" fp16:backend->useFp16InsteadFp32()];
name = "conv1x1_g1z4_m1w4";
mThreads = std::make_pair(MTLSizeMake(UP_DIV(oc, 8), 1, 1), MTLSizeMake(8, 8, 1));
return NO_ERROR;
} else {
mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w4" fp16:backend->useFp16InsteadFp32()];
name = "conv1x1_g1z4_w4";
}
}
NSArray *arr = [NSArray arrayWithObjects:(id<MTLBuffer>)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(),
(id<MTLBuffer>)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(),
mConstBuffer, (((MetalRuntimeAllocator::MetalBufferAlloc *)mWeight->deviceId()))->getBuffer(),

View File

@ -18,10 +18,6 @@ MetalExecution::MetalExecution(Backend *backend) : Execution(backend) {
ErrorCode MetalExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
if(backend->isCommandEncoderSet()) {
return NO_ERROR;
}
auto func = [=](){
auto encoder = backend->encoder_for_net();
this->onEncode(inputs, outputs, encoder);
@ -31,7 +27,6 @@ ErrorCode MetalExecution::onExecute(const std::vector<Tensor *> &inputs, const s
}
};
func();
backend->addOpEncoder(func);
return NO_ERROR;
}

View File

@ -26,7 +26,7 @@ using namespace metal;
#endif
struct grid_sample_params {
int batches;
int batch;
int channels;
int inH;
int inW;
@ -179,7 +179,7 @@ kernel void main0(const device T *input [[buffer(0)]],
device T *output [[buffer(2)]],
constant grid_sample_params &p [[buffer(3)]],
uint3 gid [[thread_position_in_grid]]) {
if ((int)gid.x >= p.outW || (int)gid.y >= p.outH * p.outD || (int)gid.z >= p.batches)
if ((int)gid.x >= p.outW || (int)gid.y >= p.outH * p.outD || (int)gid.z >= p.batch)
return;
int gridPos = gid.z*p.outH*p.outW*CON + gid.y*p.outW*CON + gid.x*CON;
@ -191,8 +191,8 @@ kernel void main0(const device T *input [[buffer(0)]],
const int channelC4 = (p.channels + 3) / 4;
for (int c = 0; c < channelC4; ++ c) {
auto outputPos = gid.z*channelC4*p.outH*p.outW + c*p.outH*p.outW + gid.y*p.outW + gid.x;
auto inputPtr = input + gid.z*channelC4*p.inH*p.inW + c*p.inH*p.inW;
auto outputPos = gid.z*p.outD*p.outH*p.outW + c*p.outD*p.outH*p.outW*p.batch + gid.y*p.outW + gid.x;
auto inputPtr = input + gid.z*p.inD*p.inH*p.inW + c*p.inH*p.inW*p.inD*p.batch;
#if GRID3D
output[outputPos] = interpolate(z, y, x, inputPtr, p.inD, p.inH, p.inW, p.mode, p.paddingMode);
#else

View File

@ -76,6 +76,7 @@ ErrorCode MetalLayerNorm::onResize(const std::vector<Tensor *> &inputs, const st
((int *)mShapeBuffer.contents)[3] = (int)has_gamma_beta_;
bool parallel = (mInside > 32) && ((mInside & 3) == 0);
if(RMSNorm){
mPipeline = [context pipelineWithName:parallel ? @"layernorm_x4_rms" : @"layernorm_x1_rms" fp16:backend->useFp16InsteadFp32()];
@ -85,10 +86,17 @@ ErrorCode MetalLayerNorm::onResize(const std::vector<Tensor *> &inputs, const st
auto inside = parallel ? mInside/4 : mInside;
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger)inside, (NSUInteger)mOutside, 1)];
if(context.isSimdGroupAvailable) {
if(mOutside == 1 && RMSNorm && parallel) {
mPipeline = [context pipelineWithName:@"layernorm_m1x4_rms" fp16:backend->useFp16InsteadFp32()];
mThreads = std::make_pair(MTLSizeMake((NSUInteger)UP_DIV(inside, 4) * mOutside, 1, 1), MTLSizeMake(128, 1, 1));
}
}
return NO_ERROR;
}
void MetalLayerNorm::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
auto input = inputs[0], output = outputs[0];

View File

@ -550,6 +550,7 @@ public:
}
virtual void onEncode(const std::vector<Tensor *>& inputs, const std::vector<Tensor *>& outputs,
id<MTLComputeCommandEncoder> encoder) override {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
auto dstTensor = mTensors[cmd->indexes()->data()[0]];
auto srcTensor = mTensors[cmd->indexes()->data()[1]];

View File

@ -28,13 +28,10 @@ public:
MTLSize global;
};
private:
std::map<Tensor*, std::shared_ptr<Tensor>> mTempInput;
std::map<Tensor*, BlitInfo> mTempInputCopy;
std::shared_ptr<Tensor> mTempOutput;
bool mNeedZero = false;
Tensor* mOutputPtr = nullptr;
id<MTLComputePipelineState> mBlitPipeline;
std::vector<id<MTLBuffer>> mShapeTemp;
std::vector<id<MTLComputePipelineState>> mBlitPipeline;
id<MTLBuffer> mZeroCopy = nil;
id<MTLComputePipelineState> mZeroPipeline;
};

View File

@ -35,6 +35,31 @@ static void writeSamplerInfo(SamplerInfo& info, const Tensor::InsideDescribe::Re
info.extent[3] = sampler.dst.offset;
}
static std::string getUnitName(int bytes) {
std::string unitName;
switch (bytes) {
case 1:
unitName = "uchar";
break;
case 2:
unitName = "short";
break;
case 4:
unitName = "int";
break;
case 8:
unitName = "short4";
break;
case 16:
unitName = "int4";
break;
default:
FUNC_PRINT(bytes);
break;
}
return unitName;
}
static const char* gMultiBlitMetal = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
@ -85,6 +110,125 @@ kernel void main0(const device T *in [[buffer(0)]],
}
)metal";
static const char* gMultiRasterTemplate = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SamplerInfo {
uint4 stride;//stride[3] + offset
uint4 size;//size[3] + totalSize
uint4 extent;//dstStride[3]+dstOffset
};
kernel void main0(const device T *in [[buffer(0)]],
device T *out [[buffer(1)]],
const device uint4* buf [[buffer(2)]],
uint3 tgid [[thread_position_in_grid]]) {
uint4 limit = buf[2];
const device SamplerInfo* infoP = (const device SamplerInfo*)(buf + 3);
uint3 gid = tgid;
gid.x = tgid.x % limit.x;
uint n = tgid.x / limit.x;
if (n < limit.y) {
SamplerInfo info = infoP[n];
if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) {
uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w;
uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w;
#ifdef INPUT_FORMAT_NCHW
int srcOffsetReal = srcOffset;
#elif INPUT_FORMAT_NHWC
int srcOffsetReal = srcOffset;
#elif INPUT_FORMAT_C4NHW4
uint4 src_shape = buf[0];//src nchw
int src_batch = src_shape.x;
int src_channel = src_shape.y;
int src_height = src_shape.z;
int src_width = src_shape.w;
int in_w = srcOffset % src_width; srcOffset /= src_width;
int in_h = srcOffset % src_height; srcOffset /= src_height;
int in_c = srcOffset % src_channel;
int in_b = srcOffset / src_channel;
int srcOffsetReal = (((in_b + (in_c / 4) * src_batch) * src_height + in_h) * src_width + in_w) * 4 + (in_c % 4);
#endif
#ifdef OUTPUT_FORMAT_NCHW
int dstOffsetReal = dstOffset;
#elif OUTPUT_FORMAT_NHWC
int dstOffsetReal = dstOffset;
#elif OUTPUT_FORMAT_C4NHW4
uint4 dst_shape = buf[1];//dst nchw
int dst_batch = dst_shape.x;
int dst_channel = dst_shape.y;
int dst_height = dst_shape.z;
int dst_width = dst_shape.w;
int out_w = dstOffset % dst_width; dstOffset /= dst_width;
int out_h = dstOffset % dst_height; dstOffset /= dst_height;
int out_c = dstOffset % dst_channel;
int out_b = dstOffset / dst_channel;
int dstOffsetReal = (((out_b + (out_c / 4) * dst_batch) * dst_height + out_h) * dst_width + out_w) * 4 + (out_c % 4);
#endif
out[dstOffsetReal] = in[srcOffsetReal];
}
}
}
)metal";
static const char* gSingleRasterTemplate = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
using namespace metal;
struct SamplerInfo {
uint4 stride;//stride[3] + offset
uint4 size;//size[3] + totalSize
uint4 extent;//dstStride[3]+dstOffset
};
kernel void main0(const device T *in [[buffer(0)]],
device T *out [[buffer(1)]],
const device uint4* buf [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
SamplerInfo info = *((const device SamplerInfo*)(buf + 3));
if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) {
uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w;
uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w;
#ifdef INPUT_FORMAT_NCHW
int srcOffsetReal = srcOffset;
#elif INPUT_FORMAT_NHWC
int srcOffsetReal = srcOffset;
#elif INPUT_FORMAT_C4NHW4
uint4 src_shape = buf[0];//src nchw
int src_batch = src_shape.x;
int src_channel = src_shape.y;
int src_height = src_shape.z;
int src_width = src_shape.w;
int in_w = srcOffset % src_width; srcOffset /= src_width;
int in_h = srcOffset % src_height; srcOffset /= src_height;
int in_c = srcOffset % src_channel;
int in_b = srcOffset / src_channel;
int srcOffsetReal = (((in_b + (in_c / 4) * src_batch) * src_height + in_h) * src_width + in_w) * 4 + (in_c % 4);
#endif
#ifdef OUTPUT_FORMAT_NCHW
int dstOffsetReal = dstOffset;
#elif OUTPUT_FORMAT_NHWC
int dstOffsetReal = dstOffset;
#elif OUTPUT_FORMAT_C4NHW4
uint4 dst_shape = buf[1];//dst nchw
int dst_batch = dst_shape.x;
int dst_channel = dst_shape.y;
int dst_height = dst_shape.z;
int dst_width = dst_shape.w;
int out_w = dstOffset % dst_width; dstOffset /= dst_width;
int out_h = dstOffset % dst_height; dstOffset /= dst_height;
int out_c = dstOffset % dst_channel;
int out_b = dstOffset / dst_channel;
int dstOffsetReal = (((out_b + (out_c / 4) * dst_batch) * dst_height + out_h) * dst_width + out_w) * 4 + (out_c % 4);
#endif
out[dstOffsetReal] = in[srcOffsetReal];
}
}
)metal";
static const char* gFillInt4 = R"metal(
#include <metal_stdlib>
#include <simd/simd.h>
@ -105,32 +249,13 @@ kernel void main0(device int4 *out [[buffer(0)]],
id<MTLComputePipelineState> MetalRaster::getBlitPipeline(int bytes, Backend* backend, bool multiRegion) {
auto mtbn = static_cast<MetalBackend*>(backend);
std::string pipelineName;
std::string unitName;
std::string unitName = getUnitName(bytes);
if (multiRegion) {
pipelineName = "blit_multi";
} else {
pipelineName = "blit";
}
switch (bytes) {
case 1:
unitName = "uchar";
break;
case 2:
unitName = "short";
break;
case 4:
unitName = "int";
break;
case 8:
unitName = "short4";
break;
case 16:
unitName = "int4";
break;
default:
FUNC_PRINT(bytes);
break;
}
std::vector<std::string> keys = {
unitName,
pipelineName
@ -159,9 +284,6 @@ MetalRaster::~MetalRaster() {
if (nil != mZeroCopy) {
mtbn->returnConstBuffer(mZeroCopy);
}
for (auto b : mShapeTemp) {
mtbn->returnConstBuffer(b);
}
}
struct MemsetInfo {
int value[4];
@ -197,9 +319,8 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
mZeroCopy = mtbn->getConstBuffer(sizeof(MemsetInfo));
}
}
mTempInput.clear();
mTempInputCopy.clear();
mTempOutput = nullptr;
mOutputPtr = output;
#ifndef MNN_METAL_FORBID_RASTER_C4
if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
@ -216,7 +337,8 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
}
}
if (fast) {
mBlitPipeline = getBlitPipeline(bytes * 4, backend(), true);
mBlitPipeline.resize(1);
mBlitPipeline[0] = getBlitPipeline(bytes * 4, backend(), true);
std::map<Tensor*, std::vector<int>> collectForTensor;
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
@ -249,7 +371,7 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
}
((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[0] = maxSize[0];
((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[1] = iter.second.size();
auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
auto local = [context computeBestGroupAndLocal:mBlitPipeline[0] threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
blit.global = local.first;
blit.local = local.second;
mTempInputCopy.insert(std::make_pair(iter.first, blit));
@ -258,57 +380,14 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
}
}
#endif
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
auto origin = slice.origin;
if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
continue;
}
if (mTempInput.find(origin)!=mTempInput.end()) {
continue;
}
std::shared_ptr<Tensor> newTensor(new Tensor);
TensorUtils::copyShape(origin, newTensor.get());
TensorUtils::getDescribe(newTensor.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW;
newTensor->buffer().type = origin->getType();
TensorUtils::setLinearLayout(newTensor.get());
mTempInput.insert(std::make_pair(origin, newTensor));
}
if (MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat) {
mTempOutput.reset(new Tensor);
TensorUtils::setupTensorInfo(output, mTempOutput.get(), MNN_DATA_FORMAT_NCHW);
}
if (nullptr != mTempOutput) {
auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
mOutputPtr = mTempOutput.get();
}
for (auto& iter : mTempInput) {
auto res = backend()->onAcquireBuffer(iter.second.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
}
for (auto& iter : mTempInput) {
backend()->onReleaseBuffer(iter.second.get(), Backend::DYNAMIC);
}
if (nullptr != mTempOutput) {
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
}
mBlitPipeline = getBlitPipeline(bytes, backend(), true);
std::map<Tensor*, std::vector<int>> collectForTensor;
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (nullptr == slice.origin) {
continue;
}
auto iter = mTempInput.find(slice.origin);
Tensor* t = slice.origin;
if (iter != mTempInput.end()) {
t = iter->second.get();
}
auto coliter = collectForTensor.find(t);
if (coliter == collectForTensor.end()) {
collectForTensor.insert(std::make_pair(t, std::vector<int>{i}));
@ -316,15 +395,64 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
coliter->second.emplace_back(i);
}
}
NSString* input_format;
NSString* output_format;
if(outputDes->dimensionFormat == MNN_DATA_FORMAT_NCHW) {
output_format = @"OUTPUT_FORMAT_NCHW";
} else if(outputDes->dimensionFormat == MNN_DATA_FORMAT_NHWC) {
output_format = @"OUTPUT_FORMAT_NHWC";
} else {
output_format = @"OUTPUT_FORMAT_C4NHW4";
}
std::string unitName = getUnitName(bytes);
mBlitPipeline.resize(collectForTensor.size());
int index = 0;
for (auto& iter : collectForTensor) {
auto origin = iter.first;
if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NCHW) {
input_format = @"INPUT_FORMAT_NCHW";
} else if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NHWC) {
input_format = @"INPUT_FORMAT_NHWC";
} else {
input_format = @"INPUT_FORMAT_C4NHW4";
}
std::vector<std::string> keys = {
std::string([input_format UTF8String]),
std::string([output_format UTF8String]),
unitName,
};
if(iter.second.size() == 1) {
keys.emplace_back("direct_raster_single");
} else {
keys.emplace_back("direct_raster_multi");
}
auto pipeline = mtbn->runtime()->findPipeline(keys);
if(nullptr == pipeline) {
MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
options.preprocessorMacros = @{
input_format : @"1",
output_format : @"1",
@"T" : @(unitName.c_str()),
};
if(iter.second.size() == 1) {
pipeline = mtbn->makeComputePipelineWithSourceOption(gSingleRasterTemplate, "main0", options);
} else {
pipeline = mtbn->makeComputePipelineWithSourceOption(gMultiRasterTemplate, "main0", options);
}
mtbn->runtime()->insertPipeline(keys, pipeline);
}
mBlitPipeline[index] = pipeline;
BlitInfo blit;
auto memory = bufferAlloc->alloc(sizeof(SamplerInfo) * iter.second.size() + 4 * sizeof(uint32_t));
auto memory = bufferAlloc->alloc(sizeof(SamplerInfo) * iter.second.size() + 12 * sizeof(uint32_t));
blit.blit = std::make_pair(memory.first, memory.second);
auto buffer = ((MetalRuntimeAllocator::MetalBufferAlloc*)memory.first)->getBuffer();
auto infoP = (SamplerInfo*)((uint8_t*)[buffer contents] + 4 * sizeof(uint32_t) + memory.second);
auto infoP = (SamplerInfo*)((uint8_t*)[buffer contents] + 12 * sizeof(uint32_t) + memory.second);
blit.blit = std::make_pair(memory.first, memory.second);
uint32_t maxSize[3] = {1, 1, 1};
for (int v=0; v<iter.second.size(); ++v) {
auto& slice = des->regions[iter.second[v]];
@ -333,41 +461,42 @@ ErrorCode MetalRaster::onResize(const std::vector<Tensor *> &____inputs, const s
maxSize[1] = ALIMAX(maxSize[1], slice.size[1]);
maxSize[2] = ALIMAX(maxSize[2], slice.size[2]);
}
((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[0] = maxSize[0];
((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[1] = iter.second.size();
auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
uint32_t* shape = (uint32_t*)((uint8_t*)[buffer contents] + memory.second);
int origin_area = 1;
for(int i = 2; i < origin->shape().size(); i++) {
origin_area *= origin->shape()[i];
}
int output_area = 1;
for(int i = 2; i < output->shape().size(); i++) {
output_area *= output->shape()[i];
}
shape[0] = ALIMAX(1, origin->shape()[0]);
shape[1] = ALIMAX(1, origin->shape()[1]);
shape[2] = ALIMAX(1, origin_area);
shape[3] = 1;
shape[4] = ALIMAX(1, output->shape()[0]);
shape[5] = ALIMAX(1, output->shape()[1]);
shape[6] = ALIMAX(1, output_area);
shape[7] = 1;
shape[8] = maxSize[0];
shape[9] = iter.second.size();
auto local = [context computeBestGroupAndLocal:mBlitPipeline[index++] threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])];
blit.global = local.first;
blit.local = local.second;
mTempInputCopy.insert(std::make_pair(iter.first, blit));
}
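    // Sketch of each blit parameter buffer built above (inferred from the allocation
    // size and write offsets in this function, not an authoritative MNN struct):
    // 12 uint32_t of header -- input batch/channel/area/1, output batch/channel/area/1,
    // maxSize[0], the region count, plus two unused padding slots -- followed by one
    // SamplerInfo per region starting at byte offset 12 * sizeof(uint32_t).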
for (auto b : mShapeTemp) {
mtbn->returnConstBuffer(b);
}
mShapeTemp.clear();
for (int i = 0; i < mTempInput.size(); ++i) {
id<MTLBuffer> shape = mtbn->getConstBuffer(0);
mShapeTemp.emplace_back(std::move(shape));
}
if (nullptr != mTempOutput) {
mShapeTemp.emplace_back(mtbn->getConstBuffer(0));
}
return NO_ERROR;
}
void MetalRaster::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, id<MTLComputeCommandEncoder> encoder) {
auto backend = static_cast<MetalBackend *>(this->backend());
auto context = (__bridge MNNMetalContext *)backend->context();
int out_offset = TensorUtils::getDescribe(outputs[0])->extra.offset;
if (nullptr != mTempOutput) {
out_offset = TensorUtils::getDescribe(mTempOutput.get())->extra.offset;
}
if (mNeedZero) {
size_t sizeInBytes;
if (mTempOutput != nullptr) {
sizeInBytes = backend->getTensorSizeInBytes(mTempOutput.get());
} else {
sizeInBytes = backend->getTensorSizeInBytes(outputs[0]);
}
size_t sizeInBytes = backend->getTensorSizeInBytes(outputs[0]);
size_t size = sizeInBytes / (4 * sizeof(int32_t));
auto ptr = (MemsetInfo*)[mZeroCopy contents];
ptr->size[0] = (uint32_t)size;
@ -376,28 +505,33 @@ void MetalRaster::onEncode(const std::vector<Tensor *> &inputs, const std::vecto
[encoder setBuffer: mZeroCopy offset:0 atIndex: 1];
[encoder dispatchThreadgroups:MTLSizeMake(UP_DIV(size, 256), 1, 1) threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
}
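    // The zero-fill dispatch above appears to clear the output with one thread per
    // 16-byte element (hence size = sizeInBytes / (4 * sizeof(int32_t))), launched as
    // UP_DIV(size, 256) threadgroups of 256 threads each.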
int index = 0;
for (auto& iter : mTempInput) {
backend->onCopyBuffer(iter.first, iter.second.get(), encoder, mShapeTemp[index++]);
}
[encoder setComputePipelineState:mBlitPipeline];
bool singlePipeline = false;
int index = 0;
if(mBlitPipeline.size() == 1) {
singlePipeline = true;
[encoder setComputePipelineState:mBlitPipeline[0]];
} else {
MNN_ASSERT(mTempInputCopy.size() == mBlitPipeline.size());
}
for (auto& iter : mTempInputCopy) {
if(!singlePipeline) {
[encoder setComputePipelineState:mBlitPipeline[index++]];
}
MetalBackend::setTensor(iter.first, encoder, 0);
MetalBackend::setTensor(mOutputPtr, encoder, 1);
auto& blit = iter.second;
auto buffer = ((MetalRuntimeAllocator::MetalBufferAlloc*)blit.blit.first)->getBuffer();
[encoder setBuffer: buffer offset:blit.blit.second atIndex: 2];
[encoder dispatchThreadgroups:blit.global threadsPerThreadgroup:blit.local];
}
if (nullptr != mTempOutput) {
backend->onCopyBuffer(mTempOutput.get(), outputs[0], encoder, mShapeTemp[index]);
}
}
class MetalRasterCreator : public MetalBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend, const std::vector<Tensor *>& outputs) const {
return new MetalRaster(backend);
}
};

View File

@ -167,6 +167,65 @@ kernel void conv1x1_g1z4_w4(const device ftype4 *in [[buffer(0)]],
//if (computeSize > 3) {xy_out[3] = activate(ftype4(result3), cst.activation); }
}
kernel void conv1x1_g1z4_m1w4(const device ftype4 *in [[buffer(0)]],
device ftype4 *out [[buffer(1)]],
constant conv1x1_constants& cst [[buffer(2)]],
const device MNN::uchar4x2 *wt [[buffer(3)]],
const device ftype4 *biasTerms [[buffer(4)]],
const device float4 *dequantScale [[buffer(5)]],
uint3 gid[[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
int uz = gid.x * 2 + sgitg;
int rx = gid.y;
auto xy_wt = wt + uz * cst.input_slice;
auto xy_in0 = in + (int)gid.z * cst.input_size + rx + 0;
auto xy_out = out + (int)gid.z * cst.output_size + uz * cst.output_size * cst.batch + rx;
auto biasValue = FLOAT4(biasTerms[uz]);
FLOAT4 result0 = FLOAT4(0);
int block = (cst.input_slice + cst.block_size - 1) / cst.block_size;
for (int bi=0; bi<cst.block_size; bi++) {
FLOAT4 scale = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 0]);
FLOAT4 dequant_bias = FLOAT4(dequantScale[2 * (uz * cst.block_size + bi) + 1]);
int zmin = bi * block;
int zmax = min(zmin + block, cst.input_slice);
for (int z = zmin + tiisg; z < zmax; z+=SIMD_GROUP_WIDTH) {
auto in40 = (FLOAT4)*(xy_in0 + z * cst.input_size * cst.batch);
MNN::uchar4x2 w_int4 = xy_wt[z];
FLOAT4x4 w_dequant;
for (int i = 0; i < 4; ++i) {
FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8);
FLOAT4 res = w4 * scale[i] + dequant_bias[i];
w_dequant[i] = res;
}
result0 += FLOAT4(in40 * w_dequant);
// FLOAT4x4 w_dequant;
// for (int i = 0; i < 4; ++i) {
// FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8);
// FLOAT4 res = w4 * scale[i] + dequant_bias[i];
// w_dequant[i] = w4;
// }
//
// FLOAT4 temp = FLOAT4(in40 * w_dequant);
// result0 += temp * scale + (in40.x + in40.y + in40.z + in40.w) * dequant_bias;
}
}
FLOAT4 res;
res.x = simd_sum(result0.x);
res.y = simd_sum(result0.y);
res.z = simd_sum(result0.z);
res.w = simd_sum(result0.w);
    /* only lane 0 of the simdgroup writes the reduced result */
if (tiisg == 0) {
xy_out[0] = activate(ftype4(res + biasValue), cst.activation);
}
}
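// Rough scalar sketch of the 4-bit dequantization used in conv1x1_g1z4_m1w4 above
// (assuming, as in the unpacking code, that the high nibble is the first weight of
// each packed pair):
//     float w_hi = (float)(packed >> 4) - 8.0f;
//     float w_lo = (float)(packed & 15) - 8.0f;
//     w_hi = w_hi * scale + dequant_bias;
//     w_lo = w_lo * scale + dequant_bias;
// Each lane accumulates a partial dot product over its strided share of the input
// slices; simd_sum reduces the partials across the simdgroup and lane 0 writes the
// bias-added, activated result.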
kernel void conv1x1_g1z8(const device ftype4 *in [[buffer(0)]],
device ftype4 *out [[buffer(1)]],
constant conv1x1_constants& cst [[buffer(2)]],

View File

@ -5,6 +5,7 @@ using namespace metal;
// Macro
//
#define SIMD_GROUP_WIDTH 32 // the SIMD group size is 32
#define UP_DIV(x, y) ( ((x) + (y) - 1) / (y) )
#define ROUND_UP(x, y) ( ((x) + (y) - 1) / (y) * (y) )
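// Quick sanity check of these macros (integer arguments assumed):
// UP_DIV(10, 4) == 3 and ROUND_UP(10, 4) == 12, i.e. UP_DIV rounds the quotient up
// and ROUND_UP rounds x up to the next multiple of y.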

View File

@ -147,3 +147,46 @@ kernel void layernorm_x4_rms(const device ftype4 *in [[buffer(0)]],
out_data[gid.x] = (ftype4)(norm);
}
}
kernel void layernorm_m1x4_rms(const device ftype4 *in [[buffer(0)]],
device ftype4 *out [[buffer(1)]],
constant layernorm_constants& cst [[buffer(2)]],
const device float4 *gamma [[buffer(3)]],
const device float4 *beta [[buffer(4)]],
uint gid [[threadgroup_position_in_grid]],
uint tiisg[[thread_index_in_simdgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
int total_idx = (gid * 4 + sgitg);
int in_idx = total_idx % (cst.inside/4);
int out_idx = total_idx / (cst.inside/4);
auto in_data = in + out_idx * cst.inside/4;
auto out_data = out + out_idx * cst.inside/4;
float square_sum = 0.0f;
for(int i = tiisg; i < cst.inside/4; i+=SIMD_GROUP_WIDTH) {
ftype4 data = in_data[i];
float dis = data.x;
square_sum += dis * dis;
dis = data.y;
square_sum += dis * dis;
dis = data.z;
square_sum += dis * dis;
dis = data.w;
square_sum += dis * dis;
}
square_sum = simd_sum(square_sum);
if(tiisg == 0) {
float var = 1.0 / sqrt(square_sum / cst.inside + cst.eps);
float4 norm = var * ((float4)in_data[in_idx]);
if(cst.has_gamma_beta) {
out_data[in_idx] = (ftype4)(norm * gamma[in_idx] + beta[in_idx]);
} else {
out_data[in_idx] = (ftype4)(norm);
}
}
}
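// In scalar form layernorm_m1x4_rms computes an RMS norm over the "inside" dimension
// (a sketch of the math matching the reduction above):
//     rms = sqrt(sum_i(x_i * x_i) / inside + eps)
//     y_i = (x_i / rms) * gamma_i + beta_i   // gamma/beta only if has_gamma_beta
// Each simdgroup recomputes the row's sum of squares (one strided slice per lane),
// simd_sum combines the partials, and lane 0 writes a single float4 of output.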

View File

@ -111,7 +111,7 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const
}
#endif
if (deviceName.find("QUALCOMM Adreno") != std::string::npos) {
if (deviceName.find("QUALCOMM Adreno") != std::string::npos || deviceName.find("Qualcomm") != std::string::npos) {
mGpuType = ADRENO;
        // if the device is QUALCOMM's and the version is 2.0, set specially optimized params

View File

@ -7,7 +7,8 @@
//
#include "backend/opencl/core/runtime/OpenCLWrapper.hpp"
#ifdef WIN32
#ifdef _WIN32
#include <windows.h>
#include <libloaderapi.h>
#else
#include <dlfcn.h>
@ -94,7 +95,7 @@ bool OpenCLSymbols::LoadOpenCLLibrary() {
bool OpenCLSymbols::UnLoadOpenCLLibrary() {
if (handle_ != nullptr) {
#if defined(WIN32)
#if defined(_WIN32)
if (FreeLibrary(handle_) == 0) {
#else
if (dlclose(handle_) != 0) {
@ -129,7 +130,7 @@ bool OpenCLSymbols::isGlError() {
bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) {
#if defined(WIN32)
#if defined(_WIN32)
handle_ = LoadLibraryA(library_path.c_str());
if (handle_ == nullptr) {
return false;

Some files were not shown because too many files have changed in this diff