Merge pull request #2580 from alibaba/feature/sync

[MNN:Sync] Sync Internal 2.7.0
jxt1234 2023-09-04 16:28:05 +08:00 committed by GitHub
commit 9e3cc72952
138 changed files with 4189 additions and 2420 deletions

View File

@@ -715,9 +715,7 @@ elseif(CMAKE_SYSTEM_NAME MATCHES "^Android")
 else()
 endif()
 if (NOT MNN_BUILD_SHARED_LIBS)
-    if(APPLE)
-        set(MNN_DEPS -Wl,-all_load ${MNN_DEPS} -Wl,-noall_load)
-    elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
         # Static-link will not replace thread-related weak symbol in glibc with strong symbol
         # in pthread library, so we need use --whole-archive to pthread
         # https://stackoverflow.com/questions/35116327/when-g-static-link-pthread-cause-segmentation-fault-why
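Note: the linked issue is easy to reproduce. A statically linked binary that creates a `std::thread` can end up with glibc's weak pthread stubs instead of the real implementations and crash at runtime. A minimal sketch (the build line illustrates the fix; it is not MNN's exact link command):

```cpp
// repro.cpp: may segfault at runtime when built with `g++ -static` if the
// pthread archive is not force-linked, because glibc's weak thread-related
// symbols are never replaced by the strong ones in libpthread.
#include <thread>

int main() {
    std::thread t([] {});
    t.join();
    return 0;
}
// Sketch of the fix the CMake change above applies:
//   g++ -static repro.cpp -Wl,--whole-archive -lpthread -Wl,--no-whole-archive
```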

View File

@@ -473,15 +473,22 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
             break;
         case UnaryOpOperation_LOG1P:
             if(mVectorize) {
-                ss << inpName << ".x=(log(1.0+" << operand << ".x));\n";
-                ss << inpName << ".y=(log(1.0+" << operand << ".y))";
-                if(mPrecision != BackendConfig::Precision_Low) {
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << ".x=(half)(log(1.0+(float)" << operand << ".x));\n";
+                    ss << inpName << ".y=(half)(log(1.0+(float)" << operand << ".y))";
+                } else {
+                    ss << inpName << ".x=(log(1.0+" << operand << ".x));\n";
+                    ss << inpName << ".y=(log(1.0+" << operand << ".y))";
                     ss << ";\n";
                     ss << inpName << ".z=(log(1.0+" << operand << ".z));\n";
                     ss << inpName << ".w=(log(1.0+" << operand << ".w))";
                 }
             } else {
-                ss << inpName << "=(log(1.0+" << operand << "))";
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << "=(log((half)1.0+" << operand << "))";
+                } else {
+                    ss << inpName << "=(log(1.0+" << operand << "))";
+                }
             }
             break;
         case UnaryOpOperation_FLOOR:
@@ -512,15 +519,22 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
             break;
         case UnaryOpOperation_SIGMOID:
             if(mVectorize) {
-                ss << inpName << ".x=(1.0/(1.0+exp(-" << operand << ".x)));\n";
-                ss << inpName << ".y=(1.0/(1.0+exp(-" << operand << ".y)))";
-                if(mPrecision != BackendConfig::Precision_Low) {
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << ".x=(half)(1.0/(1.0+(float)exp(-" << operand << ".x)));\n";
+                    ss << inpName << ".y=(half)(1.0/(1.0+(float)exp(-" << operand << ".y)))";
+                } else {
+                    ss << inpName << ".x=(1.0/(1.0+exp(-" << operand << ".x)));\n";
+                    ss << inpName << ".y=(1.0/(1.0+exp(-" << operand << ".y)))";
                     ss << ";\n";
                     ss << inpName << ".z=(1.0/(1.0+exp(-" << operand << ".z)));\n";
                     ss << inpName << ".w=(1.0/(1.0+exp(-" << operand << ".w)))";
                 }
             } else {
-                ss << inpName << "=(1.0/(1.0+exp(-" << operand << ")))";
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << "=(half)(1.0/(1.0+(float)exp(-" << operand << ")))";
+                } else {
+                    ss << inpName << "=(1.0/(1.0+exp(-" << operand << ")))";
+                }
             }
             break;
         case UnaryOpOperation_TANH:
@@ -538,15 +552,22 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
             break;
         case UnaryOpOperation_RECIPROCAL:
             if(mVectorize) {
-                ss << inpName << ".x=(1.0/" << operand << ".x);\n";
-                ss << inpName << ".y=(1.0/" << operand << ".y)";
-                if(mPrecision != BackendConfig::Precision_Low) {
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << ".x=(half)(1.0/(float)" << operand << ".x);\n";
+                    ss << inpName << ".y=(half)(1.0/(float)" << operand << ".y)";
+                } else {
+                    ss << inpName << ".x=(1.0/" << operand << ".x);\n";
+                    ss << inpName << ".y=(1.0/" << operand << ".y)";
                     ss << ";\n";
                     ss << inpName << ".z=(1.0/" << operand << ".z);\n";
                     ss << inpName << ".w=(1.0/" << operand << ".w)";
                 }
             } else {
-                ss << inpName << "=(1.0/" << operand << ")";
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << "=(half)(1.0/(float)" << operand << ")";
+                } else {
+                    ss << inpName << "=(1.0/" << operand << ")";
+                }
             }
             break;
         case UnaryOpOperation_LOG:
@@ -564,17 +585,44 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
             break;
         case UnaryOpOperation_GELU:
             if(mVectorize) {
-                ss << inpName << ".x=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".x*" << operand << ".x*" << operand << ".x+" << operand + ".x))) * " << operand << ".x* 0.5f);\n";
-                ss << inpName << ".y=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".y*" << operand << ".y*" << operand << ".y+" << operand + ".y))) * " << operand << ".y* 0.5f)";
-                if(mPrecision != BackendConfig::Precision_Low) {
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << ".x=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << ".x*(float)" << operand << ".x*(float)" << operand << ".x+(float)" << operand + ".x))) * (float)" << operand << ".x* 0.5f);\n";
+                    ss << inpName << ".y=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << ".y*(float)" << operand << ".y*(float)" << operand << ".y+(float)" << operand + ".y))) * (float)" << operand << ".y* 0.5f)";
+                } else {
+                    ss << inpName << ".x=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".x*" << operand << ".x*" << operand << ".x+" << operand + ".x))) * " << operand << ".x* 0.5f);\n";
+                    ss << inpName << ".y=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".y*" << operand << ".y*" << operand << ".y+" << operand + ".y))) * " << operand << ".y* 0.5f)";
                     ss << ";\n";
-                    ss << inpName << ".z=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".z*" << operand << ".z*" << operand << ".z+" << operand + ".z))) * " << operand << ".z* 0.5f);\n";
-                    ss << inpName << ".w=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".w*" << operand << ".w*" << operand << ".w+" << operand + ".w))) * " << operand << ".w* 0.5f)";
+                    ss << inpName << ".z=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".z*" << operand << ".z*" << operand << ".z+" << operand + ".z))) * " << operand << ".z* 0.5f);\n";
+                    ss << inpName << ".w=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".w*" << operand << ".w*" << operand << ".w+" << operand + ".w))) * " << operand << ".w* 0.5f)";
                 }
             } else {
-                ss << inpName << "=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << "*" << operand << "*" << operand << "+" << operand + "))) * " << operand << "* 0.5f)";
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << "=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << "*(float)" << operand << "*(float)" << operand << "+(float)" << operand + "))) * (float)" << operand << "* 0.5f)";
+                } else {
+                    ss << inpName << "=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << "*" << operand << "*" << operand << "+" << operand + "))) * " << operand << "* 0.5f)";
+                }
             }
             break;
+        case UnaryOpOperation_GELU_STANDARD:
+            if(mVectorize) {
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << ".x=(half)((erf((float)" << operand << ".x*0.7071067932881648f)+1.f)*(float)" << operand << ".x*0.5f);\n";
+                    ss << inpName << ".y=(half)((erf((float)" << operand << ".y*0.7071067932881648f)+1.f)*(float)" << operand << ".y*0.5f)";
+                } else {
+                    ss << inpName << ".x=((erf(" << operand << ".x*0.7071067932881648f)+1.f)*" << operand << ".x*0.5f);\n";
+                    ss << inpName << ".y=((erf(" << operand << ".y*0.7071067932881648f)+1.f)*" << operand << ".y*0.5f)";
+                    ss << ";\n";
+                    ss << inpName << ".z=((erf(" << operand << ".z*0.7071067932881648f)+1.f)*" << operand << ".z*0.5f);\n";
+                    ss << inpName << ".w=((erf(" << operand << ".w*0.7071067932881648f)+1.f)*" << operand << ".w*0.5f)";
+                }
+            } else {
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << "=(half)((erf((float)" << operand << "*0.7071067932881648f)+1.f)*(float)" << operand << "*0.5f)";
+                } else {
+                    ss << inpName << "=((erf(" << operand << "*0.7071067932881648f)+1.f)*" << operand << "*0.5f)";
+                }
            }
+            break;
         default:
             MNN_PRINT("Error: CUDA CodeGen not support Unary type:%d\n", type);
             break;
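Note: all the `Precision_Low` branches above follow one pattern: evaluate in float, narrow the result back to half; and the vectorized low-precision path only writes the `.x`/`.y` lanes, presumably because the half path vectorizes two-wide rather than four-wide. The constants are 0.79788458 ≈ sqrt(2/π) for the tanh GELU approximation and 0.7071067932881648 ≈ 1/sqrt(2) for the erf-based standard GELU. A standalone sketch of the string-building pattern, with hypothetical names (`tmp`, `in`) and only a subset of the real branches:

```cpp
#include <iostream>
#include <sstream>
#include <string>

// Mimics the stringstream codegen above for SIGMOID: compute in float,
// narrow to half, and emit only two lanes in the vectorized half path.
std::string emitSigmoid(const std::string& inpName, const std::string& operand,
                        bool vectorize, bool lowPrecision) {
    std::stringstream ss;
    if (vectorize && lowPrecision) {
        ss << inpName << ".x=(half)(1.0/(1.0+(float)exp(-" << operand << ".x)));\n";
        ss << inpName << ".y=(half)(1.0/(1.0+(float)exp(-" << operand << ".y)))";
    } else if (lowPrecision) {
        ss << inpName << "=(half)(1.0/(1.0+(float)exp(-" << operand << ")))";
    } else {
        ss << inpName << "=(1.0/(1.0+exp(-" << operand << ")))";
    }
    return ss.str();
}

int main() {
    std::cout << emitSigmoid("tmp", "in", true, true) << "\n";
}
```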

View File

@@ -104,12 +104,9 @@ int main(int argc, char* argv[]) {
     for (int i = 0; i < 3; ++i) {
         outputs = module->onForward(inputs);
     }
-    globalExecutor->resetProfile();
     outputs = module->onForward(inputs);
-    globalExecutor->dumpProfile();
     {
         MNN::Timer autoTime;
-        globalExecutor->resetProfile();
         for (int i = 0; i < benchTime; ++i) {
             MNN::AutoTime _t(0, "Once time");
             // std::cout << i << std::endl;

View File

@@ -42,9 +42,7 @@ int main(int argc, const char* argv[]) {
     for (int i = 0; i < 2; ++i) {
         {
             AUTOTIME;
-            Executor::getGlobalExecutor()->resetProfile();
             outputs = model->onForward({first, second});
-            Executor::getGlobalExecutor()->dumpProfile();
         }
         std::ostringstream fileNameOs;
         std::ostringstream dimInfo;

View File

@@ -10,7 +10,7 @@
 - warm_up_count: number of warm-up runs
 - forwardtype: optional, defaults to 0 (CPU); valid values: 0->CPU, 1->Metal, 3->OpenCL, 6->OpenGL, 7->Vulkan
 - numberThread: optional, defaults to 4; the CPU thread count, or the GPU run mode
-- precision: optional, defaults to 2 (precision_low)
+- precision: optional, defaults to 2; valid values: 0 (Normal), 1 (High), 2 (Low_FP16), 3 (Low_BF16)
 - weightSparsity: optional, defaults to 0.0; sparse computation is enabled when weightSparsity > 0.5 and the backend supports it
 - weightSparseBlockNumber: optional, defaults to 1; only effective when weightSparsity > 0.5; the block size for sparse computation, where larger blocks help sparse acceleration more; typical values are 1, 4, 8, 16
 - testQuantizedModel: optional, defaults to 0 (test only the float model); set to 1 to also test the quantized model after the float model

View File

@@ -68,7 +68,7 @@ Avg= 5.570600 ms, OpSum = 7.059200 ms min= 3.863000 ms, max= 11.596001 ms
 ### Parameters
 `./ModuleBasic.out model dir [runMask forwardType runLoops numberThread precision_memory cacheFile]`
 - `model:str` path to the model file
-- `dir:str` directory with the input/output info files; it can be generated with scripts such as fastTestOnnx.py / fastTestTf.py / fastTestTflite.py, see the correctness-check section of the model conversion docs.
+- `dir:str` directory with the input/output info files; it can be generated with scripts such as testMNNFromTf.py / testMNNFromOnnx.py / testMNNFromTflite.py, see the correctness-check section of the model conversion docs.
 - `runMask:int` defaults to 0; a bitmask of feature switches. To enable several features, add the corresponding mask values (cases that cannot be combined are noted separately); see the runMask breakdown below
 - `forwardType:int` the compute device for inference; valid values: 0 (CPU), 1 (Metal), 2 (CUDA), 3 (OpenCL), 6 (OpenGL), 7 (Vulkan), 9 (TensorRT); optional, defaults to `0`
 - `runLoops:int` the number of loops for performance testing; optional, defaults to `0` (no performance test)
@@ -456,49 +456,3 @@ Matrix:
 0.0000000	0.0000000	1.0000000
 ```
-## winogradGenerateCL.out
-### Description
-Generates the winograd transform matrices and emits the OpenCL transform code
-### Parameters
-`./winogradExample.out unit kernelSize`
-- `unit:int` tile size
-- `kernelSize:int` convolution kernel size
-### Example
-```bash
-$ ./winogradGenerateCL.out 2 2
-A
-1.0000000	0.0000000
-1.0000000	0.5000000
-0.0000000	1.0000000
-B
-1.0000000	0.0000000	-0.0000000
--2.0000000	2.0000000	-0.5000000
-0.0000000	0.0000000	1.0000000
-G
-1.0000000	0.0000000
-1.0000000	0.5000000
-0.0000000	1.0000000
-Generate winogradTransformSource2_2_0.5.cl
-Generate winogradTransformDest2_2_0.5.cl
-```
-## winogradGenerateGLSL.out
-### Description
-Generates the winograd transform matrices and emits the OpenGL compute shader code
-### Parameters
-`./winogradExample.out unit kernelSize`
-- `unit:int` tile size
-- `kernelSize:int` convolution kernel size
-### Example
-```bash
-$ ./winogradGenerateGLSL.out 1 2
-A
-1.0000000
-B
-1.0000000	-0.0000000
-0.0000000	1.0000000
-G
-1.0000000
-Generate winogradTransformSource1_2_0.5.comp
-Generate winogradTransformDest1_2_0.5.comp
-```

View File

@@ -13,11 +13,7 @@ if(MNN_CUDA_PROFILE)
 endif()
 file(GLOB_RECURSE MNN_EXPR_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.*")
-option(MNN_EXPR_ENABLE_PROFILER "Support profile Expr's op cost" OFF)
 option(MNN_EXPR_SHAPE_EAGER "Force compute Expr's shape directly cost" OFF)
-IF (MNN_EXPR_ENABLE_PROFILER)
-    add_definitions(-DMNN_EXPR_ENABLE_PROFILER)
-ENDIF()
 IF (MNN_EXPR_SHAPE_EAGER)
     add_definitions(-DMNN_EXPR_SHAPE_EAGER)
 ENDIF()

View File

@@ -21,55 +21,9 @@
 #ifdef MNN_EXPR_ENABLE_PROFILER
 #define MNN_EXPRESS_ERROR_REPORT
 #endif
-#define MNN_EXPRESS_OPEN_MEMORY_REUSE
 namespace MNN {
 namespace Express {
-#ifdef MNN_EXPR_ENABLE_PROFILER
-class Executor::Profiler {
-public:
-    void reset();
-    void dump() const;
-    void add(const std::string& opType, float timeInMs);
-    void addFlops(const std::string& opType, float flops);
-private:
-    std::map<std::string, float> mTimes;
-    std::map<std::string, float> mFlops;
-};
-void Executor::Profiler::reset() {
-    mTimes.clear();
-    mFlops.clear();
-}
-void Executor::Profiler::dump() const {
-    float sumValue = 0.0f;
-    for (auto iter : mTimes) {
-        MNN_PRINT("%s: %f ms\n", iter.first.c_str(), iter.second);
-        sumValue += iter.second;
-    }
-    MNN_PRINT("Total: %f ms\n", sumValue);
-    sumValue = 0.0f;
-    for (auto iter : mFlops) {
-        MNN_PRINT("%s: %f \n", iter.first.c_str(), iter.second);
-        sumValue += iter.second;
-    }
-    MNN_PRINT("Total flops: %f M\n", sumValue);
-}
-void Executor::Profiler::add(const std::string& opType, float timeInMs) {
-    auto iter = mTimes.find(opType);
-    if (iter == mTimes.end()) {
-        mTimes[opType] = timeInMs;
-        return;
-    }
-    iter->second += timeInMs;
-}
-void Executor::Profiler::addFlops(const std::string& opType, float flops) {
-    auto iter = mFlops.find(opType);
-    if (iter == mFlops.end()) {
-        mFlops[opType] = flops;
-        return;
-    }
-    iter->second += flops;
-}
-#endif
 void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& config, int numberThread) {
     std::lock_guard<std::mutex> _l(mMutex);
@@ -648,36 +602,12 @@ void Executor::makeCache(const std::vector<EXPRP>& expr, bool forceCPU) {
     //FUNC_PRINT(mCaches.size());
     _makeCache(expr, forceCPU);
 }
-void Executor::addOpCostTime(int op, float costTime) {
-#ifdef MNN_EXPR_ENABLE_PROFILER
-    auto opType = MNN::EnumNameOpType((OpType)op);
-    if (nullptr == opType) {
-        return;
-    }
-    mProfiler->add(opType, costTime);
-#endif
-}
-void Executor::addOpCostTime(const std::string& type, float costTime) {
-#ifdef MNN_EXPR_ENABLE_PROFILER
-    mProfiler->add(type, costTime);
-#endif
-}
-void Executor::addOpFlops(const std::string& type, float flops) {
-#ifdef MNN_EXPR_ENABLE_PROFILER
-    mProfiler->addFlops(type, flops);
-#endif
-}
 void Executor::resetProfile() {
-#ifdef MNN_EXPR_ENABLE_PROFILER
-    mProfiler->reset();
-#endif
+    // Deprecated
 }
 void Executor::dumpProfile() {
-#ifdef MNN_EXPR_ENABLE_PROFILER
-    mProfiler->dump();
-#endif
+    // Deprecated
 }
 bool Executor::registerSubGraph(const std::string& submoduleName, VARPS outputs, VARPS inputs) {

View File

@@ -15,6 +15,7 @@
 #include "NMSModule.hpp"
 #include "Utils.hpp"
 #include "core/Backend.hpp"
+#include "core/WrapExecution.hpp"
 #include "utils/InitNet.hpp"
 #include "RuntimeAttr.hpp"
 #include "geometry/GeometryComputer.hpp"
@@ -490,7 +491,15 @@ static std::vector<SubModuleInfo> _createSubModuleInfo(std::shared_ptr<BufferSto
     return submodule;
 }
-static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, const SubModuleInfo& info, const std::map<std::string, SubGraph>& subs, std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Module::Config& config, std::shared_ptr<Schedule::ScheduleInfo> sharedConst, bool needGeometry) {
+struct ModuleRuntimeConfig {
+    bool needGeometry;
+    RuntimeInfo rt;
+    Backend::Info compute;
+    const BackendConfig* userConfig = nullptr;
+    Session::ModeGroup modes;
+};
+static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, const SubModuleInfo& info, const std::map<std::string, SubGraph>& subs, std::shared_ptr<Schedule::ScheduleInfo> sharedConst, const Module::Config& config, const ModuleRuntimeConfig& runtimeConfig) {
     auto net = flatbuffers::GetRoot<Net>(bufferStorage->buffer());
     if (1 == info.opList.size()) {
         auto op = net->oplists()->GetAs<Op>(info.opList[0]);
@@ -506,9 +515,8 @@ static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, co
         // MNN_ASSERT(false);
     }
     Schedule::ScheduleInfo scheduleInfo;
-    RuntimeInfo rt;
-    Session::ModeGroup modes;
     scheduleInfo.defaultBackend = sharedConst->defaultBackend;
+    scheduleInfo.constReplaceBackend = sharedConst->constReplaceBackend;
     scheduleInfo.allTensors = sharedConst->allTensors;
     initTensors(scheduleInfo.allTensors, net);
     std::vector<Schedule::OpCacheInfo> oplists;
@@ -522,34 +530,19 @@ static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, co
     if (breakIndex >= 0) {
         scheduleInfo.needInputContentForShape = true;
     }
-    Backend::Info compute;
-    const BackendConfig* userConfig = nullptr;
-    if (nullptr == rtMgr) {
-        rt = Executor::getRuntime();
-        auto glo = ExecutorScope::Current();
-        compute.type = glo->getAttr()->firstType.first;
-        compute.numThread = glo->getAttr()->firstType.second;
-    } else {
-        modes = rtMgr->getInside()->modes;
-        rt = rtMgr->getInside()->mRuntime;
-        userConfig = &rtMgr->getInside()->mConfig;
-        compute.type = rt.first.begin()->first;
-        compute.numThread = 1;
-        // set external file info
-        if (!rtMgr->getInside()->mExternalFile.empty()) {
-            rt.first.begin()->second->setExternalFile(rtMgr->getInside()->mExternalFile);
-            rt.second->setExternalFile(rtMgr->getInside()->mExternalFile);
-        }
-    }
+    auto rt = runtimeConfig.rt;
+    auto modes = runtimeConfig.modes;
     Schedule::BackendCache bnCache;
-    if (nullptr != userConfig) {
-        bnCache.config = *userConfig;
+    Backend::Info compute = runtimeConfig.compute;
+    if (nullptr != runtimeConfig.userConfig) {
+        bnCache.config = *runtimeConfig.userConfig;
         compute.user = &bnCache.config;
     } else {
         compute.user = nullptr;
     }
     bnCache.info = std::move(compute);
-    bnCache.needComputeGeometry = needGeometry;
+    bnCache.needComputeGeometry = runtimeConfig.needGeometry;
     scheduleInfo.pipelineInfo.emplace_back(std::make_pair(std::move(bnCache), std::move(oplists)));
     std::vector<std::shared_ptr<BufferStorage>> buffers = {bufferStorage};
@@ -588,13 +581,38 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
     // Extra Const Tensors
     sharedConst.reset(new Schedule::ScheduleInfo);
     auto curExe = ExecutorScope::Current();
+    bool permitCodeGen = false;
     if (rtMgr && !rtMgr->getInside()->mExternalFile.empty()) {
         curExe->getRuntime().second->setExternalFile(rtMgr->getInside()->mExternalFile);
+        permitCodeGen = rtMgr->getInside()->modes.codegenMode == Interpreter::Session_Codegen_Enable;
     }
     std::shared_ptr<Backend> defaultBackend = curExe->getAttr()->constantBackend;
     std::vector<std::shared_ptr<Tensor>> allTensors;
     sharedConst->allTensors.resize(net->tensorName()->size());
     sharedConst->defaultBackend = defaultBackend;
+    std::shared_ptr<ModuleRuntimeConfig> modRuntimeCfgPtr(new ModuleRuntimeConfig);
+    ModuleRuntimeConfig& modRuntime = *modRuntimeCfgPtr;
+    modRuntime.needGeometry = needGeometry;
+    if (nullptr == rtMgr) {
+        modRuntime.rt = Executor::getRuntime();
+        auto glo = ExecutorScope::Current();
+        modRuntime.compute.type = glo->getAttr()->firstType.first;
+        modRuntime.compute.numThread = glo->getAttr()->firstType.second;
+    } else {
+        modRuntime.modes = rtMgr->getInside()->modes;
+        modRuntime.rt = rtMgr->getInside()->mRuntime;
+        modRuntime.userConfig = &rtMgr->getInside()->mConfig;
+        modRuntime.compute.type = modRuntime.rt.first.begin()->first;
+        modRuntime.compute.numThread = 1;
+        // set external file info
+        if (!rtMgr->getInside()->mExternalFile.empty()) {
+            modRuntime.rt.first.begin()->second->setExternalFile(rtMgr->getInside()->mExternalFile);
+            modRuntime.rt.second->setExternalFile(rtMgr->getInside()->mExternalFile);
+        }
+    }
+    auto& rt = modRuntime.rt;
+    auto firstRt = rt.first[modRuntime.compute.type];
+    sharedConst->constReplaceBackend.reset(firstRt->onCreate(modRuntime.userConfig));
     ErrorCode code = NO_ERROR;
     std::set<int> noneedComputeIndexes;
     initConstTensors(sharedConst->allTensors, net, defaultBackend.get(), code);
@@ -646,7 +664,7 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
     auto subModulesInfo = _createSubModuleInfo(bufferStorage, inputIndexes, outputIndexes, noneedComputeIndexes, sharedConst);
     std::vector<std::shared_ptr<Module>> subModules(subModulesInfo.size());
     for (int i=0; i<subModulesInfo.size(); ++i) {
-        subModules[i].reset(_createSubModule(bufferStorage, subModulesInfo[i], subGraphMap, rtMgr, *config, sharedConst, needGeometry));
+        subModules[i].reset(_createSubModule(bufferStorage, subModulesInfo[i], subGraphMap, sharedConst, *config, modRuntime));
     }
     auto result = new PipelineModule;
     result->mInputSize = inputs.size();
@@ -702,8 +720,45 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
     }
     result->registerModel(subModules);
     result->mSharedConst = sharedConst;
+    if (!permitCodeGen) {
+        // Prereplace const tensor
+        auto curBackend = sharedConst->constReplaceBackend.get();
+        if (sharedConst->constReplaceBackend->type() != sharedConst->defaultBackend->type()) {
+            for (auto& t : sharedConst->allTensors) {
+                if (nullptr == t.get()) {
+                    continue;
+                }
+                auto des = TensorUtils::getDescribe(t.get());
+                if (des->isMutable) {
+                    continue;
+                }
+                if (!WrapExecution::needWrap(t.get(), curBackend)) {
+                    continue;
+                }
+                if (des->stageMask & Tensor::InsideDescribe::GEOMETRY_STAGE) {
+                    continue;
+                }
+                if (des->stageMask & Tensor::InsideDescribe::CONVERTED_STAGE) {
+                    continue;
+                }
+                std::shared_ptr<Tensor> wrapTensor = WrapExecution::makeCopyTensor(t.get(), curBackend);
+                auto outDes = TensorUtils::getDescribe(wrapTensor.get());
+                outDes->usage = des->usage;
+                auto tempRes = curBackend->onAcquireBuffer(wrapTensor.get(), Backend::STATIC);
+                if (!tempRes) {
+                    continue;
+                }
+                outDes->setBackend(curBackend);
+                curBackend->onCopyBuffer(t.get(), wrapTensor.get());
+                outDes->stageMask |= Tensor::InsideDescribe::CONVERTED_STAGE;
+                TensorUtils::getDescribeOrigin(t.get())->mContent = TensorUtils::getDescribeOrigin(wrapTensor.get())->mContent;
+                t->buffer().host = wrapTensor->buffer().host;
+                t->buffer().device = wrapTensor->buffer().device;
+                t->buffer().dim = TensorUtils::getDescribe(wrapTensor.get())->dims;
+            }
+        }
+    }
     return result;
 }
 Module* PipelineModule::clone(CloneContext* ctx) const {

View File

@@ -430,6 +430,8 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
             outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.second;
         } else if (backend == mResource->mSharedConst->defaultBackend.get()) {
             outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = mResource->mSharedConst->defaultBackend;
+        } else if (backend == mResource->mSharedConst->constReplaceBackend.get()) {
+            outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = mResource->mSharedConst->constReplaceBackend;
         }
     }

View File

@@ -195,6 +195,7 @@ public:
         MAX_TUNING_NUMBER = 0,
         // Strictly check model file or not, default 1. if set 0, will not check model file valid/invalid
         STRICT_CHECK_MODEL = 1,
+        MEM_ALLOCATOR_TYPE = 2,
     };
     /**
     * @brief The API shoud be called before create session.
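Note: the new hint pairs with the deferred allocator added to the CPU backend below. A usage sketch, assuming MNN's existing `Interpreter::setSessionHint` API; the value-to-allocator mapping (1 selecting `Runtime::Allocator_Defer`) is an assumption based on the backend changes in this commit:

```cpp
#include <MNN/Interpreter.hpp>

// Hypothetical: opt a session into the deferred memory allocator before
// createSession, via the MEM_ALLOCATOR_TYPE hint added in this commit.
void useDeferAllocator(MNN::Interpreter* net) {
    net->setSessionHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 1);
}
```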

View File

@@ -68,7 +68,7 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
 #define STR_IMP(x) #x
 #define STR(x) STR_IMP(x)
 #define MNN_VERSION_MAJOR 2
-#define MNN_VERSION_MINOR 6
-#define MNN_VERSION_PATCH 3
+#define MNN_VERSION_MINOR 7
+#define MNN_VERSION_PATCH 0
 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
 #endif /* MNNDefine_h */

View File

@@ -68,11 +68,6 @@ public:
     struct SubGraph;
     bool registerSubGraph(const std::string& submoduleName, VARPS outputs, VARPS inputs);
     std::shared_ptr<SubGraph> findSubGraph(const std::string& submoduleName);
-    /**Internal Usage Begin*/
-    void addOpCostTime(int op, float costTime);
-    void addOpCostTime(const std::string& type, float costTime);
-    void addOpFlops(const std::string& type, float flops);
-    /**Internal Usage End*/
     static RuntimeInfo getRuntime();
     void setCallBack(TensorCallBackWithInfo&& before, TensorCallBackWithInfo&& after);
     const DebugTools* getDebugTools() const {

View File

@@ -50,7 +50,7 @@ ErrorCode CastWrapExecution::onExecute(const std::vector<Tensor*>& inputs, const
 }
 CPURuntime::CPURuntime(const Backend::Info& info) {
-    mStaticAllocator.reset(new BufferAllocator(BufferAllocator::Allocator::createDefault()));
+    mStaticAllocator.reset(new EagerBufferAllocator(BufferAllocator::Allocator::createDefault()));
     mThreadNumber = info.numThread;
     mThreadNumber = std::max(1, mThreadNumber);
     mThreadNumber = std::min(mThreadNumber, MAX_THREAD_NUMBER);
@@ -64,6 +64,7 @@ CPURuntime::CPURuntime(const Backend::Info& info) {
         mMemory = info.user->memory;
         mFlags = info.user->flags;
     }
+    mAllocator = info.allocator;
 #ifdef _OPENMP
     switch (mPower) {
@@ -218,7 +219,11 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
     mMemory = memory;
     mRuntime = const_cast<CPURuntime*>(runtime);
     std::shared_ptr<BufferAllocator::Allocator> defaultAlloc(BufferAllocator::Allocator::createRecurse(runtime->mStaticAllocator.get()));
-    mDynamicAllocator.reset(new BufferAllocator(defaultAlloc));
+    if (mRuntime->getAllocatorType() == Runtime::Allocator_Defer) {
+        mDynamicAllocator.reset(new DeferBufferAllocator(defaultAlloc));
+    } else {
+        mDynamicAllocator.reset(new EagerBufferAllocator(defaultAlloc));
+    }
     mStaticAllocator = runtime->mStaticAllocator;
     mPrecisionMode = precision;
     mCoreFunctions = MNNGetCoreFunctions();
@@ -238,24 +243,14 @@ void CPUBackend::onExecuteEnd() const {
     mRuntime->onConcurrencyEnd();
 }
-class CPUMemObj : public Backend::MemObj {
-public:
-    CPUMemObj(BufferAllocator* allocator, std::pair<void*, int> points, int size) {
-        mPoint = std::move(points);
-        mAllocator = allocator;
-        mSize = size;
-    }
-    virtual ~ CPUMemObj() {
-        mAllocator->free(mPoint);
-    }
-    inline int getSize() const {
-        return mSize;
-    }
-private:
-    BufferAllocator* mAllocator;
-    std::pair<void*, int> mPoint;
-    int mSize;
-};
+void CPUBackend::onResizeBegin() {
+    mDynamicAllocator->reset();
+}
+
+void CPUBackend::onResizeEnd() {
+    getCache()->release();
+    mDynamicAllocator->compute();
+}
Backend::MemObj* CPUBackend::allocBuffer(int size, Tensor* dest, StorageType storageType) { Backend::MemObj* CPUBackend::allocBuffer(int size, Tensor* dest, StorageType storageType) {
auto originMem = TensorUtils::getDescribe(dest)->mem.get(); auto originMem = TensorUtils::getDescribe(dest)->mem.get();
@ -277,35 +272,41 @@ Backend::MemObj* CPUBackend::allocBuffer(int size, Tensor* dest, StorageType sto
// } // }
auto& buffer = dest->buffer(); auto& buffer = dest->buffer();
auto des = TensorUtils::getDescribe(dest); auto des = TensorUtils::getDescribe(dest);
std::pair<void*, int> points; MemChunk chunk;
switch (storageType) { switch (storageType) {
case STATIC: { case STATIC: {
points = mStaticAllocator->alloc(size, false); chunk = mStaticAllocator->alloc(size, false);
break; break;
} }
case DYNAMIC: { case DYNAMIC: {
points = mDynamicAllocator->alloc(size, false); chunk = mDynamicAllocator->alloc(size, false);
break; break;
} }
case DYNAMIC_SEPERATE: { case DYNAMIC_SEPERATE: {
points = mDynamicAllocator->alloc(size, true); chunk = mDynamicAllocator->alloc(size, true);
break; break;
} }
default: default:
MNN_ASSERT(false); MNN_ASSERT(false);
break; break;
} }
if (nullptr == points.first) {
if (chunk.invalid()) {
MNN_ERROR("Alloc buffer error for cpu backend\n"); MNN_ERROR("Alloc buffer error for cpu backend\n");
return nullptr; return nullptr;
} }
Backend::MemObj* res = nullptr; Backend::MemObj* res = nullptr;
if (storageType == STATIC) { if (storageType == STATIC) {
res = new CPUMemObj(mStaticAllocator.get(), points, size); res = new CPUMemObj(mStaticAllocator.get(), chunk, size);
} else { } else {
res = new CPUMemObj(mDynamicAllocator.get(), points, size); res = new CPUMemObj(mDynamicAllocator.get(), chunk, size);
chunk.attach(dest);
}
if (chunk.ptr()) {
buffer.host = chunk.ptr();
} }
buffer.host = (uint8_t*)points.first + points.second;
des->extra.offset = 0; des->extra.offset = 0;
return res; return res;
} }
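Note: throughout this commit, the `std::pair<void*, int>` that used to represent an allocation is replaced by `MemChunk`. A reduced sketch of the concept as used at these call sites (MNN's real type carries more state; `invalid()`, `ptr()`, and the late-bound base are inferred from the code above):

```cpp
#include <cstddef>
#include <cstdint>

// Conceptual MemChunk: a base pointer plus an offset. With the eager allocator
// the base is fixed at alloc() time; with the deferred allocator it may only
// become valid after DeferBufferAllocator::compute() runs in onResizeEnd(),
// which is why callers resolve ptr() late instead of caching raw addresses.
struct MemChunkSketch {
    void*  base   = nullptr;
    size_t offset = 0;
    bool invalid() const { return base == nullptr; }
    uint8_t* ptr() const { return static_cast<uint8_t*>(base) + offset; }
};
```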

View File

@@ -13,10 +13,10 @@
 #include <memory>
 #include "core/Backend.hpp"
 #include "core/Execution.hpp"
+#include "core/BufferAllocator.hpp"
 #include "MNN_generated.h"
 namespace MNN {
-class BufferAllocator;
 class CPURuntime : public Runtime {
 public:
     friend class CPUBackend;
@@ -35,7 +35,7 @@ public:
 private:
-    std::shared_ptr<BufferAllocator> mStaticAllocator;
+    std::shared_ptr<EagerBufferAllocator> mStaticAllocator;
     int mThreadNumber;
     mutable int mTaskIndex;
     BackendConfig::MemoryMode mMemory;
@@ -47,11 +47,31 @@ private:
     float mFlops = 0.0f;
     static Backend*(*gExtraCreate)(const Runtime* runtime);
     size_t mFlags = 0;
+    int mAllocator = 0;
 };
 struct CoreFunctions;
 struct CoreInt8Functions;
 class CPUResizeCache;
+class CPUMemObj : public Backend::MemObj {
+public:
+    CPUMemObj(BufferAllocator* allocator, MemChunk chunk, int size) : mAllocator(allocator), mChunk(chunk), mSize(size) {}
+    virtual ~ CPUMemObj() {
+        if (mAllocator) {
+            mAllocator->free(mChunk);
+        }
+    }
+    virtual MemChunk chunk() {
+        return mChunk;
+    }
+    inline int getSize() const {
+        return mSize;
+    }
+private:
+    BufferAllocator* mAllocator;
+    MemChunk mChunk;
+    int mSize;
+};
 class CPUBackend : public Backend {
 public:
     CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, BackendConfig::MemoryMode memory, MNNForwardType type = MNN_FORWARD_CPU, size_t flags = 0);
@@ -69,6 +89,9 @@ public:
     virtual void onExecuteBegin() const override;
     virtual void onExecuteEnd() const override;
+
+    virtual void onResizeBegin() override;
+    virtual void onResizeEnd() override;
 
     const CoreFunctions* functions() const {
         return mCoreFunctions;
@@ -91,7 +114,7 @@ public:
         return mRuntime->mThreadNumber;
     }
 
-    BufferAllocator* getBufferAllocator() const {
+    BufferAllocator* getBufferAllocator(bool defer_allocator = true) const {
         return mDynamicAllocator.get();
     }
@@ -120,7 +143,7 @@ protected:
     const CoreFunctions* mCoreFunctions;
     const CoreInt8Functions* mInt8CoreFunctions;
 private:
-    std::shared_ptr<BufferAllocator> mStaticAllocator;
+    std::shared_ptr<EagerBufferAllocator> mStaticAllocator;
     std::shared_ptr<BufferAllocator> mDynamicAllocator;
     CPURuntime* mRuntime;
     BackendConfig::PrecisionMode mPrecisionMode;
View File

@@ -208,9 +208,9 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
             }
         }
     };
-    auto biasP = inputs[2]->host<uint8_t>();
-    auto weightP = inputs[1]->host<uint8_t>();
     mExecutor = [=](const uint8_t* srcOrigin, uint8_t* dstOrigin, int tId) {
+        auto biasP = inputs[2]->host<uint8_t>();
+        auto weightP = inputs[1]->host<uint8_t>();
         for (int index = tId; index < total; index += numberThread) {
             int dz = index / batch;
             auto dst_z = dstOrigin + dst_z_step * index * bytes;
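Note: moving `biasP`/`weightP` into the lambda is the execute-time counterpart of deferred allocation: tensor addresses are no longer final when `onResize` runs, so they must be read when the executor actually fires. The pooling change at the end of this commit applies the same fix. A toy illustration of the two patterns (types are stand-ins, not MNN's API):

```cpp
#include <cstdint>
#include <functional>

struct FakeTensor {
    uint8_t* storage = nullptr;          // may be (re)assigned after resize
    uint8_t* host() const { return storage; }
};

// Wrong under deferred allocation: captures the address chosen at resize
// time, which the deferred allocator may still move before execution.
std::function<void()> makeStale(FakeTensor* t) {
    uint8_t* p = t->host();
    return [p] { (void)p; /* may point at relocated memory */ };
}

// Right: resolve the address on every execution, after the layout is final.
std::function<void()> makeFresh(FakeTensor* t) {
    return [t] { uint8_t* p = t->host(); (void)p; /* valid at execute time */ };
}
```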

View File

@@ -241,6 +241,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
     CPUDeconvolutionBasic::onResize(inputs, outputs);
     auto core = static_cast<CPUBackend*>(backend())->functions();
     auto gcore = static_cast<CPUBackend*>(backend())->int8Functions();
+    int bytes = core->bytes;
     auto input = inputs[0];
     auto output = outputs[0];
     auto oc = output->channel();
@@ -270,6 +271,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
     mPostFunctions.clear();
     auto plane = width * height * batch;
     const int maxDepth = 5;
+    auto allocator = static_cast<CPUBackend*>(backend())->getBufferAllocator();
     //int zeroPoint = 0;
 
     auto biasPtr = inputs[2]->host<float>();
@@ -284,6 +286,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
     auto zeroPoint = outputQuant[1];
 
     AutoRelease<Tensor> tempInput(Tensor::createDevice<float>({icC4, plane, core->pack}));
+    bool needReleaseTempInput = true;
     int outi8 = 0;
     if (CPUBackend::getDataType(output) == DataType_DT_INT8 || output->getType().bytes() == 1) {
         outi8 = 1;
@@ -306,28 +309,28 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
             return OUT_OF_MEMORY;
         }
         mMatMul.reset(new StrassenMatrixComputor(backend(), true, maxDepth));
-        tempInput->buffer().host = (uint8_t*)inputPtr;
+        // tempInput->buffer().host = (uint8_t*)inputPtr;
+        needReleaseTempInput = false;
+        TensorUtils::getDescribe(tempInput.get())->mem.reset(new CPUMemObj(nullptr, TensorUtils::getDescribe(input)->mem->chunk(), 0));
         mMatMul->onEncode({tempInput.get(), inputs[1]}, {mTempOutput.get()});
     }
-    auto colBufferPtr = mTempOutput->host<uint8_t>();
     auto threadNumber = ((CPUBackend*)backend())->threadNumber();
     std::vector<float> scales(core->pack * src_height * src_width * batch, scale);
-    std::shared_ptr<Tensor> OutputFloat(Tensor::createDevice<float>({batch, src_height, src_width, ocC4 * core->pack}));
-    auto res = backend()->onAcquireBuffer(OutputFloat.get(), Backend::DYNAMIC);
-    if (!res) {
+    auto outputFp32Ptr = allocator->alloc(batch * src_height * src_width * ocC4 * core->pack * bytes);
+    if (outputFp32Ptr.invalid()) {
         return OUT_OF_MEMORY;
     }
-    auto outputFp32Ptr = OutputFloat->host<uint8_t>();
-    mPostFunctions.emplace_back(std::make_pair([colBufferPtr, ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY,
+    mPostFunctions.emplace_back(std::make_pair([ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY,
         strideX, threadNumber, src_width, src_height, plane, biasPtr, this, core, gcore, batch, outi8, scales,
         minValue, maxValue, zeroPoint, outputFp32Ptr](uint8_t* outputPtr, int tId) {
+        auto colBufferPtr = mTempOutput->host<uint8_t>();
         auto unitBytes = core->pack * core->bytes;
         auto tempOutPtr = outputPtr;
         auto float2Int8_step = src_height * src_width * batch;
         if (outi8) {
-            tempOutPtr = outputFp32Ptr;
+            tempOutPtr = outputFp32Ptr.ptr();
         }
         for (int z = (tId); z < ocC4; z += threadNumber) {
             auto dstZ = tempOutPtr + z * src_height * src_width * batch * unitBytes;
@@ -367,7 +370,16 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
             }
         }
     }, threadNumber));
-    if (tempInput->host<float>() != inputPtr) {
+    /*
+    if (TensorUtils::getDescribe(tempInput.get())->mem->chunk().offset() != TensorUtils::getDescribe(input)->mem->chunk().offset()) {
+        backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC);
+    }
+    if (tempInput->host<float>() != inputPtr) {
+        backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC);
+    }
+    */
+    allocator->free(outputFp32Ptr);
+    if (needReleaseTempInput) {
         backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC);
     }
     backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
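Note the `tempInput` trick above: wrapping the input tensor's existing chunk in a `CPUMemObj` with a null allocator creates a non-owning alias, because `CPUMemObj`'s destructor (see the header change earlier in this commit) frees only when its allocator pointer is non-null. A reduced illustration of that ownership rule:

```cpp
#include <cstdio>

struct Chunk { int id; };
struct Allocator { void free(Chunk c) { std::printf("freed %d\n", c.id); } };

struct MemObj {
    Allocator* alloc;
    Chunk chunk;
    ~MemObj() { if (alloc) alloc->free(chunk); }  // null allocator: alias only
};

int main() {
    Allocator a;
    { MemObj owner{&a, {1}}; }      // prints "freed 1"
    { MemObj alias{nullptr, {2}}; } // frees nothing; it is just a view
    return 0;
}
```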

View File

@@ -7,51 +7,26 @@
 //
 
 #include <cmath>
+#include "backend/cpu/CPULayerNorm.hpp"
+#include "backend/cpu/CPUBackend.hpp"
+#include "backend/cpu/compute/CommonOptFunction.h"
 #include "core/Execution.hpp"
 #include "core/Concurrency.h"
 #include "core/OpCommonUtils.hpp"
-#include "backend/cpu/CPUBackend.hpp"
-#include "backend/cpu/compute/CommonOptFunction.h"
 #include "MNN_generated.h"
 
 namespace MNN {
-class CPULayerNorm : public Execution {
-public:
-    explicit CPULayerNorm(const MNN::Op* op, Backend* backend);
-    virtual ~CPULayerNorm();
-    ErrorCode onExecute(const std::vector<Tensor*> &inputs, // NOLINT
-                        const std::vector<Tensor*> &outputs) override;
-    ErrorCode onResize(const std::vector<Tensor*> &inputs, // NOLINT
-                       const std::vector<Tensor*> &outputs) override;
-private:
-    bool allocGammaBeta(int size);
-private:
-    int axis_size = 0;
-    int inner_size_ = 1;
-    int outter_size_ = 1;
-    int group_ = 1;
-    float epsilon_ = 0.001;
-    std::unique_ptr<Tensor> gamma_;
-    std::unique_ptr<Tensor> beta_;
-    bool has_gamma_beta_ = false;
-};
 bool CPULayerNorm::allocGammaBeta(int size) {
-    has_gamma_beta_ = true;
-    gamma_.reset(Tensor::createDevice<float>({size}));
-    auto status = backend()->onAcquireBuffer(gamma_.get(), Backend::STATIC);
+    mIniGammaBeta = true;
+    mGamma.reset(Tensor::createDevice<float>({size}));
+    auto status = backend()->onAcquireBuffer(mGamma.get(), Backend::STATIC);
     if (!status) {
         MNN_ERROR("Out of memory when gamma is acquired in CPULayerNorm.\n");
         return false;
     }
-    beta_.reset(Tensor::createDevice<float>({size}));
-    status = backend()->onAcquireBuffer(beta_.get(), Backend::STATIC);
+    mBeta.reset(Tensor::createDevice<float>({size}));
+    status = backend()->onAcquireBuffer(mBeta.get(), Backend::STATIC);
     if (!status) {
         MNN_ERROR("Out of memory when beta is acquired in CPULayerNorm.\n");
@@ -59,17 +34,16 @@ bool CPULayerNorm::allocGammaBeta(int size) {
     return true;
 }
 
-CPULayerNorm::CPULayerNorm(const MNN::Op* op, Backend* backend)
-    : Execution(backend) {
+CPULayerNorm::CPULayerNorm(const MNN::Op* op, Backend* backend) : Execution(backend) {
     const auto* layer_norm_param = op->main_as_LayerNorm();
-    axis_size = layer_norm_param->axis()->size();
-    group_ = layer_norm_param->group();
-    epsilon_ = layer_norm_param->epsilon();
+    mAxis = layer_norm_param->axis()->size();
+    mGroup = layer_norm_param->group();
+    mEpsilon = layer_norm_param->epsilon();
 
     if (USE_EXTERNAL_DATA(layer_norm_param)) {
-        auto size = layer_norm_param->external()->Get(1);
+        int32_t size = static_cast<int32_t>(layer_norm_param->external()->Get(1));
         allocGammaBeta(size);
-        OpCommonUtils::loadExternalDatas(backend, {gamma_->host<char>(), beta_->host<char>()}, layer_norm_param->external()->data());
+        OpCommonUtils::loadExternalDatas(backend, {mGamma->host<char>(), mBeta->host<char>()}, layer_norm_param->external()->data());
         return;
     }
@@ -80,23 +54,44 @@ CPULayerNorm::CPULayerNorm(const MNN::Op* op, Backend* backend)
         }
         allocGammaBeta(size);
         const float* gamma_data = layer_norm_param->gamma()->data();
-        memcpy(gamma_->host<float>(), gamma_data, size * sizeof(float));
+        memcpy(mGamma->host<float>(), gamma_data, size * sizeof(float));
         const float* beta_data = layer_norm_param->beta()->data();
-        memcpy(beta_->host<float>(), beta_data, size * sizeof(float));
+        memcpy(mBeta->host<float>(), beta_data, size * sizeof(float));
     }
 }
 
 ErrorCode CPULayerNorm::onExecute(const std::vector<Tensor*> &inputs,
                                   const std::vector<Tensor*> &outputs) {
-    const float* gamma = has_gamma_beta_ ? gamma_->host<float>() : nullptr;
-    const float* beta = has_gamma_beta_ ? beta_->host<float>() : nullptr;
+    const float* gamma = mIniGammaBeta ? mGamma->host<float>() : nullptr;
+    const float* beta = mIniGammaBeta ? mBeta->host<float>() : nullptr;
+    if (mInpZero.data()) {
+        auto core = static_cast<CPUBackend*>(backend())->int8Functions();
+        const int8_t* input = inputs[0]->host<int8_t>();
+        int8_t* output = outputs[0]->host<int8_t>();
+        MNN_CONCURRENCY_BEGIN(tId, mOutterSize) {
+            QuanPrePostParameters params;
+            params.maxValue = mMaxMinValue[0];
+            params.minValue = mMaxMinValue[1];
+            params.inputScale = mInpScale.data();
+            params.outputScale = mOutScale.data();
+            params.inputZeroPoint = mInpZero.data();
+            params.outputZeroPoint = mOutZero.data();
+            const int8_t* inner_input = input + tId * mInnerSize;
+            int8_t* inner_output = output + tId * mInnerSize;
+            core->MNNNormInt8(inner_output, inner_input, gamma, beta, mEpsilon, mInnerSize, &params);
+        }
+        MNN_CONCURRENCY_END();
+        return NO_ERROR;
+    }
     const float* input = inputs.at(0)->host<float>();
     float* output = outputs.at(0)->host<float>();
-    MNN_CONCURRENCY_BEGIN(tId, outter_size_) {
-        const float* inner_input = input + tId * inner_size_;
-        float* inner_output = output + tId * inner_size_;
-        MNNNorm(inner_output, inner_input, gamma, beta, epsilon_, inner_size_);
+    MNN_CONCURRENCY_BEGIN(tId, mOutterSize) {
+        const float* inner_input = input + tId * mInnerSize;
+        float* inner_output = output + tId * mInnerSize;
+        MNNNorm(inner_output, inner_input, gamma, beta, mEpsilon, mInnerSize);
     }
     MNN_CONCURRENCY_END();
     return NO_ERROR;
@@ -104,40 +99,53 @@ ErrorCode CPULayerNorm::onExecute(const std::vector<Tensor*> &inputs,
 ErrorCode CPULayerNorm::onResize(const std::vector<Tensor*> &inputs,
                                  const std::vector<Tensor*> &outputs) {
-    outter_size_ = 1;
-    inner_size_ = 1;
+    mOutterSize = 1;
+    mInnerSize = 1;
     int rank = inputs.at(0)->dimensions();
-    if (group_ > 1) {
-        outter_size_ = inputs.at(0)->length(0) * group_;
+    if (mGroup > 1) {
+        mOutterSize = inputs.at(0)->length(0) * mGroup;
         for (int i = 1; i < rank; i++) {
-            inner_size_ *= inputs.at(0)->length(i);
+            mInnerSize *= inputs.at(0)->length(i);
         }
-        inner_size_ /= group_;
+        mInnerSize /= mGroup;
         return NO_ERROR;
     }
-    for (int i = 0; i < rank - axis_size; ++i) {
-        outter_size_ *= inputs.at(0)->length(i);
+    for (int i = 0; i < rank - mAxis; ++i) {
+        mOutterSize *= inputs.at(0)->length(i);
     }
-    for (int i = rank - axis_size; i < rank; ++i) {
-        inner_size_ *= inputs.at(0)->length(i);
+    for (int i = rank - mAxis; i < rank; ++i) {
+        mInnerSize *= inputs.at(0)->length(i);
+    }
+    if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
+        mInpZero.resize(1);
+        mOutZero.resize(1);
+        mInpScale.resize(1);
+        mOutScale.resize(1);
+        mMaxMinValue.resize(2);
+        auto inpQuantAttr = TensorUtils::getDescribe(inputs[0])->quantAttr;
+        auto outQuantAttr = TensorUtils::getDescribe(outputs[0])->quantAttr;
+        mInpZero[0] = inpQuantAttr->zero;
+        mOutZero[0] = outQuantAttr->zero;
+        mInpScale[0] = inpQuantAttr->scale;
+        mOutScale[0] = outQuantAttr->scale == 0.f? 0.f : 1.0f / outQuantAttr->scale;
+        mMaxMinValue[0] = outQuantAttr->max;
+        mMaxMinValue[1] = outQuantAttr->min;
     }
     return NO_ERROR;
 }
 
 CPULayerNorm::~CPULayerNorm() {
-    if (gamma_.get()) {
-        backend()->onReleaseBuffer(gamma_.get(), Backend::STATIC);
+    if (mGamma.get()) {
+        backend()->onReleaseBuffer(mGamma.get(), Backend::STATIC);
     }
-    if (beta_.get()) {
-        backend()->onReleaseBuffer(beta_.get(), Backend::STATIC);
+    if (mBeta.get()) {
+        backend()->onReleaseBuffer(mBeta.get(), Backend::STATIC);
     }
 }
 
 class CPULayerNormCreator : public CPUBackend::Creator {
 public:
-    Execution* onCreate(const std::vector<Tensor*>& inputs,
-                        const std::vector<Tensor*>& outputs,
-                        const MNN::Op* op, Backend* backend) const override {
+    Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op, Backend* backend) const override {
         return new CPULayerNorm(op, backend);
     }
 };
View File

@@ -0,0 +1,41 @@
+//
+//  CPULayerNorm.hpp
+//  MNN
+//
+//  Created by MNN on 2023/07/11
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifndef CPULayerNorm_hpp
+#define CPULayerNorm_hpp
+
+#include "core/Execution.hpp"
+#include "core/Macro.h"
+
+namespace MNN {
+class CPULayerNorm : public Execution {
+public:
+    explicit CPULayerNorm(const MNN::Op* op, Backend* backend);
+    virtual ~CPULayerNorm();
+    ErrorCode onExecute(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
+    ErrorCode onResize(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
+private:
+    bool allocGammaBeta(int size);
+private:
+    int mAxis = 0;
+    int mInnerSize = 1;
+    int mOutterSize = 1;
+    int mGroup = 1;
+    float mEpsilon = 0.001;
+    std::unique_ptr<Tensor> mGamma;
+    std::unique_ptr<Tensor> mBeta;
+    bool mIniGammaBeta = false;
+    // LayerNormInt8 parameters.
+    std::vector<float> mInpScale;
+    std::vector<float> mOutScale;
+    std::vector<ssize_t> mInpZero;
+    std::vector<ssize_t> mOutZero;
+    std::vector<ssize_t> mMaxMinValue;
+};
+} // namespace MNN
+#endif /* CPULayerNorm_hpp */

View File

@@ -14,6 +14,7 @@
 #include "core/Macro.h"
 #include "core/Concurrency.h"
 #include "core/BufferAllocator.hpp"
+#include "core/TensorUtils.hpp"
 #include "math/Vec.hpp"
@@ -94,40 +95,36 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
     auto ATPtrAlloc = bufferAlloc->alloc(UP_DIV(l, core->pack) * e * core->pack * core->bytes);
     auto BTPtrAlloc = bufferAlloc->alloc(UP_DIV(h, hP) * UP_DIV(l, lP) * lP * hP * core->bytes);
     auto CTPtrAlloc = bufferAlloc->alloc(UP_DIV(h, core->pack) * e * core->pack * core->bytes);
-    if (nullptr == ATPtrAlloc.first || nullptr == BTPtrAlloc.first || nullptr == CTPtrAlloc.first) {
+    if (ATPtrAlloc.invalid() || BTPtrAlloc.invalid() || CTPtrAlloc.invalid()) {
         return OUT_OF_MEMORY;
     }
-    auto BTPtr = (uint8_t*)BTPtrAlloc.first + BTPtrAlloc.second;
-    auto ATPtr = (uint8_t*)ATPtrAlloc.first + ATPtrAlloc.second;
-    auto CTPtr = (uint8_t*)CTPtrAlloc.first + CTPtrAlloc.second;
-    float* BTempPtr = (float*)BTPtr;
     int numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1;
-    mPreFunctions.emplace_back(std::make_pair([BTempPtr, l, h, this, core] (int tId, const float* APtr, const float* BPtr, const float* Bias) {
-        core->MNNPackForMatMul_B(BTempPtr, BPtr, h, l, mTransposeB);
+    mPreFunctions.emplace_back(std::make_pair([BTPtrAlloc, l, h, this, core] (int tId, const float* APtr, const float* BPtr, const float* Bias) {
+        core->MNNPackForMatMul_B((float*)BTPtrAlloc.ptr(), BPtr, h, l, mTransposeB);
     } , 1));
     if (mTransposeA) {
         // l, e -> lC4, e, 4
-        mPreFunctions.emplace_back(std::make_pair([ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
+        mPreFunctions.emplace_back(std::make_pair([ATPtrAlloc, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
             int offset[] = {
                 e, e
             };
-            core->MNNPackCUnit((float*)ATPtr, APtr, e, l, offset);
+            core->MNNPackCUnit((float*)ATPtrAlloc.ptr(), APtr, e, l, offset);
         }, 1));
     } else {
         // e, l -> lC4, e, 4
         mPreFunctions.emplace_back(std::make_pair(
-            [ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
+            [ATPtrAlloc, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
             int offset[] = {
                 e, e
             };
-            core->MNNPackCUnitTranspose((float*)ATPtr, APtr, e, l, offset);
+            core->MNNPackCUnitTranspose((float*)ATPtrAlloc.ptr(), APtr, e, l, offset);
         }, 1));
     }
     bool useBias = false;
-    uint8_t* biasPtr = nullptr;
     std::vector<float> postParameters;
-    std::pair<void*, int> bdestAlloc = std::make_pair(nullptr, 0);
+    MemChunk bdestAlloc;
+    bool bdestNeedFree = false;
     if (inputs.size() > 2) {
         auto bias = inputs[2];
         useBias = true;
@@ -136,19 +133,20 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
         mStrassenUseBiasDirectly = false;
         // Padding to align of 4
         bdestAlloc = bufferAlloc->alloc(UP_DIV(biasLength, core->pack) * core->pack * core->bytes);
-        if (bdestAlloc.first == nullptr) {
+        bdestNeedFree = true;
+        if (bdestAlloc.invalid()) {
             return OUT_OF_MEMORY;
         }
-        auto bdest = (float*)((uint8_t*)bdestAlloc.first + bdestAlloc.second);
         mPreFunctions.emplace_back(std::make_pair(
-            [biasLength, bdest, core](int tId, const float* APtr, const float* BPtr, const float* borigin) {
-            ::memset(bdest, 0, UP_DIV(biasLength, core->pack) * core->bytes * core->pack);
-            ::memcpy(bdest, borigin, biasLength * core->bytes);
+            [biasLength, bdestAlloc, core](int tId, const float* APtr, const float* BPtr, const float* borigin) {
+            ::memset(bdestAlloc.ptr(), 0, UP_DIV(biasLength, core->pack) * core->bytes * core->pack);
+            ::memcpy(bdestAlloc.ptr(), borigin, biasLength * core->bytes);
         }, 1));
-        biasPtr = (uint8_t*)bdest;
     } else {
         mStrassenUseBiasDirectly = true;
-        biasPtr = bias->host<uint8_t>();
+        if (TensorUtils::getDescribe(bias)->mem.get()) {
+            bdestAlloc = TensorUtils::getDescribe(bias)->mem->chunk();
+        }
     }
     postParameters = {
         1.0f,
@ -157,29 +155,29 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
std::numeric_limits<float>().max(), std::numeric_limits<float>().max(),
}; };
} }
auto code = mComputer->onEncode(e, l, h, e * core->pack, UP_DIV(l, lP) * lP * hP, e * core->pack, ATPtr, BTPtr, CTPtr, useBias, biasPtr, postParameters); auto code = mComputer->onEncode(e, l, h, e * core->pack, UP_DIV(l, lP) * lP * hP, e * core->pack, ATPtrAlloc, BTPtrAlloc, CTPtrAlloc, useBias, bdestAlloc, postParameters);
if (NO_ERROR != code) { if (NO_ERROR != code) {
return code; return code;
} }
if (bdestAlloc.first != nullptr) { if (bdestNeedFree) {
bufferAlloc->free(bdestAlloc); bufferAlloc->free(bdestAlloc);
} }
// hC4, e, 4 -> e, h // hC4, e, 4 -> e, h
if (mTransposeC) { if (mTransposeC) {
mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, core]( mPostFunctions.emplace_back(std::make_pair([CTPtrAlloc, e, h, core](
int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) { int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) {
int offset[] = { int offset[] = {
e, e e, e
}; };
core->MNNUnpackCUnitTranspose(CPtr, (float*)CTPtr, e, h, offset); core->MNNUnpackCUnitTranspose(CPtr, (float*)CTPtrAlloc.ptr(), e, h, offset);
}, 1)); }, 1));
} else { } else {
mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, core]( mPostFunctions.emplace_back(std::make_pair([CTPtrAlloc, e, h, core](
int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) { int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) {
int offset[] = { int offset[] = {
e, e e, e
}; };
core->MNNUnpackCUnit(CPtr, (float*)CTPtr, e, h, offset); core->MNNUnpackCUnit(CPtr, (float*)CTPtrAlloc.ptr(), e, h, offset);
}, 1)); }, 1));
} }
bufferAlloc->free(ATPtrAlloc); bufferAlloc->free(ATPtrAlloc);
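The recurring change in this file replaces resize-time raw pointers with MemChunk handles that the pre/post lambdas capture by value and resolve via ptr() only when they run. A minimal sketch of why that matters, using a hypothetical ChunkRef stand-in (not the real MemChunk, which lives in core/BufferAllocator.hpp) whose base the allocator may rebind after memory planning:

#include <cstddef>
#include <cstdint>
// Hypothetical handle: the allocator owns the indirection and may repoint it
// between onResize and execution.
struct ChunkRef {
    uint8_t** base = nullptr;
    size_t offset = 0;
    uint8_t* ptr() const { return *base + offset; }
};
// Wrong: freezes the resize-time address inside the closure.
//   auto raw = chunk.ptr(); tasks.emplace_back([raw] { use(raw); });
// Right: capture the handle, resolve at run time.
//   tasks.emplace_back([chunk] { use(chunk.ptr()); });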
View File
@ -55,8 +55,6 @@ public:
padWidth = padHeight = 0; padWidth = padHeight = 0;
} }
auto totalDepth = input->batch() * UP_DIV(input->channel(), core->pack); auto totalDepth = input->batch() * UP_DIV(input->channel(), core->pack);
auto inputData = input->host<uint8_t>();
auto outputData = output->host<uint8_t>();
auto inputPlaneStride = core->pack * input->width() * input->height(); auto inputPlaneStride = core->pack * input->width() * input->height();
auto outputPlaneStride = core->pack * output->width() * output->height(); auto outputPlaneStride = core->pack * output->width() * output->height();
int threadNumber = ((CPUBackend *)backend())->threadNumber(); int threadNumber = ((CPUBackend *)backend())->threadNumber();
@ -67,6 +65,8 @@ public:
} }
mFunction = std::make_pair(threadNumber, [=](int tId) { mFunction = std::make_pair(threadNumber, [=](int tId) {
for (int channel = (int)tId; channel < totalDepth; channel += threadNumber) { for (int channel = (int)tId; channel < totalDepth; channel += threadNumber) {
auto inputData = input->host<uint8_t>();
auto outputData = output->host<uint8_t>();
// run // run
mCompute(inputData + channel * inputPlaneStride * mBytes, input->width(), input->height(), mCompute(inputData + channel * inputPlaneStride * mBytes, input->width(), input->height(),
outputData + outputPlaneStride * channel * mBytes, output->width(), output->height(), kernelWidth, outputData + outputPlaneStride * channel * mBytes, output->width(), output->height(), kernelWidth,
View File
@ -11,6 +11,7 @@
#include "backend/cpu/CPUBackend.hpp" #include "backend/cpu/CPUBackend.hpp"
#include "core/Concurrency.h" #include "core/Concurrency.h"
#include "CPUTensorConvert.hpp" #include "CPUTensorConvert.hpp"
#include "core/TensorUtils.hpp"
//#define MNN_OPEN_TIME_TRACE //#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp> #include <MNN/AutoTime.hpp>
namespace MNN { namespace MNN {
@ -101,131 +102,129 @@ static void pickBoxes(const std::vector<score_box_t> &boxes, std::vector<long> &
} }
ErrorCode CPUProposal::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) { ErrorCode CPUProposal::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
// score transform space auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
auto &score = inputs[0]; mScoreBuffer = bufferAlloc->alloc(TensorUtils::getRawSize(inputs[0]) * inputs[0]->getType().bytes());
memcpy(mScore.buffer().dim, score->buffer().dim, sizeof(halide_dimension_t) * score->buffer().dimensions); if (mScoreBuffer.invalid()) {
backend()->onAcquireBuffer(&mScore, Backend::DYNAMIC); return OUT_OF_MEMORY;
}
// release temp buffer space // release temp buffer space
backend()->onReleaseBuffer(&mScore, Backend::DYNAMIC); bufferAlloc->free(mScoreBuffer);
return NO_ERROR;
}
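Note the alloc/free pair inside onResize: with MNN's planning allocator, alloc reserves a span in the arena and the immediate free returns it to the plan so later ops can overlap it, while mScoreBuffer.ptr() stays addressable during this op's onExecute. A toy model of that contract (allocator, names, and pool size are ours, not MNN's API):

#include <cstddef>
#include <cstdint>
#include <cstdio>
struct Chunk {
    uint8_t* base = nullptr;
    bool invalid() const { return base == nullptr; }
    uint8_t* ptr() const { return base; }
};
// Toy planning allocator: free() only marks the span reusable for ops that
// run later in the schedule; it does not unmap anything.
struct PlanAlloc {
    uint8_t pool[1 << 16];
    Chunk alloc(size_t n) { return n <= sizeof(pool) ? Chunk{pool} : Chunk{}; }
    void free(const Chunk&) {}
};
int main() {
    PlanAlloc plan;
    Chunk scratch = plan.alloc(4096);
    if (scratch.invalid()) return 1;
    plan.free(scratch);                        // released to the plan only
    std::printf("%p\n", (void*)scratch.ptr()); // still valid for this op's execute
    return 0;
}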
auto &imInfo = inputs[2]; ErrorCode CPUProposal::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
// score transform space
auto score = inputs[0];
auto boxes = inputs[1];
auto imInfo = inputs[2];
auto featStride = mProposal->featStride(); auto featStride = mProposal->featStride();
auto preNmsTopN = mProposal->preNmsTopN(); auto preNmsTopN = mProposal->preNmsTopN();
auto nmsThreshold = mProposal->nmsThreshold(); auto nmsThreshold = mProposal->nmsThreshold();
auto afterNmsTopN = mProposal->afterNmsTopN(); auto afterNmsTopN = mProposal->afterNmsTopN();
auto minSize = mProposal->minSize(); auto minSize = mProposal->minSize();
auto boxes = inputs[1]; float* tmpScorePtr = (float*)mScoreBuffer.ptr();
// download
MNNUnpackC4Origin(tmpScorePtr, score->host<float>(), score->width() * score->height(), score->channel(), score->width() * score->height());
mRun = [=]() { auto scrWidth = score->width(), scrHeight = score->height(), scrSize = scrWidth * scrHeight;
// download auto boxWidth = boxes->width(), boxHeight = boxes->height(), boxSize = boxWidth * boxHeight;
MNNUnpackC4Origin(mScore.host<float>(), score->host<float>(), score->width() * score->height(), score->channel(), score->width() * score->height()); auto imH = imInfo->host<float>()[0]; // NC/4HW4
auto imW = imInfo->host<float>()[1]; // NC/4HW4
auto scrWidth = score->width(), scrHeight = score->height(), scrSize = scrWidth * scrHeight; // generate proposals from box deltas and shifted anchors
auto boxWidth = boxes->width(), boxHeight = boxes->height(), boxSize = boxWidth * boxHeight; // remove predicted boxes with either height or width < threshold
auto imH = imInfo->host<float>()[0]; // NC/4HW4 auto anchorWidth = 4;
auto imW = imInfo->host<float>()[1]; // NC/4HW4 auto anchorHeight = mAnchors.size() / 4;
std::vector<score_box_t> proposalBoxes;
float imScale = imInfo->host<float>()[2]; // NC/4HW4
float minBoxSize = minSize * imScale;
proposalBoxes.reserve(boxSize * anchorHeight);
// generate proposals from box deltas and shifted anchors {
// remove predicted boxes with either height or width < threshold for (int ah = 0; ah < anchorHeight; ++ah) {
auto anchorWidth = 4; auto boxPtr = boxes->host<float>() + ah * 4 * boxSize;
auto anchorHeight = mAnchors.size() / 4; auto scorePtr = tmpScorePtr + (ah + anchorHeight) * scrSize;
std::vector<score_box_t> proposalBoxes;
float imScale = imInfo->host<float>()[2]; // NC/4HW4
float minBoxSize = minSize * imScale;
proposalBoxes.reserve(boxSize * anchorHeight);
{ // shifted anchor
for (int ah = 0; ah < anchorHeight; ++ah) { const auto anchor = mAnchors.get() + ah * anchorWidth;
auto boxPtr = boxes->host<float>() + ah * 4 * boxSize; float anchorY = anchor[1];
auto scorePtr = mScore.host<float>() + (ah + anchorHeight) * scrSize; float anchorW = anchor[2] - anchor[0];
float anchorH = anchor[3] - anchor[1];
// shifted anchor for (int sh = 0; sh < scrHeight; sh++) {
const auto anchor = mAnchors.get() + ah * anchorWidth; float anchorX = anchor[0];
float anchorY = anchor[1]; auto boxPtrH = boxPtr + sh * 4 * boxWidth;
float anchorW = anchor[2] - anchor[0];
float anchorH = anchor[3] - anchor[1];
for (int sh = 0; sh < scrHeight; sh++) { for (int sw = 0; sw < scrWidth; sw++) {
float anchorX = anchor[0]; auto box = boxPtrH + 4 * sw;
auto boxPtrH = boxPtr + sh * 4 * boxWidth; // apply center size
float cx = anchorX + anchorW * 0.5f + anchorW * box[0];
float cy = anchorY + anchorH * 0.5f + anchorH * box[1];
float w = anchorW * exp(box[2]);
float h = anchorH * exp(box[3]);
for (int sw = 0; sw < scrWidth; sw++) { float minX = std::max(std::min(cx - w * 0.5f, imW - 1), 0.f);
auto box = boxPtrH + 4 * sw; float minY = std::max(std::min(cy - h * 0.5f, imH - 1), 0.f);
// apply center size float maxX = std::max(std::min(cx + w * 0.5f, imW - 1), 0.f);
float cx = anchorX + anchorW * 0.5f + anchorW * box[0]; float maxY = std::max(std::min(cy + h * 0.5f, imH - 1), 0.f);
float cy = anchorY + anchorH * 0.5f + anchorH * box[1]; if (maxX - minX + 1 >= minBoxSize && maxY - minY + 1 >= minBoxSize) {
float w = anchorW * exp(box[2]); proposalBoxes.emplace_back(box_rect(minX, minY, maxX, maxY, scorePtr[sh * scrWidth + sw]));
float h = anchorH * exp(box[3]);
float minX = std::max(std::min(cx - w * 0.5f, imW - 1), 0.f);
float minY = std::max(std::min(cy - h * 0.5f, imH - 1), 0.f);
float maxX = std::max(std::min(cx + w * 0.5f, imW - 1), 0.f);
float maxY = std::max(std::min(cy + h * 0.5f, imH - 1), 0.f);
if (maxX - minX + 1 >= minBoxSize && maxY - minY + 1 >= minBoxSize) {
proposalBoxes.emplace_back(box_rect(minX, minY, maxX, maxY, scorePtr[sh * scrWidth + sw]));
}
anchorX += featStride;
} }
anchorY += featStride; anchorX += featStride;
} }
anchorY += featStride;
} }
} }
}
{ {
// sort all (proposal, score) pairs by score from highest to lowest // sort all (proposal, score) pairs by score from highest to lowest
// take top preNmsTopN // take top preNmsTopN
auto compareFunction = [](const score_box_t &a, const score_box_t &b) { auto compareFunction = [](const score_box_t &a, const score_box_t &b) {
return box_score(a) > box_score(b); return box_score(a) > box_score(b);
}; };
if (0 < preNmsTopN && preNmsTopN < (int)proposalBoxes.size()) { if (0 < preNmsTopN && preNmsTopN < (int)proposalBoxes.size()) {
std::partial_sort(proposalBoxes.begin(), proposalBoxes.begin() + preNmsTopN, proposalBoxes.end(), std::partial_sort(proposalBoxes.begin(), proposalBoxes.begin() + preNmsTopN, proposalBoxes.end(),
compareFunction); compareFunction);
proposalBoxes.resize(preNmsTopN); proposalBoxes.resize(preNmsTopN);
} else { } else {
std::sort(proposalBoxes.begin(), proposalBoxes.end(), compareFunction); std::sort(proposalBoxes.begin(), proposalBoxes.end(), compareFunction);
}
} }
}
// apply nms with nmsThreshold // apply nms with nmsThreshold
// take afterNmsTopN // take afterNmsTopN
std::vector<long> picked; std::vector<long> picked;
picked.reserve(afterNmsTopN); picked.reserve(afterNmsTopN);
{ {
pickBoxes(proposalBoxes, picked, nmsThreshold, afterNmsTopN); pickBoxes(proposalBoxes, picked, nmsThreshold, afterNmsTopN);
}
int pickedCount = std::min((int)picked.size(), afterNmsTopN);
// return the top proposals
int roiStep = outputs[0]->buffer().dim[0].stride, scoreStep = 0;
auto roiPtr = outputs[0]->host<float>(), scoresPtr = (float *)NULL;
memset(roiPtr, 0, outputs[0]->size());
if (outputs.size() > 1) {
scoreStep = outputs[1]->buffer().dim[0].stride;
scoresPtr = outputs[1]->host<float>();
memset(scoresPtr, 0, outputs[1]->size());
}
for (int i = 0; i < pickedCount; i++, scoresPtr += scoreStep) {
auto box = proposalBoxes[picked[i]];
roiPtr[i * 4 + 0] = 0;
roiPtr[i * 4 + 1] = box_rect_xmin(box);
roiPtr[i * 4 + 2] = box_rect_ymin(box);
roiPtr[i * 4 + 3] = box_rect_xmax(box);
roiPtr[i * 4 + outputs[0]->length(0) * 4] = box_rect_ymax(box);
if (scoresPtr) {
scoresPtr[0] = box_score(box);
} }
}
int pickedCount = std::min((int)picked.size(), afterNmsTopN);
// return the top proposals
int roiStep = outputs[0]->buffer().dim[0].stride, scoreStep = 0;
auto roiPtr = outputs[0]->host<float>(), scoresPtr = (float *)NULL;
memset(roiPtr, 0, outputs[0]->size());
if (outputs.size() > 1) {
scoreStep = outputs[1]->buffer().dim[0].stride;
scoresPtr = outputs[1]->host<float>();
memset(scoresPtr, 0, outputs[1]->size());
}
for (int i = 0; i < pickedCount; i++, scoresPtr += scoreStep) {
auto box = proposalBoxes[picked[i]];
roiPtr[i * 4 + 0] = 0;
roiPtr[i * 4 + 1] = box_rect_xmin(box);
roiPtr[i * 4 + 2] = box_rect_ymin(box);
roiPtr[i * 4 + 3] = box_rect_xmax(box);
roiPtr[i * 4 + outputs[0]->length(0) * 4] = box_rect_ymax(box);
if (scoresPtr) {
scoresPtr[0] = box_score(box);
}
}
};
return NO_ERROR;
}
ErrorCode CPUProposal::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
mRun();
return NO_ERROR; return NO_ERROR;
} }
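pickBoxes, defined earlier in this file, performs the usual greedy non-maximum suppression over the score-sorted proposals. A self-contained sketch of that logic, assuming the same +1 pixel convention as the box decode above (an illustration, not MNN's exact implementation):

#include <algorithm>
#include <vector>
struct Box { float x0, y0, x1, y1, score; };
// Intersection-over-union with the +1 pixel convention used above.
static float iou(const Box& a, const Box& b) {
    float iw = std::max(0.f, std::min(a.x1, b.x1) - std::max(a.x0, b.x0) + 1);
    float ih = std::max(0.f, std::min(a.y1, b.y1) - std::max(a.y0, b.y0) + 1);
    float inter = iw * ih;
    float areaA = (a.x1 - a.x0 + 1) * (a.y1 - a.y0 + 1);
    float areaB = (b.x1 - b.x0 + 1) * (b.y1 - b.y0 + 1);
    return inter / (areaA + areaB - inter);
}
// Greedy NMS: keep a box only if it overlaps no already-picked box too much.
std::vector<long> nmsPick(const std::vector<Box>& sorted, float threshold, int topN) {
    std::vector<long> picked;
    for (long i = 0; i < (long)sorted.size() && (int)picked.size() < topN; ++i) {
        bool keep = true;
        for (long j : picked) {
            if (iou(sorted[i], sorted[j]) > threshold) { keep = false; break; }
        }
        if (keep) picked.push_back(i);
    }
    return picked;
}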
View File
@ -12,6 +12,7 @@
#include <functional> #include <functional>
#include "core/AutoStorage.h" #include "core/AutoStorage.h"
#include "core/Execution.hpp" #include "core/Execution.hpp"
#include "core/BufferAllocator.hpp"
#include "MNN_generated.h" #include "MNN_generated.h"
namespace MNN { namespace MNN {
@ -26,8 +27,7 @@ public:
private: private:
const Proposal *mProposal; const Proposal *mProposal;
AutoStorage<float> mAnchors; AutoStorage<float> mAnchors;
Tensor mScore; MemChunk mScoreBuffer;
std::function<void()> mRun;
}; };
} // namespace MNN } // namespace MNN
View File
@ -68,7 +68,7 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
} }
Tensor::InsideDescribe::Region newRegion; Tensor::InsideDescribe::Region newRegion;
OpCommonUtils::turnToPackRegion(slice, newRegion, output, core->pack, true); OpCommonUtils::turnToPackRegion(slice, newRegion, output, core->pack, true);
mFastBlit.emplace_back(std::make_pair(slice.origin->host<void>(), std::move(newRegion))); mFastBlit.emplace_back(std::make_pair(slice.origin, std::move(newRegion)));
} }
return NO_ERROR; return NO_ERROR;
} }
@ -98,12 +98,12 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
for (int i=0; i< des->regions.size(); ++i) { for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i]; auto& slice = des->regions[i];
auto origin = slice.origin; auto origin = slice.origin;
if (nullptr == origin || nullptr == origin->host<void>()) { if (nullptr == origin /*|| nullptr == origin->host<void>()*/) {
continue; continue;
} }
// if the tensor is not NC4HW4 or has already been merged, no handling is needed // if the tensor is not NC4HW4 or has already been merged, no handling is needed
if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) { if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
mTempInputCopy.emplace_back(std::make_pair(origin->host<void>(), &slice)); mTempInputCopy.emplace_back(std::make_pair(origin, &slice));
continue; continue;
} }
// if NC4HW4's C%4 == 0, change convert to transpose and fuse it // if NC4HW4's C%4 == 0, change convert to transpose and fuse it
@ -132,12 +132,13 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
bool merge = TensorUtils::fuseRegion(regionTmp, *newSlice); bool merge = TensorUtils::fuseRegion(regionTmp, *newSlice);
if (merge) { if (merge) {
// cache the merged tensor // cache the merged tensor
mTempInputCopy.emplace_back(std::make_pair(origin->host<void>(), newSlice.get())); mTempInputCopy.emplace_back(std::make_pair(origin, newSlice.get()));
mCacheRegions.emplace_back(newSlice); mCacheRegions.emplace_back(newSlice);
continue; continue;
} }
} }
auto cache = static_cast<CPUBackend*>(backend())->getCache(); auto cache = static_cast<CPUBackend*>(backend())->getCache();
#if 1
auto tempTensor = cache->findCacheTensor(origin, midFormat); auto tempTensor = cache->findCacheTensor(origin, midFormat);
//MNN_ASSERT(CPUBackend::getBytes(backend(), origin) == 4); //MNN_ASSERT(CPUBackend::getBytes(backend(), origin) == 4);
if (nullptr == tempTensor) { if (nullptr == tempTensor) {
@ -159,7 +160,23 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) { if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) {
forRelease.emplace_back(tempTensor); forRelease.emplace_back(tempTensor);
} }
mTempInputCopy.emplace_back(std::make_pair(tempTensor->host<void>(), &slice)); #else
std::shared_ptr<Tensor> newTensor(new Tensor);
TensorUtils::copyShape(origin, newTensor.get());
TensorUtils::getDescribe(newTensor.get())->dimensionFormat = midFormat;
TensorUtils::getDescribe(newTensor.get())->quantAttr = TensorUtils::getDescribe(origin)->quantAttr;
newTensor->buffer().type = origin->getType();
TensorUtils::setLinearLayout(newTensor.get());
mTempInput.insert(std::make_pair(origin, newTensor.get()));
auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
auto tempTensor = newTensor.get();
backend()->onReleaseBuffer(tempTensor, Backend::DYNAMIC);
cache->pushCacheTensor(newTensor, origin, midFormat);
#endif
mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice));
} }
for (auto t : forRelease) { for (auto t : forRelease) {
backend()->onReleaseBuffer(t, Backend::DYNAMIC); backend()->onReleaseBuffer(t, Backend::DYNAMIC);
@ -175,7 +192,7 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
if (region->size[0] * region->size[1] * region->size[2] < thredHold) { if (region->size[0] * region->size[1] * region->size[2] < thredHold) {
return NO_ERROR; return NO_ERROR;
} }
auto ptr = mTempInputCopy[0].first; auto tensorPtr = mTempInputCopy[0].first;
int pos = -1; int pos = -1;
for (int i=0; i<3; ++i) { for (int i=0; i<3; ++i) {
if (region->size[i] > 1) { if (region->size[i] > 1) {
@ -212,7 +229,7 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
for (int v=pos+1; v<3; ++v) { for (int v=pos+1; v<3; ++v) {
cacheReg.size[v] = region->size[v]; cacheReg.size[v] = region->size[v];
} }
mTempInputCopy.emplace_back(std::make_pair(ptr, cacheRegPtr.get())); mTempInputCopy.emplace_back(std::make_pair(tensorPtr, cacheRegPtr.get()));
mCacheRegions.emplace_back(cacheRegPtr); mCacheRegions.emplace_back(cacheRegPtr);
} }
} }
@ -318,7 +335,7 @@ void CPURaster::executeFaster(const std::vector<Tensor *> &inputs, const std::ve
auto& iter = mFastBlit[u]; auto& iter = mFastBlit[u];
auto& slice = iter.second; auto& slice = iter.second;
//Offset use byte //Offset use byte
auto srcPtr = (uint8_t*)iter.first + slice.src.offset * bytes; auto srcPtr = iter.first->host<uint8_t>() + slice.src.offset * bytes;
auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes; auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes;
if (slice.src.stride[1] == slice.size[2] && slice.dst.stride[1] == slice.size[2] && slice.src.stride[2] == 1) { if (slice.src.stride[1] == slice.size[2] && slice.dst.stride[1] == slice.size[2] && slice.src.stride[2] == 1) {
for (int z=0; z<slice.size[0]; ++z) { for (int z=0; z<slice.size[0]; ++z) {
@ -543,6 +560,11 @@ void CPURaster::tensorConvert(Tensor* input, Tensor* output, int bytes) {
ErrorCode CPURaster::onExecute(const std::vector<Tensor *> &____inputs, const std::vector<Tensor *> &outputs) { ErrorCode CPURaster::onExecute(const std::vector<Tensor *> &____inputs, const std::vector<Tensor *> &outputs) {
if (nullptr != mTempOutput) {
mOutputPtr = mTempOutput->host<void>();
} else {
mOutputPtr = outputs[0]->host<void>();
}
if (mFast) { if (mFast) {
executeFaster(____inputs, outputs); executeFaster(____inputs, outputs);
return NO_ERROR; return NO_ERROR;
@ -607,7 +629,7 @@ ErrorCode CPURaster::onExecute(const std::vector<Tensor *> &____inputs, const st
for (int u=tId; u<mTempInputCopy.size(); u+=threadNum) { for (int u=tId; u<mTempInputCopy.size(); u+=threadNum) {
auto& iter = mTempInputCopy[u]; auto& iter = mTempInputCopy[u];
auto& slice = *(iter.second); auto& slice = *(iter.second);
auto srcPtr = (uint8_t*)iter.first + slice.src.offset * bytes; auto srcPtr = iter.first->host<uint8_t>() + slice.src.offset * bytes;
auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes; auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes;
_blit(slice, bytes, srcPtr, dstPtr, proc); _blit(slice, bytes, srcPtr, dstPtr, proc);
} }
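mFastBlit and mTempInputCopy now record the source Tensor itself instead of a host pointer captured during onResize, and host<uint8_t>() is looked up on every execution, after the backend has finalized addresses. A reduced model of the deferred lookup (types simplified, names ours):

#include <cstddef>
#include <cstdint>
#include <cstring>
struct TensorLike { uint8_t* storage = nullptr; uint8_t* host() const { return storage; } };
struct Blit { const TensorLike* src; size_t srcOffset, dstOffset, size; };
// The address is read at run time, so the allocator may move storage between
// onResize and onExecute without invalidating the recorded blit list.
void runBlit(const Blit& b, uint8_t* dstBase) {
    std::memcpy(dstBase + b.dstOffset, b.src->host() + b.srcOffset, b.size);
}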
@ -752,13 +774,12 @@ public:
} }
auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber(); auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
if (mMaxCacheSize > 0 || mMaxFuseBufferSize > 0) { if (mMaxCacheSize > 0 || mMaxFuseBufferSize > 0) {
auto buffer = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(threadNumber * (mMaxCacheSize + mMaxFuseBufferSize)); mCacheBuffer = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(threadNumber * (mMaxCacheSize + mMaxFuseBufferSize));
if (nullptr == buffer.first) { if (mCacheBuffer.invalid()) {
return OUT_OF_MEMORY; return OUT_OF_MEMORY;
} }
mCacheBuffer = (uint8_t*)buffer.first + buffer.second;
mFuseBuffer = mCacheBuffer + threadNumber * mMaxCacheSize; mFuseBuffer = mCacheBuffer + threadNumber * mMaxCacheSize;
static_cast<CPUBackend*>(backend())->getBufferAllocator()->free(buffer); static_cast<CPUBackend*>(backend())->getBufferAllocator()->free(mCacheBuffer);
} }
return NO_ERROR; return NO_ERROR;
} }
@ -887,7 +908,7 @@ public:
auto dstOrigin = (uint8_t*)mContainer[tId].stackPtr[cmd->indexes()->data()[0]]; auto dstOrigin = (uint8_t*)mContainer[tId].stackPtr[cmd->indexes()->data()[0]];
auto dst = dstOrigin; auto dst = dstOrigin;
if (cmd->fuse() >= 0) { if (cmd->fuse() >= 0) {
dst = fuseBuffer; dst = fuseBuffer.ptr();
} }
do { do {
if (OpType_UnaryOp == op->type()) { if (OpType_UnaryOp == op->type()) {
@ -921,7 +942,7 @@ public:
} }
} else { } else {
// Blit to cache // Blit to cache
auto srcCache = mCacheBuffer + mMaxCacheSize * tId; auto srcCache = mCacheBuffer.ptr() + mMaxCacheSize * tId;
for (int z=0; z<cmd->size()->data()[0]; ++z) { for (int z=0; z<cmd->size()->data()[0]; ++z) {
auto srcZ = src + z * cmd->view()->GetAs<View>(1)->stride()->data()[0] * bytes; auto srcZ = src + z * cmd->view()->GetAs<View>(1)->stride()->data()[0] * bytes;
auto dstZ = dst + z * outputStride[0] * bytes; auto dstZ = dst + z * outputStride[0] * bytes;
@ -978,7 +999,7 @@ public:
} }
} }
} else { } else {
auto cache0 = mCacheBuffer + mMaxCacheSize * tId; auto cache0 = mCacheBuffer.ptr() + mMaxCacheSize * tId;
auto cache1 = cache0 + cmd->size()->data()[2] * bytes; auto cache1 = cache0 + cmd->size()->data()[2] * bytes;
for (int z=0; z<cmd->size()->data()[0]; ++z) { for (int z=0; z<cmd->size()->data()[0]; ++z) {
auto src0Z = src0 + z * stride1[0] * bytes; auto src0Z = src0 + z * stride1[0] * bytes;
@ -1080,9 +1101,8 @@ private:
const LoopParam* mLoop; const LoopParam* mLoop;
std::vector<Tensor*> mStack; std::vector<Tensor*> mStack;
std::vector<ThreadContainer> mContainer; std::vector<ThreadContainer> mContainer;
uint8_t* mCacheBuffer = nullptr; MemChunk mCacheBuffer, mFuseBuffer;
int mMaxCacheSize = 0; int mMaxCacheSize = 0;
uint8_t* mFuseBuffer = nullptr;
int mMaxFuseBufferSize = 0; int mMaxFuseBufferSize = 0;
}; };
View File
@ -28,8 +28,8 @@ public:
void tensorConvert(Tensor* input, Tensor* output, int bytes); void tensorConvert(Tensor* input, Tensor* output, int bytes);
private: private:
std::map<Tensor*, Tensor*> mTempInput; std::map<Tensor*, Tensor*> mTempInput;
std::vector<std::pair<void*, Tensor::InsideDescribe::Region*>> mTempInputCopy; std::vector<std::pair<const Tensor*, Tensor::InsideDescribe::Region*>> mTempInputCopy;
std::vector<std::pair<void*, Tensor::InsideDescribe::Region>> mFastBlit; std::vector<std::pair<const Tensor*, Tensor::InsideDescribe::Region>> mFastBlit;
std::shared_ptr<Tensor> mTempOutput; std::shared_ptr<Tensor> mTempOutput;
void* mOutputPtr; void* mOutputPtr;
bool mNeedZero = false; bool mNeedZero = false;
View File
@ -1,4 +1,6 @@
#include "CPUResizeCache.hpp" #include "CPUResizeCache.hpp"
#include "../../core/TensorUtils.hpp"
namespace MNN { namespace MNN {
Tensor* CPUResizeCache::findCacheTensor(const Tensor* src, MNN_DATA_FORMAT format) const { Tensor* CPUResizeCache::findCacheTensor(const Tensor* src, MNN_DATA_FORMAT format) const {
auto iter = mFormatCache.find(std::make_pair(src, format)); auto iter = mFormatCache.find(std::make_pair(src, format));
@ -14,5 +16,9 @@ void CPUResizeCache::pushCacheTensor(std::shared_ptr<Tensor> dst, const Tensor*
void CPUResizeCache::reset() { void CPUResizeCache::reset() {
mFormatCache.clear(); mFormatCache.clear();
} }
void CPUResizeCache::release() {
for (auto iter : mFormatCache) {
TensorUtils::getDescribe(iter.second.get())->mem.reset(nullptr);
}
}
}; };
View File
@ -19,6 +19,7 @@ public:
// Return cache tensor // Return cache tensor
void pushCacheTensor(std::shared_ptr<Tensor> dst, const Tensor* src, MNN_DATA_FORMAT format); void pushCacheTensor(std::shared_ptr<Tensor> dst, const Tensor* src, MNN_DATA_FORMAT format);
void reset(); void reset();
void release();
private: private:
std::map<std::pair<const Tensor*, MNN_DATA_FORMAT>, std::shared_ptr<Tensor>> mFormatCache; std::map<std::pair<const Tensor*, MNN_DATA_FORMAT>, std::shared_ptr<Tensor>> mFormatCache;
}; };
View File
@ -647,7 +647,7 @@ L1Loop:
ld1 {v4.8b}, [x1], #8 // src: k:6,7 ld1 {v4.8b}, [x1], #8 // src: k:6,7
ld1 {v4.s}[2], [x1] ld1 {v4.s}[2], [x1]
mov v9.4s, v16.4s mov v9.16b, v16.16b
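// Note: "mov Vd.4s, Vn.4s" is not an architectural MOV alias; the vector
// register copy is "mov Vd.16b, Vn.16b" (an alias of ORR), hence the
// .4s to .16b rewrites here and throughout the kernels below.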
sxtl2 v6.8h, v4.16b sxtl2 v6.8h, v4.16b
tbl v7.16b, {v2.16b, v3.16b}, v24.16b // src0 tbl v7.16b, {v2.16b, v3.16b}, v24.16b // src0
View File
@ -84,14 +84,14 @@ LoopE8:
sxtl2 v11.4s, v1.8h sxtl2 v11.4s, v1.8h
scvtf v0.4s, v8.4s scvtf v0.4s, v8.4s
scvtf v1.4s, v9.4s scvtf v1.4s, v9.4s
mov v8.4s, v14.4s mov v8.16b, v14.16b
mov v9.4s, v15.4s mov v9.16b, v15.16b
fmla v8.4s, v0.4s, v12.4s fmla v8.4s, v0.4s, v12.4s
fmla v9.4s, v1.4s, v13.4s fmla v9.4s, v1.4s, v13.4s
scvtf v0.4s, v10.4s scvtf v0.4s, v10.4s
scvtf v1.4s, v11.4s scvtf v1.4s, v11.4s
mov v10.4s, v14.4s mov v10.16b, v14.16b
mov v11.4s, v15.4s mov v11.16b, v15.16b
fmla v10.4s, v0.4s, v12.4s fmla v10.4s, v0.4s, v12.4s
fmla v11.4s, v1.4s, v13.4s fmla v11.4s, v1.4s, v13.4s
ld1 {v0.4s, v1.4s}, [x15], x11 ld1 {v0.4s, v1.4s}, [x15], x11
@ -153,14 +153,14 @@ LoopE8:
sxtl2 v11.4s, v1.8h sxtl2 v11.4s, v1.8h
scvtf v0.4s, v8.4s scvtf v0.4s, v8.4s
scvtf v1.4s, v9.4s scvtf v1.4s, v9.4s
mov v8.4s, v14.4s mov v8.16b, v14.16b
mov v9.4s, v15.4s mov v9.16b, v15.16b
fmla v8.4s, v0.4s, v12.4s fmla v8.4s, v0.4s, v12.4s
fmla v9.4s, v1.4s, v13.4s fmla v9.4s, v1.4s, v13.4s
scvtf v0.4s, v10.4s scvtf v0.4s, v10.4s
scvtf v1.4s, v11.4s scvtf v1.4s, v11.4s
mov v10.4s, v14.4s mov v10.16b, v14.16b
mov v11.4s, v15.4s mov v11.16b, v15.16b
fmla v10.4s, v0.4s, v12.4s fmla v10.4s, v0.4s, v12.4s
fmla v11.4s, v1.4s, v13.4s fmla v11.4s, v1.4s, v13.4s
ld1 {v0.4s, v1.4s}, [x15], x11 ld1 {v0.4s, v1.4s}, [x15], x11
@ -321,14 +321,14 @@ LoopE8:
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v14.4s mov v8.16b, v14.16b
mov v9.4s, v14.4s mov v9.16b, v14.16b
fmla v8.4s, v12.4s, v4.4s fmla v8.4s, v12.4s, v4.4s
fmla v9.4s, v13.4s, v4.4s fmla v9.4s, v13.4s, v4.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v14.4s mov v10.16b, v14.16b
mov v11.4s, v14.4s mov v11.16b, v14.16b
fmla v10.4s, v12.4s, v4.4s fmla v10.4s, v12.4s, v4.4s
fmla v11.4s, v13.4s, v4.4s fmla v11.4s, v13.4s, v4.4s
@ -405,14 +405,14 @@ LoopE8:
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v14.4s mov v8.16b, v14.16b
mov v9.4s, v14.4s mov v9.16b, v14.16b
fmla v8.4s, v12.4s, v4.4s fmla v8.4s, v12.4s, v4.4s
fmla v9.4s, v13.4s, v4.4s fmla v9.4s, v13.4s, v4.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v14.4s mov v10.16b, v14.16b
mov v11.4s, v14.4s mov v11.16b, v14.16b
fmla v10.4s, v12.4s, v4.4s fmla v10.4s, v12.4s, v4.4s
fmla v11.4s, v13.4s, v4.4s fmla v11.4s, v13.4s, v4.4s
@ -564,14 +564,14 @@ blt E1
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v26.4s mov v8.16b, v26.16b
mov v9.4s, v27.4s mov v9.16b, v27.16b
fmla v8.4s, v12.4s, v24.4s fmla v8.4s, v12.4s, v24.4s
fmla v9.4s, v13.4s, v25.4s fmla v9.4s, v13.4s, v25.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v26.4s mov v10.16b, v26.16b
mov v11.4s, v27.4s mov v11.16b, v27.16b
fmla v10.4s, v12.4s, v24.4s fmla v10.4s, v12.4s, v24.4s
fmla v11.4s, v13.4s, v25.4s fmla v11.4s, v13.4s, v25.4s
@ -616,14 +616,14 @@ blt E1
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v26.4s mov v8.16b, v26.16b
mov v9.4s, v27.4s mov v9.16b, v27.16b
fmla v8.4s, v12.4s, v24.4s fmla v8.4s, v12.4s, v24.4s
fmla v9.4s, v13.4s, v25.4s fmla v9.4s, v13.4s, v25.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v26.4s mov v10.16b, v26.16b
mov v11.4s, v27.4s mov v11.16b, v27.16b
fmla v10.4s, v12.4s, v24.4s fmla v10.4s, v12.4s, v24.4s
fmla v11.4s, v13.4s, v25.4s fmla v11.4s, v13.4s, v25.4s
ld1 {v0.4s}, [x15], x11 ld1 {v0.4s}, [x15], x11
@ -721,7 +721,7 @@ blt E1
mvni v9.4s, #6 mvni v9.4s, #6
add v3.4s, v3.4s, v9.4s add v3.4s, v3.4s, v9.4s
scvtf v3.4s, v3.4s scvtf v3.4s, v3.4s
mov v4.4s, v2.4s mov v4.16b, v2.16b
fmla v4.4s, v3.4s, v1.4s fmla v4.4s, v3.4s, v1.4s
ld1 {v0.4s}, [x15], x11 ld1 {v0.4s}, [x15], x11
@ -756,16 +756,16 @@ blt E1
ld1 {v0.4s}, [x15], x11 ld1 {v0.4s}, [x15], x11
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v14.4s mov v8.16b, v14.16b
mov v9.4s, v14.4s mov v9.16b, v14.16b
fmla v8.4s, v12.4s, v4.4s fmla v8.4s, v12.4s, v4.4s
ld1 {v1.4s}, [x15], x11 ld1 {v1.4s}, [x15], x11
fmla v9.4s, v13.4s, v4.4s fmla v9.4s, v13.4s, v4.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
ld1 {v2.4s}, [x15], x11 ld1 {v2.4s}, [x15], x11
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v14.4s mov v10.16b, v14.16b
mov v11.4s, v14.4s mov v11.16b, v14.16b
ld1 {v3.4s}, [x15], x11 ld1 {v3.4s}, [x15], x11
fmla v10.4s, v12.4s, v4.4s fmla v10.4s, v12.4s, v4.4s
fmla v11.4s, v13.4s, v4.4s fmla v11.4s, v13.4s, v4.4s
@ -810,7 +810,7 @@ blt E1
mvni v9.4s, #6 mvni v9.4s, #6
add v3.4s, v3.4s, v9.4s add v3.4s, v3.4s, v9.4s
scvtf v3.4s, v3.4s scvtf v3.4s, v3.4s
mov v4.4s, v2.4s mov v4.16b, v2.16b
fmla v4.4s, v3.4s, v1.4s fmla v4.4s, v3.4s, v1.4s
ld1 {v0.4s}, [x15], x11 ld1 {v0.4s}, [x15], x11
@ -840,14 +840,14 @@ blt E1
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v14.4s mov v8.16b, v14.16b
mov v9.4s, v14.4s mov v9.16b, v14.16b
fmla v8.4s, v12.4s, v4.4s fmla v8.4s, v12.4s, v4.4s
fmla v9.4s, v13.4s, v4.4s fmla v9.4s, v13.4s, v4.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v14.4s mov v10.16b, v14.16b
mov v11.4s, v14.4s mov v11.16b, v14.16b
fmla v10.4s, v12.4s, v4.4s fmla v10.4s, v12.4s, v4.4s
fmla v11.4s, v13.4s, v4.4s fmla v11.4s, v13.4s, v4.4s
// ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x15], x11 // ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x15], x11
@ -953,14 +953,14 @@ LoopE1:
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v26.4s mov v8.16b, v26.16b
mov v9.4s, v27.4s mov v9.16b, v27.16b
fmla v8.4s, v12.4s, v24.4s fmla v8.4s, v12.4s, v24.4s
fmla v9.4s, v13.4s, v25.4s fmla v9.4s, v13.4s, v25.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v26.4s mov v10.16b, v26.16b
mov v11.4s, v27.4s mov v11.16b, v27.16b
fmla v10.4s, v12.4s, v24.4s fmla v10.4s, v12.4s, v24.4s
fmla v11.4s, v13.4s, v25.4s fmla v11.4s, v13.4s, v25.4s
ld1 {v0.s}[0], [x15], x11 ld1 {v0.s}[0], [x15], x11
@ -989,14 +989,14 @@ LoopE1:
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v26.4s mov v8.16b, v26.16b
mov v9.4s, v27.4s mov v9.16b, v27.16b
fmla v8.4s, v12.4s, v24.4s fmla v8.4s, v12.4s, v24.4s
fmla v9.4s, v13.4s, v25.4s fmla v9.4s, v13.4s, v25.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v26.4s mov v10.16b, v26.16b
mov v11.4s, v27.4s mov v11.16b, v27.16b
fmla v10.4s, v12.4s, v24.4s fmla v10.4s, v12.4s, v24.4s
fmla v11.4s, v13.4s, v25.4s fmla v11.4s, v13.4s, v25.4s
ld1 {v0.s}[0], [x15], x11 ld1 {v0.s}[0], [x15], x11
@ -1059,14 +1059,14 @@ LoopE1:
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v14.4s mov v8.16b, v14.16b
mov v9.4s, v14.4s mov v9.16b, v14.16b
fmla v8.4s, v12.4s, v4.4s fmla v8.4s, v12.4s, v4.4s
fmla v9.4s, v13.4s, v4.4s fmla v9.4s, v13.4s, v4.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v14.4s mov v10.16b, v14.16b
mov v11.4s, v14.4s mov v11.16b, v14.16b
fmla v10.4s, v12.4s, v4.4s fmla v10.4s, v12.4s, v4.4s
fmla v11.4s, v13.4s, v4.4s fmla v11.4s, v13.4s, v4.4s
@ -1102,14 +1102,14 @@ LoopE1:
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v15.4s mov v8.16b, v15.16b
mov v9.4s, v15.4s mov v9.16b, v15.16b
fmla v8.4s, v12.4s, v4.4s fmla v8.4s, v12.4s, v4.4s
fmla v9.4s, v13.4s, v4.4s fmla v9.4s, v13.4s, v4.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v15.4s mov v10.16b, v15.16b
mov v11.4s, v15.4s mov v11.16b, v15.16b
fmla v10.4s, v12.4s, v4.4s fmla v10.4s, v12.4s, v4.4s
fmla v11.4s, v13.4s, v4.4s fmla v11.4s, v13.4s, v4.4s
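In these dequant sequences, sxtl/sxtl2 widen int8 lanes, scvtf converts them to float, and the mov before each fmla seeds the destination with the bias vector so fmla can accumulate x * scale into it, i.e. out = bias + float(q) * scale. A scalar C++ model of one lane, for orientation (names are ours):

#include <cstdint>
// Scalar model of the vector pattern: widen, convert, seed with bias, fmla.
float dequantLane(int8_t q, float scale, float bias) {
    int32_t wide = (int32_t)q;   // sxtl / sxtl2
    float x = (float)wide;       // scvtf
    float out = bias;            // mov vD.16b, vBias.16b
    out += x * scale;            // fmla vD.4s, vX.4s, vScale.4s
    return out;
}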
View File
@ -74,14 +74,14 @@ LoopE8:
sxtl2 v11.4s, v1.8h sxtl2 v11.4s, v1.8h
scvtf v0.4s, v8.4s scvtf v0.4s, v8.4s
scvtf v1.4s, v9.4s scvtf v1.4s, v9.4s
mov v8.4s, v14.4s mov v8.16b, v14.16b
mov v9.4s, v15.4s mov v9.16b, v15.16b
fmla v8.4s, v0.4s, v12.4s fmla v8.4s, v0.4s, v12.4s
fmla v9.4s, v1.4s, v13.4s fmla v9.4s, v1.4s, v13.4s
scvtf v0.4s, v10.4s scvtf v0.4s, v10.4s
scvtf v1.4s, v11.4s scvtf v1.4s, v11.4s
mov v10.4s, v14.4s mov v10.16b, v14.16b
mov v11.4s, v15.4s mov v11.16b, v15.16b
fmla v10.4s, v0.4s, v12.4s fmla v10.4s, v0.4s, v12.4s
fmla v11.4s, v1.4s, v13.4s fmla v11.4s, v1.4s, v13.4s
ld1 {v0.4s, v1.4s}, [x15], x11 ld1 {v0.4s, v1.4s}, [x15], x11
@ -137,14 +137,14 @@ LoopE8:
sxtl2 v11.4s, v1.8h sxtl2 v11.4s, v1.8h
scvtf v0.4s, v8.4s scvtf v0.4s, v8.4s
scvtf v1.4s, v9.4s scvtf v1.4s, v9.4s
mov v8.4s, v14.4s mov v8.16b, v14.16b
mov v9.4s, v15.4s mov v9.16b, v15.16b
fmla v8.4s, v0.4s, v12.4s fmla v8.4s, v0.4s, v12.4s
fmla v9.4s, v1.4s, v13.4s fmla v9.4s, v1.4s, v13.4s
scvtf v0.4s, v10.4s scvtf v0.4s, v10.4s
scvtf v1.4s, v11.4s scvtf v1.4s, v11.4s
mov v10.4s, v14.4s mov v10.16b, v14.16b
mov v11.4s, v15.4s mov v11.16b, v15.16b
fmla v10.4s, v0.4s, v12.4s fmla v10.4s, v0.4s, v12.4s
fmla v11.4s, v1.4s, v13.4s fmla v11.4s, v1.4s, v13.4s
ld1 {v0.4s, v1.4s}, [x15], x11 ld1 {v0.4s, v1.4s}, [x15], x11
@ -294,14 +294,14 @@ LoopE8:
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v14.4s mov v8.16b, v14.16b
mov v9.4s, v14.4s mov v9.16b, v14.16b
fmla v8.4s, v12.4s, v4.4s fmla v8.4s, v12.4s, v4.4s
fmla v9.4s, v13.4s, v4.4s fmla v9.4s, v13.4s, v4.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v14.4s mov v10.16b, v14.16b
mov v11.4s, v14.4s mov v11.16b, v14.16b
fmla v10.4s, v12.4s, v4.4s fmla v10.4s, v12.4s, v4.4s
fmla v11.4s, v13.4s, v4.4s fmla v11.4s, v13.4s, v4.4s
@ -371,14 +371,14 @@ LoopE8:
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v14.4s mov v8.16b, v14.16b
mov v9.4s, v14.4s mov v9.16b, v14.16b
fmla v8.4s, v12.4s, v4.4s fmla v8.4s, v12.4s, v4.4s
fmla v9.4s, v13.4s, v4.4s fmla v9.4s, v13.4s, v4.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v14.4s mov v10.16b, v14.16b
mov v11.4s, v14.4s mov v11.16b, v14.16b
fmla v10.4s, v12.4s, v4.4s fmla v10.4s, v12.4s, v4.4s
fmla v11.4s, v13.4s, v4.4s fmla v11.4s, v13.4s, v4.4s
@ -520,14 +520,14 @@ blt E1
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v26.4s mov v8.16b, v26.16b
mov v9.4s, v27.4s mov v9.16b, v27.16b
fmla v8.4s, v12.4s, v24.4s fmla v8.4s, v12.4s, v24.4s
fmla v9.4s, v13.4s, v25.4s fmla v9.4s, v13.4s, v25.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v26.4s mov v10.16b, v26.16b
mov v11.4s, v27.4s mov v11.16b, v27.16b
fmla v10.4s, v12.4s, v24.4s fmla v10.4s, v12.4s, v24.4s
fmla v11.4s, v13.4s, v25.4s fmla v11.4s, v13.4s, v25.4s
// st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0] // st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0]
@ -567,14 +567,14 @@ blt E1
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v26.4s mov v8.16b, v26.16b
mov v9.4s, v27.4s mov v9.16b, v27.16b
fmla v8.4s, v12.4s, v24.4s fmla v8.4s, v12.4s, v24.4s
fmla v9.4s, v13.4s, v25.4s fmla v9.4s, v13.4s, v25.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v26.4s mov v10.16b, v26.16b
mov v11.4s, v27.4s mov v11.16b, v27.16b
fmla v10.4s, v12.4s, v24.4s fmla v10.4s, v12.4s, v24.4s
fmla v11.4s, v13.4s, v25.4s fmla v11.4s, v13.4s, v25.4s
ld1 {v0.4s}, [x15], x11 ld1 {v0.4s}, [x15], x11
@ -669,16 +669,16 @@ blt E1
ld1 {v0.4s}, [x15], x11 ld1 {v0.4s}, [x15], x11
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v14.4s mov v8.16b, v14.16b
mov v9.4s, v14.4s mov v9.16b, v14.16b
fmla v8.4s, v12.4s, v4.4s fmla v8.4s, v12.4s, v4.4s
ld1 {v1.4s}, [x15], x11 ld1 {v1.4s}, [x15], x11
fmla v9.4s, v13.4s, v4.4s fmla v9.4s, v13.4s, v4.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
ld1 {v2.4s}, [x15], x11 ld1 {v2.4s}, [x15], x11
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v14.4s mov v10.16b, v14.16b
mov v11.4s, v14.4s mov v11.16b, v14.16b
ld1 {v3.4s}, [x15], x11 ld1 {v3.4s}, [x15], x11
fmla v10.4s, v12.4s, v4.4s fmla v10.4s, v12.4s, v4.4s
fmla v11.4s, v13.4s, v4.4s fmla v11.4s, v13.4s, v4.4s
@ -717,14 +717,14 @@ blt E1
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v14.4s mov v8.16b, v14.16b
mov v9.4s, v14.4s mov v9.16b, v14.16b
fmla v8.4s, v12.4s, v4.4s fmla v8.4s, v12.4s, v4.4s
fmla v9.4s, v13.4s, v4.4s fmla v9.4s, v13.4s, v4.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v14.4s mov v10.16b, v14.16b
mov v11.4s, v14.4s mov v11.16b, v14.16b
fmla v10.4s, v12.4s, v4.4s fmla v10.4s, v12.4s, v4.4s
fmla v11.4s, v13.4s, v4.4s fmla v11.4s, v13.4s, v4.4s
// ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x15], x11 // ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x15], x11
@ -819,14 +819,14 @@ LoopE1:
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v26.4s mov v8.16b, v26.16b
mov v9.4s, v27.4s mov v9.16b, v27.16b
fmla v8.4s, v12.4s, v24.4s fmla v8.4s, v12.4s, v24.4s
fmla v9.4s, v13.4s, v25.4s fmla v9.4s, v13.4s, v25.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v26.4s mov v10.16b, v26.16b
mov v11.4s, v27.4s mov v11.16b, v27.16b
fmla v10.4s, v12.4s, v24.4s fmla v10.4s, v12.4s, v24.4s
fmla v11.4s, v13.4s, v25.4s fmla v11.4s, v13.4s, v25.4s
ld1 {v0.s}[0], [x15], x11 ld1 {v0.s}[0], [x15], x11
@ -849,14 +849,14 @@ LoopE1:
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v26.4s mov v8.16b, v26.16b
mov v9.4s, v27.4s mov v9.16b, v27.16b
fmla v8.4s, v12.4s, v24.4s fmla v8.4s, v12.4s, v24.4s
fmla v9.4s, v13.4s, v25.4s fmla v9.4s, v13.4s, v25.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v26.4s mov v10.16b, v26.16b
mov v11.4s, v27.4s mov v11.16b, v27.16b
fmla v10.4s, v12.4s, v24.4s fmla v10.4s, v12.4s, v24.4s
fmla v11.4s, v13.4s, v25.4s fmla v11.4s, v13.4s, v25.4s
ld1 {v0.s}[0], [x15], x11 ld1 {v0.s}[0], [x15], x11
@ -909,14 +909,14 @@ LoopE1:
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v14.4s mov v8.16b, v14.16b
mov v9.4s, v14.4s mov v9.16b, v14.16b
fmla v8.4s, v12.4s, v4.4s fmla v8.4s, v12.4s, v4.4s
fmla v9.4s, v13.4s, v4.4s fmla v9.4s, v13.4s, v4.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v14.4s mov v10.16b, v14.16b
mov v11.4s, v14.4s mov v11.16b, v14.16b
fmla v10.4s, v12.4s, v4.4s fmla v10.4s, v12.4s, v4.4s
fmla v11.4s, v13.4s, v4.4s fmla v11.4s, v13.4s, v4.4s
@ -944,14 +944,14 @@ LoopE1:
sxtl2 v11.4s, v12.8h sxtl2 v11.4s, v12.8h
scvtf v12.4s, v8.4s scvtf v12.4s, v8.4s
scvtf v13.4s, v9.4s scvtf v13.4s, v9.4s
mov v8.4s, v15.4s mov v8.16b, v15.16b
mov v9.4s, v15.4s mov v9.16b, v15.16b
fmla v8.4s, v12.4s, v4.4s fmla v8.4s, v12.4s, v4.4s
fmla v9.4s, v13.4s, v4.4s fmla v9.4s, v13.4s, v4.4s
scvtf v12.4s, v10.4s scvtf v12.4s, v10.4s
scvtf v13.4s, v11.4s scvtf v13.4s, v11.4s
mov v10.4s, v15.4s mov v10.16b, v15.16b
mov v11.4s, v15.4s mov v11.16b, v15.16b
fmla v10.4s, v12.4s, v4.4s fmla v10.4s, v12.4s, v4.4s
fmla v11.4s, v13.4s, v4.4s fmla v11.4s, v13.4s, v4.4s
View File
@ -68,9 +68,9 @@ LoopH:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v0.4s, v1.4s scvtf v0.4s, v1.4s
scvtf v1.4s, v2.4s scvtf v1.4s, v2.4s
mov v2.4s, v7.4s mov v2.16b, v7.16b
fmla v2.4s, v1.4s, v5.4s fmla v2.4s, v1.4s, v5.4s
mov v1.4s, v6.4s mov v1.16b, v6.16b
fmla v1.4s, v0.4s, v4.4s fmla v1.4s, v0.4s, v4.4s
ld1 {v0.4s}, [x15], #16 ld1 {v0.4s}, [x15], #16
@ -108,9 +108,9 @@ LoopH:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v0.4s, v1.4s scvtf v0.4s, v1.4s
scvtf v1.4s, v2.4s scvtf v1.4s, v2.4s
mov v2.4s, v7.4s mov v2.16b, v7.16b
fmla v2.4s, v1.4s, v5.4s fmla v2.4s, v1.4s, v5.4s
mov v1.4s, v6.4s mov v1.16b, v6.16b
fmla v1.4s, v0.4s, v4.4s fmla v1.4s, v0.4s, v4.4s
ld1 {v0.4s}, [x15], #16 ld1 {v0.4s}, [x15], #16
@ -164,9 +164,9 @@ LoopH:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v0.4s, v1.4s scvtf v0.4s, v1.4s
scvtf v1.4s, v2.4s scvtf v1.4s, v2.4s
mov v2.4s, v7.4s mov v2.16b, v7.16b
fmla v2.4s, v1.4s, v5.4s fmla v2.4s, v1.4s, v5.4s
mov v1.4s, v6.4s mov v1.16b, v6.16b
fmla v1.4s, v0.4s, v4.4s fmla v1.4s, v0.4s, v4.4s
ld1 {v0.4s}, [x15], #16 ld1 {v0.4s}, [x15], #16
@ -204,9 +204,9 @@ LoopH:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v0.4s, v1.4s scvtf v0.4s, v1.4s
scvtf v1.4s, v2.4s scvtf v1.4s, v2.4s
mov v2.4s, v7.4s mov v2.16b, v7.16b
fmla v2.4s, v1.4s, v5.4s fmla v2.4s, v1.4s, v5.4s
mov v1.4s, v6.4s mov v1.16b, v6.16b
fmla v1.4s, v0.4s, v4.4s fmla v1.4s, v0.4s, v4.4s
ld1 {v0.4s}, [x15], #16 ld1 {v0.4s}, [x15], #16
@ -386,8 +386,8 @@ LoopHRemain:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v1.4s, v1.4s scvtf v1.4s, v1.4s
scvtf v2.4s, v2.4s scvtf v2.4s, v2.4s
mov v3.4s, v21.4s mov v3.16b, v21.16b
mov v4.4s, v21.4s mov v4.16b, v21.16b
fmla v3.4s, v1.4s, v20.4s fmla v3.4s, v1.4s, v20.4s
fmla v4.4s, v2.4s, v20.4s fmla v4.4s, v2.4s, v20.4s
@ -428,8 +428,8 @@ LoopHRemain:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v1.4s, v1.4s scvtf v1.4s, v1.4s
scvtf v2.4s, v2.4s scvtf v2.4s, v2.4s
mov v3.4s, v21.4s mov v3.16b, v21.16b
mov v4.4s, v21.4s mov v4.16b, v21.16b
fmla v3.4s, v1.4s, v20.4s fmla v3.4s, v1.4s, v20.4s
fmla v4.4s, v2.4s, v20.4s fmla v4.4s, v2.4s, v20.4s
@ -483,8 +483,8 @@ LoopHRemain:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v1.4s, v1.4s scvtf v1.4s, v1.4s
scvtf v2.4s, v2.4s scvtf v2.4s, v2.4s
mov v3.4s, v21.4s mov v3.16b, v21.16b
mov v4.4s, v21.4s mov v4.16b, v21.16b
fmla v3.4s, v1.4s, v20.4s fmla v3.4s, v1.4s, v20.4s
fmla v4.4s, v2.4s, v20.4s fmla v4.4s, v2.4s, v20.4s
ld1 {v0.4s, v1.4s, v2.4s}, [x15], #48 ld1 {v0.4s, v1.4s, v2.4s}, [x15], #48
@ -520,8 +520,8 @@ LoopHRemain:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v1.4s, v1.4s scvtf v1.4s, v1.4s
scvtf v2.4s, v2.4s scvtf v2.4s, v2.4s
mov v3.4s, v21.4s mov v3.16b, v21.16b
mov v4.4s, v21.4s mov v4.16b, v21.16b
fmla v3.4s, v1.4s, v20.4s fmla v3.4s, v1.4s, v20.4s
fmla v4.4s, v2.4s, v20.4s fmla v4.4s, v2.4s, v20.4s
View File
@ -59,9 +59,9 @@ LoopH:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v0.4s, v1.4s scvtf v0.4s, v1.4s
scvtf v1.4s, v2.4s scvtf v1.4s, v2.4s
mov v2.4s, v7.4s mov v2.16b, v7.16b
fmla v2.4s, v1.4s, v5.4s fmla v2.4s, v1.4s, v5.4s
mov v1.4s, v6.4s mov v1.16b, v6.16b
fmla v1.4s, v0.4s, v4.4s fmla v1.4s, v0.4s, v4.4s
ld1 {v0.4s}, [x15], #16 ld1 {v0.4s}, [x15], #16
@ -99,9 +99,9 @@ LoopH:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v0.4s, v1.4s scvtf v0.4s, v1.4s
scvtf v1.4s, v2.4s scvtf v1.4s, v2.4s
mov v2.4s, v7.4s mov v2.16b, v7.16b
fmla v2.4s, v1.4s, v5.4s fmla v2.4s, v1.4s, v5.4s
mov v1.4s, v6.4s mov v1.16b, v6.16b
fmla v1.4s, v0.4s, v4.4s fmla v1.4s, v0.4s, v4.4s
ld1 {v0.4s}, [x15], #16 ld1 {v0.4s}, [x15], #16
@ -145,9 +145,9 @@ LoopH:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v0.4s, v1.4s scvtf v0.4s, v1.4s
scvtf v1.4s, v2.4s scvtf v1.4s, v2.4s
mov v2.4s, v7.4s mov v2.16b, v7.16b
fmla v2.4s, v1.4s, v5.4s fmla v2.4s, v1.4s, v5.4s
mov v1.4s, v6.4s mov v1.16b, v6.16b
fmla v1.4s, v0.4s, v4.4s fmla v1.4s, v0.4s, v4.4s
ld1 {v0.4s}, [x15], #16 ld1 {v0.4s}, [x15], #16
@ -185,9 +185,9 @@ LoopH:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v0.4s, v1.4s scvtf v0.4s, v1.4s
scvtf v1.4s, v2.4s scvtf v1.4s, v2.4s
mov v2.4s, v7.4s mov v2.16b, v7.16b
fmla v2.4s, v1.4s, v5.4s fmla v2.4s, v1.4s, v5.4s
mov v1.4s, v6.4s mov v1.16b, v6.16b
fmla v1.4s, v0.4s, v4.4s fmla v1.4s, v0.4s, v4.4s
ld1 {v0.4s}, [x15], #16 ld1 {v0.4s}, [x15], #16
@ -357,8 +357,8 @@ LoopHRemain:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v1.4s, v1.4s scvtf v1.4s, v1.4s
scvtf v2.4s, v2.4s scvtf v2.4s, v2.4s
mov v3.4s, v21.4s mov v3.16b, v21.16b
mov v4.4s, v21.4s mov v4.16b, v21.16b
fmla v3.4s, v1.4s, v20.4s fmla v3.4s, v1.4s, v20.4s
fmla v4.4s, v2.4s, v20.4s fmla v4.4s, v2.4s, v20.4s
@ -399,8 +399,8 @@ LoopHRemain:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v1.4s, v1.4s scvtf v1.4s, v1.4s
scvtf v2.4s, v2.4s scvtf v2.4s, v2.4s
mov v3.4s, v21.4s mov v3.16b, v21.16b
mov v4.4s, v21.4s mov v4.16b, v21.16b
fmla v3.4s, v1.4s, v20.4s fmla v3.4s, v1.4s, v20.4s
fmla v4.4s, v2.4s, v20.4s fmla v4.4s, v2.4s, v20.4s
@ -448,8 +448,8 @@ LoopHRemain:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v1.4s, v1.4s scvtf v1.4s, v1.4s
scvtf v2.4s, v2.4s scvtf v2.4s, v2.4s
mov v3.4s, v21.4s mov v3.16b, v21.16b
mov v4.4s, v21.4s mov v4.16b, v21.16b
fmla v3.4s, v1.4s, v20.4s fmla v3.4s, v1.4s, v20.4s
fmla v4.4s, v2.4s, v20.4s fmla v4.4s, v2.4s, v20.4s
ld1 {v0.4s, v1.4s, v2.4s}, [x15], #48 ld1 {v0.4s, v1.4s, v2.4s}, [x15], #48
@ -485,8 +485,8 @@ LoopHRemain:
sxtl2 v2.4s, v0.8h sxtl2 v2.4s, v0.8h
scvtf v1.4s, v1.4s scvtf v1.4s, v1.4s
scvtf v2.4s, v2.4s scvtf v2.4s, v2.4s
mov v3.4s, v21.4s mov v3.16b, v21.16b
mov v4.4s, v21.4s mov v4.16b, v21.16b
fmla v3.4s, v1.4s, v20.4s fmla v3.4s, v1.4s, v20.4s
fmla v4.4s, v2.4s, v20.4s fmla v4.4s, v2.4s, v20.4s
View File
@ -187,7 +187,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator(); auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT * mIm2ColCount, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums); auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT * mIm2ColCount, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first); mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
if (nullptr == mBlitInfo.first) { if (mBlitInfo.invalid()) {
return OUT_OF_MEMORY; return OUT_OF_MEMORY;
} }
bufferAlloc->free(mBlitInfo); bufferAlloc->free(mBlitInfo);
@ -236,7 +236,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
auto col_buffer_size = col_buffer_unit_size * mIm2ColCount; auto col_buffer_size = col_buffer_unit_size * mIm2ColCount;
auto threadFunction = [&](int tId) { auto threadFunction = [&](int tId) {
auto colAddr = im2colPtr + tId * mTempIm2ColBuffer->stride(0); auto colAddr = im2colPtr + tId * mTempIm2ColBuffer->stride(0);
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first); auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second); auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
int32_t info[4]; int32_t info[4];
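mBlitInfo is one allocation carved into per-thread slices: each slice begins with an array of source pointers, and the int32 coordinate array follows immediately after them, which is exactly what the srcPtr and el arithmetic above computes. A sketch of that carving under the same layout assumptions (names are ours):

#include <cstddef>
#include <cstdint>
struct ThreadSlice { const int8_t** srcPtr; int32_t* el; };
// base + tId * strideBytes selects the thread's slice; the coordinate array
// starts right after the `count` source pointers, as in the code above.
ThreadSlice sliceFor(uint8_t* base, size_t strideBytes, int tId, size_t count) {
    auto slice = base + tId * strideBytes;
    auto srcPtr = (const int8_t**)slice;
    auto el = (int32_t*)(srcPtr + count);
    return {srcPtr, el};
}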
View File
@ -31,7 +31,7 @@ protected:
std::shared_ptr<Tensor> mTempIm2ColBuffer; std::shared_ptr<Tensor> mTempIm2ColBuffer;
std::shared_ptr<CPUConvolution::ResourceInt8> mResource; std::shared_ptr<CPUConvolution::ResourceInt8> mResource;
CPUConvolution::MutableResourceInt8 mMutableResource; CPUConvolution::MutableResourceInt8 mMutableResource;
std::pair<void*, int> mBlitInfo; MemChunk mBlitInfo;
std::pair<size_t, size_t> mBlitInfoStride; std::pair<size_t, size_t> mBlitInfoStride;
int mIm2ColCount; int mIm2ColCount;
}; };
View File
@ -193,8 +193,9 @@ ErrorCode ConvInt8Winograd::onResize(const std::vector<Tensor *> &inputs, const
} }
for (auto& unit : mUnits) { for (auto& unit : mUnits) {
int sy = ALIMAX(unit.kyStart - mPadY, 0), sx = ALIMAX(unit.kxStart - mPadX, 0); int sy = ALIMAX(unit.kyStart - mPadY, 0), sx = ALIMAX(unit.kxStart - mPadX, 0);
auto srcData = input->host<float>() + (sy * iw + sx) * UNIT; auto srcChunk = TensorUtils::getDescribe(input)->mem->chunk() + (sy * iw + sx) * UNIT;
unit.input.reset(Tensor::create<float>({batch, ic, ih - sy, iw - sx}, srcData, Tensor::CAFFE_C4)); unit.input.reset(Tensor::createDevice<float>({batch, ic, ih - sy, iw - sx}, Tensor::CAFFE_C4));
TensorUtils::getDescribe(unit.input.get())->mem.reset(new CPUMemObj(nullptr, srcChunk, 0));
for (int i = 0; i < input->dimensions(); ++i) { for (int i = 0; i < input->dimensions(); ++i) {
unit.input->setStride(i, input->stride(i)); unit.input->setStride(i, input->stride(i));
} }
@ -296,6 +297,7 @@ ErrorCode ConvInt8Winograd::onExecute(const std::vector<Tensor *> &inputs, const
core->MNNInt8ScaleToFloat(mInputFloat->host<float>(), inputs[0]->host<int8_t>(), scale.data(), size / UNIT, inputQuant[1]); core->MNNInt8ScaleToFloat(mInputFloat->host<float>(), inputs[0]->host<int8_t>(), scale.data(), size / UNIT, inputQuant[1]);
std::vector<Tensor*> tmp_outputs; std::vector<Tensor*> tmp_outputs;
for (auto& unit : mUnits) { for (auto& unit : mUnits) {
unit.input->buffer().host = TensorUtils::getDescribe(unit.input.get())->mem->chunk().ptr();
auto ret = unit.runner->onExecute({unit.input.get()}, {unit.output.get()}); auto ret = unit.runner->onExecute({unit.input.get()}, {unit.output.get()});
if (ret != NO_ERROR) { if (ret != NO_ERROR) {
return ret; return ret;
View File
@ -14,6 +14,7 @@
#include "ConvOpt.h" #include "ConvOpt.h"
#include "core/Macro.h" #include "core/Macro.h"
#include "CommonOptFunction.h" #include "CommonOptFunction.h"
#include "core/TensorUtils.hpp"
namespace MNN { namespace MNN {
Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon *common, Backend *b, const float *originWeight, Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon *common, Backend *b, const float *originWeight,
@ -88,8 +89,9 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
auto matrixSizeE = output->height() * output->width() * input->batch(); auto matrixSizeE = output->height() * output->width() * input->batch();
auto outputPlane = output->height() * output->width(); auto outputPlane = output->height() * output->width();
mUnits.clear(); mUnits.clear();
auto inputPtr = input->host<uint8_t>(); auto inputPtr = TensorUtils::getDescribe(input)->mem->chunk();
auto outputPtr = output->host<uint8_t>(); auto outputPtr = TensorUtils::getDescribe(output)->mem->chunk();
std::shared_ptr<char> __autoFunction; std::shared_ptr<char> __autoFunction;
auto padY = mPadY; auto padY = mPadY;
auto padX = mPadX; auto padX = mPadX;
@ -124,9 +126,9 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
int l = ic; int l = ic;
int h = oc; int h = oc;
auto aPtr = inputPtr + core->pack * planeStart * bytes; auto aPtr = inputPtr + core->pack * planeStart * bytes;
auto bPtr = weightTensor->host<uint8_t>(); auto bPtr = TensorUtils::getDescribe(weightTensor)->mem->chunk();
auto cPtr = outputPtr + core->pack * planeStart * bytes; auto cPtr = outputPtr + core->pack * planeStart * bytes;
auto biasPtr = mResource->mBias->host<uint8_t>(); auto biasPtr = TensorUtils::getDescribe(mResource->mBias.get())->mem->chunk();
memoryPool->beginGroup(); memoryPool->beginGroup();
auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters); auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters);
if (NO_ERROR != code) { if (NO_ERROR != code) {
@ -168,9 +170,9 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
int l = ic; int l = ic;
int h = std::min(ocSize * core->pack, ocWeightSize * hPack); int h = std::min(ocSize * core->pack, ocWeightSize * hPack);
auto aPtr = inputPtr; auto aPtr = inputPtr;
auto bPtr = mResource->mWeight->host<uint8_t>() + hPack * icAlign * ocStartWeight * bytes; auto bPtr = TensorUtils::getDescribe(mResource->mWeight.get())->mem->chunk() + hPack * icAlign * ocStartWeight * bytes;
auto cPtr = outputPtr + core->pack * matrixSizeE * ocStart * bytes; auto cPtr = outputPtr + core->pack * matrixSizeE * ocStart * bytes;
auto biasPtr = mResource->mBias->host<uint8_t>() + core->pack * ocStart * bytes; auto biasPtr = TensorUtils::getDescribe(mResource->mBias.get())->mem->chunk() + core->pack * ocStart * bytes;
memoryPool->beginGroup(); memoryPool->beginGroup();
auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters); auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters);
if (NO_ERROR != code) { if (NO_ERROR != code) {
View File
@ -413,7 +413,6 @@ ErrorCode DeconvolutionWithStride::onResize(const std::vector<Tensor*>& inputs,
if (!res) { if (!res) {
return OUT_OF_MEMORY; return OUT_OF_MEMORY;
} }
::memset(mSrcBuffer->host<float>(), 0, mSrcBuffer->size());
for (auto& unit : mComputeUnits) { for (auto& unit : mComputeUnits) {
backend()->onReleaseBuffer(unit.dstBuffer.get(), Backend::DYNAMIC); backend()->onReleaseBuffer(unit.dstBuffer.get(), Backend::DYNAMIC);
if (unit.winogradInfo.open) { if (unit.winogradInfo.open) {
@ -469,6 +468,7 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector<Tensor*>& inputs,
auto srcOrigin = input->host<float>(); auto srcOrigin = input->host<float>();
auto dstOrigin = output->host<float>(); auto dstOrigin = output->host<float>();
::memset(mSrcBuffer->host<float>(), 0, mSrcBuffer->size());
::memset(dstOrigin, 0, ow * oh * ocDiv4 * 4 * batchSize * sizeof(float)); ::memset(dstOrigin, 0, ow * oh * ocDiv4 * 4 * batchSize * sizeof(float));
auto threadFunction = [&](int threadId) { auto threadFunction = [&](int threadId) {
auto srcTotal = mSrcBuffer->host<float>() + threadId * mSrcBuffer->stride(0); auto srcTotal = mSrcBuffer->host<float>() + threadId * mSrcBuffer->stride(0);
View File
@ -440,10 +440,8 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
int LRoundupC4 = UP_DIV(LRoundup, unit); int LRoundupC4 = UP_DIV(LRoundup, unit);
auto outputChannel = output->channel(); auto outputChannel = output->channel();
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, output, mPadX, mPadY, core, nullptr); ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, output, mPadX, mPadY, core, nullptr);
const float *biasPtr = nullptr;
if (inputs.size() > 2) { if (inputs.size() > 2) {
bias = inputs[2]; bias = inputs[2];
biasPtr = bias->host<float>();
} }
auto kernelSize = mCommon->kernelX() * mCommon->kernelY(); auto kernelSize = mCommon->kernelX() * mCommon->kernelY();
@ -467,7 +465,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator(); auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1; auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1;
auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float *))); auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float *)));
if (nullptr == tempPtr.first) { if (tempPtr.invalid()) {
return OUT_OF_MEMORY; return OUT_OF_MEMORY;
} }
backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC); backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
@ -483,10 +481,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
MNN_PRINT("dense conv: n:%d, ih:%d, iw:%d, ic:%d, oh:%d, ow:%d, oc:%d, kh:%d, kw:%d, plane:%d, threadNumberFirst:%d, tileCount:%d, ePack:%d, pack::%d, bytes:%d\n", MNN_PRINT("dense conv: n:%d, ih:%d, iw:%d, ic:%d, oh:%d, ow:%d, oc:%d, kh:%d, kw:%d, plane:%d, threadNumberFirst:%d, tileCount:%d, ePack:%d, pack::%d, bytes:%d\n",
batch, src_height, src_width, ic, height, width, outputChannel, kernel_width, kernel_height, plane, threadNumberFirst, tileCount, eP, unit, bytes); batch, src_height, src_width, ic, height, width, outputChannel, kernel_width, kernel_height, plane, threadNumberFirst, tileCount, eP, unit, bytes);
#endif #endif
const float* biasPtr = bias ? bias->host<float>() : nullptr;
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * 0; auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * 0;
auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second + auto srcPtr = (float const **)(tempPtr.ptr() + 0 * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
0 * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
auto el = (int32_t *)(srcPtr + kernelSize * maxLine); auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
auto weightPtr = weight->host<uint8_t>(); auto weightPtr = weight->host<uint8_t>();
@ -614,10 +611,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
batch, src_height, src_width, ic, height, width, outputChannel, kernel_width, kernel_height, plane, tileCount, eP, unit, bytes); batch, src_height, src_width, ic, height, width, outputChannel, kernel_width, kernel_height, plane, tileCount, eP, unit, bytes);
} }
#endif #endif
const float* biasPtr = bias ? bias->host<float>() : nullptr;
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId; auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId;
auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second + auto srcPtr = (float const **)(tempPtr.ptr() + tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
auto el = (int32_t *)(srcPtr + kernelSize * maxLine); auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
auto weightPtr = weight->host<float>(); auto weightPtr = weight->host<float>();
int32_t info[4]; int32_t info[4];

View File

@ -91,7 +91,7 @@ ErrorCode GemmInt8Executor::onResize(const std::vector<Tensor *> &inputs, const
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator(); auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums); auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first); mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
if (nullptr == mBlitInfo.first) { if (mBlitInfo.invalid()) {
return OUT_OF_MEMORY; return OUT_OF_MEMORY;
} }
bufferAlloc->free(mBlitInfo); bufferAlloc->free(mBlitInfo);
@ -147,7 +147,7 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector<Tensor *> &inputs, const
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch; info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
info[2] = DST_XUNIT; info[2] = DST_XUNIT;
info[3] = mIm2ColParamter.strideX; info[3] = mIm2ColParamter.strideX;
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first); auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second); auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
for (int tIndex = tId; tIndex < mTileCnt; tIndex += mThreadNums) { for (int tIndex = tId; tIndex < mTileCnt; tIndex += mThreadNums) {
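Note the alloc/free pairing around mBlitInfo inside onResize: as we read MNN's planner, free() only returns the chunk to the dynamic-memory plan rather than invalidating it, so the executor may keep the MemChunk and resolve its address lazily at execute time. The manual pointer arithmetic against the raw pair collapses to ptr():

// before: (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first)
// after: the chunk resolves base + offset itself
auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);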

View File

@ -31,7 +31,7 @@ protected:
ConvolutionCommon::Im2ColParameter mIm2ColParamter; ConvolutionCommon::Im2ColParameter mIm2ColParamter;
CPUConvolution::MutableResourceInt8 mMutableResource; CPUConvolution::MutableResourceInt8 mMutableResource;
decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel; decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel;
std::pair<void*, int> mBlitInfo; MemChunk mBlitInfo;
std::pair<size_t, size_t> mBlitInfoStride; std::pair<size_t, size_t> mBlitInfoStride;
}; };
} // namespace MNN } // namespace MNN

View File

@ -130,7 +130,7 @@ ErrorCode IdstConvolutionInt8::onResize(const std::vector<Tensor*>& inputs, cons
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator(); auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, number); auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, number);
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first); mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
if (nullptr == mBlitInfo.first) { if (mBlitInfo.invalid()) {
return OUT_OF_MEMORY; return OUT_OF_MEMORY;
} }
bufferAlloc->free(mBlitInfo); bufferAlloc->free(mBlitInfo);
@ -199,7 +199,7 @@ ErrorCode IdstConvolutionInt8::onExecute(const std::vector<Tensor*>& inputs, con
auto outputOrigin = output->host<float>() + batchIndex * output->stride(0); auto outputOrigin = output->host<float>() + batchIndex * output->stride(0);
auto threadFunction = [&](int tId) { auto threadFunction = [&](int tId) {
auto colAddr = mTempBuffer.host<int8_t>() + tId * mTempBuffer.buffer().dim[0].stride; auto colAddr = mTempBuffer.host<int8_t>() + tId * mTempBuffer.buffer().dim[0].stride;
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first); auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second); auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
int32_t info[4]; int32_t info[4];

View File

@ -40,7 +40,7 @@ private:
std::vector<float> mPostParameters; std::vector<float> mPostParameters;
// mFakeBias used by GemmKernel // mFakeBias used by GemmKernel
std::shared_ptr<Tensor> mFakeBias; std::shared_ptr<Tensor> mFakeBias;
std::pair<void*, int> mBlitInfo; MemChunk mBlitInfo;
std::pair<size_t, size_t> mBlitInfoStride; std::pair<size_t, size_t> mBlitInfoStride;
}; };
} // namespace MNN } // namespace MNN

View File

@ -142,6 +142,55 @@ static void _MNNPackC4Int8ForMatMul_ASparse(int8_t* destOrigin, int8_t const** s
} }
} }
void MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params) {
#ifdef MNN_USE_SSE
uint8_t* srcPtr = (uint8_t*)src;
uint8_t* dstPtr = (uint8_t*)dst;
int offset = 128;
#else
const int8_t* srcPtr = src;
int8_t* dstPtr = dst;
int offset = 0;
#endif
int inpZero = static_cast<int>(params->inputZeroPoint[0]);
int outZero = static_cast<int>(params->outputZeroPoint[0]);
float inpScale = params->inputScale[0];
float outScale = params->outputScale[0];
float sum = 0.f;
int max_ = static_cast<int>(params->maxValue);
int min_ = static_cast<int>(params->minValue);
for (int j = 0; j < size; ++j) {
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
sum += fx;
}
float mean = sum / size;
float square_sum = 0.f;
for (int j = 0; j < size; ++j) {
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
square_sum += (fx - mean) * (fx - mean);
}
float variable = square_sum / size;
variable = 1.f / std::sqrt(variable + epsilon);
if (gamma && beta) {
for (int j = 0; j < size; ++j) {
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
float fy = (fx - mean) * variable * gamma[j] + beta[j];
int sy = roundf(fy * outScale) + outZero;
sy = ALIMAX(min_, ALIMIN(sy, max_));
dstPtr[j] = sy + offset;
}
} else {
for (int j = 0; j < size; ++j) {
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
float fy = (fx - mean) * variable;
int sy = roundf(fy * outScale) + outZero;
sy = ALIMAX(min_, ALIMIN(sy, max_));
dstPtr[j] = sy + offset;
}
}
}
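Written out, the routine is a per-row LayerNorm computed in dequantized space. With z_in, z_out the zero points, s_in = inputScale[0] and s_out = outputScale[0] the (multiplicative) scale factors from params, and n = size:

\[
f_j = (x_j - z_{in})\,s_{in}, \qquad \mu = \frac{1}{n}\sum_j f_j, \qquad \sigma^2 = \frac{1}{n}\sum_j (f_j - \mu)^2
\]
\[
y_j = \mathrm{clamp}\Big(\mathrm{round}\big(\big(\tfrac{f_j - \mu}{\sqrt{\sigma^2 + \epsilon}}\,\gamma_j + \beta_j\big)\, s_{out}\big) + z_{out},\ min,\ max\Big)
\]

with γ_j ≡ 1 and β_j ≡ 0 in the branch without affine parameters, and the offset of 128 in the SSE build accounting for the uint8 reinterpretation of the int8 data.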
#ifndef MNN_USE_NEON #ifndef MNN_USE_NEON
void MNNPackedSparseQuantMatMulEpx1(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap) { void MNNPackedSparseQuantMatMulEpx1(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap) {
@ -2056,6 +2105,9 @@ void MNNCoreInt8FunctionInit() {
// pooling // pooling
gCoreFunc->MNNAvgPoolInt8 = MNNAvgPoolInt8; gCoreFunc->MNNAvgPoolInt8 = MNNAvgPoolInt8;
gCoreFunc->MNNMaxPoolInt8 = MNNMaxPoolInt8; gCoreFunc->MNNMaxPoolInt8 = MNNMaxPoolInt8;
// Norm
gCoreFunc->MNNNormInt8 = MNNNormInt8;
#if defined(__aarch64__) #if defined(__aarch64__)
auto core = MNNGetCoreFunctions(); auto core = MNNGetCoreFunctions();
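With the kernel registered in the dispatch table (and overridden by the x86 init paths later in this diff), callers are expected to reach it through the table rather than the symbol. A hedged sketch of a call site, assuming the declarations from this diff are in scope; the wrapper name and the epsilon value are illustrative only:

// Hypothetical helper: picks up the _SSE_/_AVX_ overrides automatically
// once MNNInt8FunctionInit / _AVX_MNNInt8FunctionInit have installed them.
void normRowInt8(int8_t* dst, const int8_t* src, const float* gamma,
                 const float* beta, size_t size, QuanPrePostParameters* params) {
    auto core = MNN::MNNGetInt8CoreFunctions();
    core->MNNNormInt8(dst, src, gamma, beta, 1e-6f /* epsilon, assumed */, size, params);
}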

View File

@ -68,6 +68,7 @@ void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast); void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast);
void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast); void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast);
void MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, ssize_t minValue, ssize_t maxValue, int8_t* inputZeroPoint, int8_t* outputZeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack = 4); void MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, ssize_t minValue, ssize_t maxValue, int8_t* inputZeroPoint, int8_t* outputZeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack = 4);
void MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
@ -102,7 +103,9 @@ struct CoreInt8Functions {
void (*MNNMaxPoolInt8)(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx); void (*MNNMaxPoolInt8)(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx);
void (*MNNAvgPoolInt8)(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx, ssize_t paddingx, ssize_t factor); void (*MNNAvgPoolInt8)(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx, ssize_t paddingx, ssize_t factor);
// Norm
void (*MNNNormInt8)(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);
}; };
void MNNCoreInt8FunctionInit(); void MNNCoreInt8FunctionInit();
CoreInt8Functions* MNNGetInt8CoreFunctions(); CoreInt8Functions* MNNGetInt8CoreFunctions();

View File

@ -144,7 +144,7 @@ ErrorCode SparseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inpu
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator(); auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums); auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first); mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
if (nullptr == mBlitInfo.first) { if (mBlitInfo.invalid()) {
return OUT_OF_MEMORY; return OUT_OF_MEMORY;
} }
bufferAlloc->free(mBlitInfo); bufferAlloc->free(mBlitInfo);
@ -193,7 +193,7 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inp
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch; info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
info[2] = (int)mSparseQuantParam.eP; info[2] = (int)mSparseQuantParam.eP;
info[3] = mIm2ColParamter.strideX; info[3] = mIm2ColParamter.strideX;
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first); auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second); auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) { for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) {

View File

@ -309,7 +309,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator(); auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1; auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1;
auto tempPtr = bufferAlloc->alloc(ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParameters.ow, mIm2ColParameters.kernelX * mIm2ColParameters.kernelY, threadNumber).first); auto tempPtr = bufferAlloc->alloc(ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParameters.ow, mIm2ColParameters.kernelX * mIm2ColParameters.kernelY, threadNumber).first);
if (nullptr == tempPtr.first) { if (tempPtr.invalid()) {
return OUT_OF_MEMORY; return OUT_OF_MEMORY;
} }
backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC); backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
@ -320,8 +320,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
mFunction.second = [=](int tId) { mFunction.second = [=](int tId) {
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId; auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId;
auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second + auto srcPtr = (float const **)(tempPtr.ptr() + tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
auto el = (int32_t *)(srcPtr + kernelSize * maxLine); auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
int32_t info[4]; int32_t info[4];

View File

@ -14,6 +14,7 @@
#include "core/AutoStorage.h" #include "core/AutoStorage.h"
#include "core/Macro.h" #include "core/Macro.h"
#include "core/Concurrency.h" #include "core/Concurrency.h"
#include "core/TensorUtils.hpp"
//#define MNN_OPEN_TIME_TRACE //#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp> #include <MNN/AutoTime.hpp>
#include "math/Vec.hpp" #include "math/Vec.hpp"
@ -28,15 +29,15 @@ public:
mAllocator = allocator; mAllocator = allocator;
} }
~ AutoMemory() { ~ AutoMemory() {
if (nullptr != mContent.first) { if (!mContent.invalid()) {
mAllocator->free(mContent); mAllocator->free(mContent);
} }
} }
const std::pair<void*, int>& get() const { const MemChunk& get() const {
return mContent; return mContent;
} }
private: private:
std::pair<void*, int> mContent; MemChunk mContent;
BufferAllocator* mAllocator; BufferAllocator* mAllocator;
}; };
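The RAII helper now owns a MemChunk, so both the validity check and the release go through the chunk API. Its intended pattern, as the _generateMatMul hunks below use it:

AutoMemory YAddr(hSub * lSub * core->bytes, allocator); // allocated in the ctor
if (YAddr.get().invalid()) {
    return OUT_OF_MEMORY;                               // dtor frees only valid chunks
}
mStack.emplace_back(YAddr.get());                       // the stack stores MemChunk now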
@ -62,15 +63,15 @@ ErrorCode StrassenMatrixComputor::_generateTrivalMatMul(int e, int l, int h, con
auto bExtraStride = bStride - UP_DIV(l, lP)*lP*hP * core->bytes; auto bExtraStride = bStride - UP_DIV(l, lP)*lP*hP * core->bytes;
MNN_ASSERT(bExtraStride >= 0); MNN_ASSERT(bExtraStride >= 0);
auto tileBufferBasic = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(numberThread * UP_DIV(l, lP) * eP * lP * bytes); auto tileBufferBasic = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(numberThread * UP_DIV(l, lP) * eP * lP * bytes);
if (nullptr == tileBufferBasic.first) { if (tileBufferBasic.invalid()) {
return OUT_OF_MEMORY; return OUT_OF_MEMORY;
} }
auto tileHostOrigin = (uint8_t*)tileBufferBasic.first + tileBufferBasic.second;
int unitNumber = e / eP; int unitNumber = e / eP;
int xCount = e - unitNumber * eP; int xCount = e - unitNumber * eP;
auto eReal = aStride / core->bytes / core->pack; auto eReal = aStride / core->bytes / core->pack;
mFunctions.emplace_back( mFunctions.emplace_back(
std::make_pair([cStride, l, h, xCount, AT, BT, CT, COT, tileHostOrigin, unitNumber, bExtraStride, numberThread, eReal, eP, active, this](int tId) { std::make_pair([cStride, l, h, xCount, AT, BT, CT, COT, tileBufferBasic, unitNumber, bExtraStride, numberThread, eReal, eP, active, this](int tId) {
auto core = static_cast<CPUBackend*>(backend())->functions(); auto core = static_cast<CPUBackend*>(backend())->functions();
size_t parameters[6]; size_t parameters[6];
parameters[0] = xCount * core->bytes; parameters[0] = xCount * core->bytes;
@ -79,17 +80,17 @@ ErrorCode StrassenMatrixComputor::_generateTrivalMatMul(int e, int l, int h, con
parameters[3] = cStride; parameters[3] = cStride;
parameters[4] = 0; parameters[4] = 0;
parameters[5] = bExtraStride; parameters[5] = bExtraStride;
auto tileHost = tileHostOrigin + eP * parameters[1] * tId * core->bytes; auto tileHost = tileBufferBasic.ptr() + eP * parameters[1] * tId * core->bytes;
const float* postParametersPtr = nullptr; const float* postParametersPtr = nullptr;
if (!active.empty()) { if (!active.empty()) {
postParametersPtr = active.data(); postParametersPtr = active.data();
} }
auto aHost = mStack[AT.stackIndex] + AT.offsetBytes; auto aHost = mStack[AT.stackIndex].ptr() + AT.offsetBytes;
auto bHost = mStack[BT.stackIndex] + BT.offsetBytes; auto bHost = mStack[BT.stackIndex].ptr() + BT.offsetBytes;
auto cHost = mStack[CT.stackIndex] + CT.offsetBytes; auto cHost = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
const uint8_t* biasPtr = nullptr; const uint8_t* biasPtr = nullptr;
if (-1 != COT.stackIndex) { if (-1 != COT.stackIndex) {
biasPtr = mStack[COT.stackIndex] + COT.offsetBytes; biasPtr = mStack[COT.stackIndex].ptr() + COT.offsetBytes;
} }
auto packUnit = core->bytes * core->pack; auto packUnit = core->bytes * core->pack;
int32_t info[4]; int32_t info[4];
@ -166,7 +167,7 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
CTemp.stackIndex = (int)mStack.size(); CTemp.stackIndex = (int)mStack.size();
CTemp.offsetBytes = 0; CTemp.offsetBytes = 0;
CTemp.lineStrideBytes = e * core->bytes * core->pack; CTemp.lineStrideBytes = e * core->bytes * core->pack;
mStack.emplace_back((uint8_t*)CAddr.get().first + CAddr.get().second); mStack.emplace_back(CAddr.get());
MatrixInfo Empty; MatrixInfo Empty;
Empty.stackIndex = -1; Empty.stackIndex = -1;
@ -197,8 +198,8 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
} }
// Add CTemp to C // Add CTemp to C
auto f1 = [CT, CTemp, e, cHeight, numberThread, core, this](int tId) { auto f1 = [CT, CTemp, e, cHeight, numberThread, core, this](int tId) {
auto c11Ptr = mStack[CT.stackIndex] + CT.offsetBytes; auto c11Ptr = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
auto xAddr = mStack[CTemp.stackIndex] + CTemp.offsetBytes; auto xAddr = mStack[CTemp.stackIndex].ptr() + CTemp.offsetBytes;
MNNMATRIX_ADD_MULTITHREAD(c11Ptr, c11Ptr, xAddr, e, CT.lineStrideBytes, CT.lineStrideBytes, CTemp.lineStrideBytes, cHeight, core); MNNMATRIX_ADD_MULTITHREAD(c11Ptr, c11Ptr, xAddr, e, CT.lineStrideBytes, CT.lineStrideBytes, CTemp.lineStrideBytes, cHeight, core);
}; };
mFunctions.emplace_back(std::make_pair(f1, numberThread)); mFunctions.emplace_back(std::make_pair(f1, numberThread));
@ -206,10 +207,10 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
if (!postParameters.empty() && COT.stackIndex >= 0) { if (!postParameters.empty() && COT.stackIndex >= 0) {
if (1 == numberThread) { if (1 == numberThread) {
auto postFunction = [CT, COT, e, cHeight, numberThread, postParameters, core, this](int tId) { auto postFunction = [CT, COT, e, cHeight, numberThread, postParameters, core, this](int tId) {
auto biasPtr = (const float*)(mStack[COT.stackIndex] + COT.offsetBytes); auto biasPtr = (const float*)(mStack[COT.stackIndex].ptr() + COT.offsetBytes);
auto width = e; auto width = e;
auto height = cHeight; auto height = cHeight;
auto c11Ptr = mStack[CT.stackIndex] + CT.offsetBytes; auto c11Ptr = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
core->MNNAxByClampBroadcastUnit((float*)c11Ptr, (float*)c11Ptr, biasPtr, width, CT.lineStrideBytes / core->bytes, CT.lineStrideBytes / core->bytes, height, postParameters.data()); core->MNNAxByClampBroadcastUnit((float*)c11Ptr, (float*)c11Ptr, biasPtr, width, CT.lineStrideBytes / core->bytes, CT.lineStrideBytes / core->bytes, height, postParameters.data());
}; };
mFunctions.emplace_back(std::make_pair(postFunction, 1)); mFunctions.emplace_back(std::make_pair(postFunction, 1));
@ -217,8 +218,8 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
auto postFunction = [CT, COT, e, cHeight, numberThread, postParameters, core, this](int tId) { auto postFunction = [CT, COT, e, cHeight, numberThread, postParameters, core, this](int tId) {
auto width = e; auto width = e;
auto height = cHeight; auto height = cHeight;
auto c11Ptr = mStack[CT.stackIndex] + CT.offsetBytes; auto c11Ptr = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
auto biasPtr = mStack[COT.stackIndex] + COT.offsetBytes; auto biasPtr = mStack[COT.stackIndex].ptr() + COT.offsetBytes;
for (int y = tId; y < height; y+=numberThread) { for (int y = tId; y < height; y+=numberThread) {
core->MNNAxByClampBroadcastUnit((float*)(c11Ptr + y * CT.lineStrideBytes), (float*)(c11Ptr + y * CT.lineStrideBytes), (const float*)(biasPtr + y * core->bytes * core->pack), width, 0, 0, 1, postParameters.data()); core->MNNAxByClampBroadcastUnit((float*)(c11Ptr + y * CT.lineStrideBytes), (float*)(c11Ptr + y * CT.lineStrideBytes), (const float*)(biasPtr + y * core->bytes * core->pack), width, 0, 0, 1, postParameters.data());
} }
@ -278,19 +279,19 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
auto maxlH = std::max(lSub, hSub); auto maxlH = std::max(lSub, hSub);
AutoMemory YAddr(hSub * lSub * core->bytes, allocator); AutoMemory YAddr(hSub * lSub * core->bytes, allocator);
AutoMemory XAddr(maxlH * eSub * core->bytes, allocator); AutoMemory XAddr(maxlH * eSub * core->bytes, allocator);
if (nullptr == XAddr.get().first || nullptr == YAddr.get().first) { if (XAddr.get().invalid() || YAddr.get().invalid()) {
return OUT_OF_MEMORY; return OUT_OF_MEMORY;
} }
MatrixInfo Y; MatrixInfo Y;
Y.stackIndex = (int)mStack.size(); Y.stackIndex = (int)mStack.size();
mStack.emplace_back((uint8_t*)YAddr.get().first + YAddr.get().second); mStack.emplace_back(YAddr.get());
Y.offsetBytes = 0; Y.offsetBytes = 0;
Y.lineStrideBytes = lSub * core->bytes * hP; Y.lineStrideBytes = lSub * core->bytes * hP;
MatrixInfo X; MatrixInfo X;
X.stackIndex = (int)mStack.size(); X.stackIndex = (int)mStack.size();
X.offsetBytes = 0; X.offsetBytes = 0;
X.lineStrideBytes = eSub * core->bytes * core->pack; X.lineStrideBytes = eSub * core->bytes * core->pack;
mStack.emplace_back((uint8_t*)XAddr.get().first + XAddr.get().second); mStack.emplace_back(XAddr.get());
MatrixInfo CX; MatrixInfo CX;
CX.stackIndex = X.stackIndex; CX.stackIndex = X.stackIndex;
@ -327,12 +328,12 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
{ {
// S3=A11-A21, T3=B22-B12, P7=S3*T3 // S3=A11-A21, T3=B22-B12, P7=S3*T3
auto f = [a11, a21, b22, b12, X, Y, eSub, lSub, hSub, numberThread, core, hP, this, bWidth, aHeight, bHeight](int tId) { auto f = [a11, a21, b22, b12, X, Y, eSub, lSub, hSub, numberThread, core, hP, this, bWidth, aHeight, bHeight](int tId) {
auto xAddr = mStack[X.stackIndex] + X.offsetBytes; auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
auto yAddr = mStack[Y.stackIndex] + Y.offsetBytes; auto yAddr = mStack[Y.stackIndex].ptr() + Y.offsetBytes;
auto a11Ptr = mStack[a11.stackIndex] + a11.offsetBytes; auto a11Ptr = mStack[a11.stackIndex].ptr() + a11.offsetBytes;
auto a21Ptr = mStack[a21.stackIndex] + a21.offsetBytes; auto a21Ptr = mStack[a21.stackIndex].ptr() + a21.offsetBytes;
MNNMATRIX_SUB_MULTITHREAD(xAddr, a11Ptr, a21Ptr, eSub, X.lineStrideBytes, a11.lineStrideBytes, a21.lineStrideBytes, aHeight, core); MNNMATRIX_SUB_MULTITHREAD(xAddr, a11Ptr, a21Ptr, eSub, X.lineStrideBytes, a11.lineStrideBytes, a21.lineStrideBytes, aHeight, core);
MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex] + b22.offsetBytes, mStack[b12.stackIndex] + b12.offsetBytes, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, b12.lineStrideBytes, bHeight, core); MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex].ptr() + b22.offsetBytes, mStack[b12.stackIndex].ptr() + b12.offsetBytes, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, b12.lineStrideBytes, bHeight, core);
}; };
mFunctions.emplace_back(std::make_pair(f, numberThread)); mFunctions.emplace_back(std::make_pair(f, numberThread));
auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c21, Empty, currentDepth, {}); auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c21, Empty, currentDepth, {});
@ -343,8 +344,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
{ {
// S1=A21+A22, T1=B12-B11, P5=S1T1 // S1=A21+A22, T1=B12-B11, P5=S1T1
auto f = [a22, a21, b11, b12, X, Y, eSub, lSub, hSub, numberThread, hP, core, this, bWidth, aHeight, bHeight](int tId) { auto f = [a22, a21, b11, b12, X, Y, eSub, lSub, hSub, numberThread, hP, core, this, bWidth, aHeight, bHeight](int tId) {
MNNMATRIX_ADD_MULTITHREAD(mStack[X.stackIndex] + X.offsetBytes, mStack[a21.stackIndex] + a21.offsetBytes, mStack[a22.stackIndex] + a22.offsetBytes , eSub, X.lineStrideBytes, a21.lineStrideBytes, a22.lineStrideBytes, aHeight, core); MNNMATRIX_ADD_MULTITHREAD(mStack[X.stackIndex].ptr() + X.offsetBytes, mStack[a21.stackIndex].ptr() + a21.offsetBytes, mStack[a22.stackIndex].ptr() + a22.offsetBytes , eSub, X.lineStrideBytes, a21.lineStrideBytes, a22.lineStrideBytes, aHeight, core);
MNNMATRIX_SUB_MULTITHREAD(mStack[Y.stackIndex] + Y.offsetBytes, mStack[b12.stackIndex] + b12.offsetBytes, mStack[b11.stackIndex] + b11.offsetBytes, bWidth, Y.lineStrideBytes, b12.lineStrideBytes, b11.lineStrideBytes, bHeight, core); MNNMATRIX_SUB_MULTITHREAD(mStack[Y.stackIndex].ptr() + Y.offsetBytes, mStack[b12.stackIndex].ptr() + b12.offsetBytes, mStack[b11.stackIndex].ptr() + b11.offsetBytes, bWidth, Y.lineStrideBytes, b12.lineStrideBytes, b11.lineStrideBytes, bHeight, core);
}; };
mFunctions.emplace_back(std::make_pair(f, numberThread)); mFunctions.emplace_back(std::make_pair(f, numberThread));
auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c22, Empty, currentDepth, {}); auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c22, Empty, currentDepth, {});
@ -355,10 +356,10 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
{ {
// S2=S1-A11, T2=B22-T1, P6=S2T2 // S2=S1-A11, T2=B22-T1, P6=S2T2
auto f = [a11, b22, X, Y, eSub, lSub, hSub, numberThread, hP, core, this, bWidth, aHeight, bHeight](int tId) { auto f = [a11, b22, X, Y, eSub, lSub, hSub, numberThread, hP, core, this, bWidth, aHeight, bHeight](int tId) {
auto xAddr = mStack[X.stackIndex] + X.offsetBytes; auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
auto yAddr = mStack[Y.stackIndex] + Y.offsetBytes; auto yAddr = mStack[Y.stackIndex].ptr() + Y.offsetBytes;
MNNMATRIX_SUB_MULTITHREAD(xAddr, xAddr, mStack[a11.stackIndex] + a11.offsetBytes, eSub, X.lineStrideBytes, X.lineStrideBytes, a11.lineStrideBytes, aHeight, core); MNNMATRIX_SUB_MULTITHREAD(xAddr, xAddr, mStack[a11.stackIndex].ptr() + a11.offsetBytes, eSub, X.lineStrideBytes, X.lineStrideBytes, a11.lineStrideBytes, aHeight, core);
MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex] + b22.offsetBytes, yAddr, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, Y.lineStrideBytes, bHeight, core); MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex].ptr() + b22.offsetBytes, yAddr, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, Y.lineStrideBytes, bHeight, core);
}; };
mFunctions.emplace_back(std::make_pair(f, numberThread)); mFunctions.emplace_back(std::make_pair(f, numberThread));
auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c12, Empty, currentDepth, {}); auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c12, Empty, currentDepth, {});
@ -369,8 +370,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
{ {
// S4=A12-S2, P3=S4*B22, P1=A11*B11 // S4=A12-S2, P3=S4*B22, P1=A11*B11
auto f = [a12, X, eSub, aHeight, numberThread, core, this](int tId) { auto f = [a12, X, eSub, aHeight, numberThread, core, this](int tId) {
auto xAddr = mStack[X.stackIndex] + X.offsetBytes; auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
MNNMATRIX_SUB_MULTITHREAD(xAddr, mStack[a12.stackIndex] + a12.offsetBytes, xAddr, eSub, X.lineStrideBytes, a12.lineStrideBytes, X.lineStrideBytes, aHeight, core); MNNMATRIX_SUB_MULTITHREAD(xAddr, mStack[a12.stackIndex].ptr() + a12.offsetBytes, xAddr, eSub, X.lineStrideBytes, a12.lineStrideBytes, X.lineStrideBytes, aHeight, core);
}; };
mFunctions.emplace_back(std::make_pair(f, numberThread)); mFunctions.emplace_back(std::make_pair(f, numberThread));
auto code = _generateMatMul(eSub, lSub, hSub, X, b22, c11, Empty, currentDepth, {}); auto code = _generateMatMul(eSub, lSub, hSub, X, b22, c11, Empty, currentDepth, {});
@ -387,10 +388,10 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
// U5=U4+P3, T4=T2-B21, P4=A22*T4 // U5=U4+P3, T4=T2-B21, P4=A22*T4
auto f = [c11, c12, c21, c22, b21, X, Y, eSub, bWidth, cHeight, bHeight, numberThread, core, this](int tId) { auto f = [c11, c12, c21, c22, b21, X, Y, eSub, bWidth, cHeight, bHeight, numberThread, core, this](int tId) {
for (int y = tId; y < cHeight; y+=numberThread) { for (int y = tId; y < cHeight; y+=numberThread) {
core->MNNStrassenMergeCFunction((float*)(mStack[c11.stackIndex] + c11.offsetBytes + y * c11.lineStrideBytes), (float*)(mStack[c12.stackIndex] + c12.offsetBytes + y * c12.lineStrideBytes), (float*)(mStack[c21.stackIndex] + c21.offsetBytes + y * c21.lineStrideBytes), (float*)(mStack[c22.stackIndex] + c22.offsetBytes + y * c22.lineStrideBytes), (float*)(mStack[X.stackIndex] + X.offsetBytes + y * X.lineStrideBytes), 0, eSub, 1); core->MNNStrassenMergeCFunction((float*)(mStack[c11.stackIndex].ptr() + c11.offsetBytes + y * c11.lineStrideBytes), (float*)(mStack[c12.stackIndex].ptr() + c12.offsetBytes + y * c12.lineStrideBytes), (float*)(mStack[c21.stackIndex].ptr() + c21.offsetBytes + y * c21.lineStrideBytes), (float*)(mStack[c22.stackIndex].ptr() + c22.offsetBytes + y * c22.lineStrideBytes), (float*)(mStack[X.stackIndex].ptr() + X.offsetBytes + y * X.lineStrideBytes), 0, eSub, 1);
} }
auto yAddr = mStack[Y.stackIndex] + Y.offsetBytes; auto yAddr = mStack[Y.stackIndex].ptr() + Y.offsetBytes;
MNNMATRIX_SUB_MULTITHREAD(yAddr, yAddr, mStack[b21.stackIndex] + b21.offsetBytes, bWidth, Y.lineStrideBytes, Y.lineStrideBytes, b21.lineStrideBytes, bHeight, core); MNNMATRIX_SUB_MULTITHREAD(yAddr, yAddr, mStack[b21.stackIndex].ptr() + b21.offsetBytes, bWidth, Y.lineStrideBytes, Y.lineStrideBytes, b21.lineStrideBytes, bHeight, core);
}; };
mFunctions.emplace_back(std::make_pair(f, numberThread)); mFunctions.emplace_back(std::make_pair(f, numberThread));
auto code = _generateMatMul(eSub, lSub, hSub, a22, Y, c11, Empty, currentDepth, {}); auto code = _generateMatMul(eSub, lSub, hSub, a22, Y, c11, Empty, currentDepth, {});
@ -402,8 +403,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
// U6=U3-P4, P2=A12*B21, U1=P1+P2 // U6=U3-P4, P2=A12*B21, U1=P1+P2
auto f0 = [c11, c21, eSub, cHeight, numberThread, core, this](int tId) { auto f0 = [c11, c21, eSub, cHeight, numberThread, core, this](int tId) {
auto cw = eSub; auto cw = eSub;
auto c21Addr = mStack[c21.stackIndex] + c21.offsetBytes; auto c21Addr = mStack[c21.stackIndex].ptr() + c21.offsetBytes;
MNNMATRIX_SUB_MULTITHREAD(c21Addr, c21Addr, mStack[c11.stackIndex] + c11.offsetBytes, cw, c21.lineStrideBytes, c21.lineStrideBytes, c11.lineStrideBytes, cHeight, core); MNNMATRIX_SUB_MULTITHREAD(c21Addr, c21Addr, mStack[c11.stackIndex].ptr() + c11.offsetBytes, cw, c21.lineStrideBytes, c21.lineStrideBytes, c11.lineStrideBytes, cHeight, core);
}; };
mFunctions.emplace_back(std::make_pair(f0, numberThread)); mFunctions.emplace_back(std::make_pair(f0, numberThread));
auto code = _generateMatMul(eSub, lSub, hSub, a12, b21, c11, Empty, currentDepth, {}); auto code = _generateMatMul(eSub, lSub, hSub, a12, b21, c11, Empty, currentDepth, {});
@ -412,18 +413,18 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
} }
auto f1 = [c11, X, eSub, cHeight, numberThread, core, this](int tId) { auto f1 = [c11, X, eSub, cHeight, numberThread, core, this](int tId) {
auto cw = eSub; auto cw = eSub;
auto c11Ptr = mStack[c11.stackIndex] + c11.offsetBytes; auto c11Ptr = mStack[c11.stackIndex].ptr() + c11.offsetBytes;
auto xAddr = mStack[X.stackIndex] + X.offsetBytes; auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
MNNMATRIX_ADD_MULTITHREAD(c11Ptr, c11Ptr, xAddr, cw, c11.lineStrideBytes, c11.lineStrideBytes, X.lineStrideBytes, cHeight, core); MNNMATRIX_ADD_MULTITHREAD(c11Ptr, c11Ptr, xAddr, cw, c11.lineStrideBytes, c11.lineStrideBytes, X.lineStrideBytes, cHeight, core);
}; };
mFunctions.emplace_back(std::make_pair(f1, numberThread)); mFunctions.emplace_back(std::make_pair(f1, numberThread));
if (!postParameters.empty() && COT.stackIndex >= 0) { if (!postParameters.empty() && COT.stackIndex >= 0) {
if (1 == numberThread) { if (1 == numberThread) {
auto postFunction = [c11, COT, eSub, cHeight, numberThread, postParameters, core, this](int tId) { auto postFunction = [c11, COT, eSub, cHeight, numberThread, postParameters, core, this](int tId) {
auto biasPtr = (const float*)(mStack[COT.stackIndex] + COT.offsetBytes); auto biasPtr = (const float*)(mStack[COT.stackIndex].ptr() + COT.offsetBytes);
auto width = eSub * 2; auto width = eSub * 2;
auto height = cHeight * 2; auto height = cHeight * 2;
auto c11Ptr = mStack[c11.stackIndex] + c11.offsetBytes; auto c11Ptr = mStack[c11.stackIndex].ptr() + c11.offsetBytes;
core->MNNAxByClampBroadcastUnit((float*)c11Ptr, (float*)c11Ptr, biasPtr, width, c11.lineStrideBytes / core->bytes, c11.lineStrideBytes / core->bytes, height, postParameters.data()); core->MNNAxByClampBroadcastUnit((float*)c11Ptr, (float*)c11Ptr, biasPtr, width, c11.lineStrideBytes / core->bytes, c11.lineStrideBytes / core->bytes, height, postParameters.data());
}; };
mFunctions.emplace_back(std::make_pair(postFunction, numberThread)); mFunctions.emplace_back(std::make_pair(postFunction, numberThread));
@ -431,8 +432,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
auto postFunction = [c11, COT, eSub, cHeight, numberThread, postParameters, core, this](int tId) { auto postFunction = [c11, COT, eSub, cHeight, numberThread, postParameters, core, this](int tId) {
auto width = eSub * 2; auto width = eSub * 2;
auto height = cHeight * 2; auto height = cHeight * 2;
auto c11Ptr = mStack[c11.stackIndex] + c11.offsetBytes; auto c11Ptr = mStack[c11.stackIndex].ptr() + c11.offsetBytes;
auto biasPtr = mStack[COT.stackIndex] + COT.offsetBytes; auto biasPtr = mStack[COT.stackIndex].ptr() + COT.offsetBytes;
for (int y = tId; y < height; y+=numberThread) { for (int y = tId; y < height; y+=numberThread) {
core->MNNAxByClampBroadcastUnit((float*)(c11Ptr + y * c11.lineStrideBytes), (float*)(c11Ptr + y * c11.lineStrideBytes), (const float*)(biasPtr + y * core->bytes * core->pack), width, 0, 0, 1, postParameters.data()); core->MNNAxByClampBroadcastUnit((float*)(c11Ptr + y * c11.lineStrideBytes), (float*)(c11Ptr + y * c11.lineStrideBytes), (const float*)(biasPtr + y * core->bytes * core->pack), width, 0, 0, 1, postParameters.data());
} }
@ -496,25 +497,25 @@ ErrorCode StrassenMatrixComputor::onEncode(const std::vector<Tensor*>& inputs, c
core->MNNGetMatMulPackMode(&eP, &lP, &hP); core->MNNGetMatMulPackMode(&eP, &lP, &hP);
int bs = UP_DIV(l, lP) * lP * hP; int bs = UP_DIV(l, lP) * lP * hP;
int cs = C->stride(0); int cs = C->stride(0);
uint8_t* bias = nullptr; MemChunk bias;
bool useBias = false; bool useBias = false;
if (inputs.size() > 2) { if (inputs.size() > 2) {
bias = inputs[2]->host<uint8_t>(); bias = TensorUtils::getDescribe(inputs[2])->mem->chunk();
useBias = true; useBias = true;
} }
return onEncode(e, l, h, as, bs, cs, A->host<uint8_t>(), B->host<uint8_t>(), C->host<uint8_t>(), useBias, bias, postParameters); return onEncode(e, l, h, as, bs, cs, TensorUtils::getDescribe(A)->mem->chunk(), TensorUtils::getDescribe(B)->mem->chunk(), TensorUtils::getDescribe(C)->mem->chunk(), useBias, bias, postParameters);
} }
ErrorCode StrassenMatrixComputor::onEncode(int e, int l, int h, int as, int bs, int cs, const uint8_t* AT, const uint8_t* BT, uint8_t* CT, bool useBias, const uint8_t* Bias, const std::vector<float>& postParameters) { ErrorCode StrassenMatrixComputor::onEncode(int e, int l, int h, int as, int bs, int cs, const MemChunk AT, const MemChunk BT, MemChunk CT, bool useBias, const MemChunk Bias, const std::vector<float>& postParameters) {
auto core = static_cast<CPUBackend*>(backend())->functions(); auto core = static_cast<CPUBackend*>(backend())->functions();
MatrixInfo a,b,c,bias; MatrixInfo a,b,c,bias;
bias.stackIndex = -1; bias.stackIndex = -1;
mFunctions.clear(); mFunctions.clear();
mStack = {(uint8_t*)AT, (uint8_t*)BT, CT}; mStack = {AT, BT, CT};
if (useBias) { if (useBias) {
bias.stackIndex = 3; bias.stackIndex = 3;
bias.offsetBytes = 0; bias.offsetBytes = 0;
mStack.emplace_back((uint8_t*)Bias); mStack.emplace_back(Bias);
} }
a.stackIndex = 0; a.stackIndex = 0;
a.lineStrideBytes = as * core->bytes; a.lineStrideBytes = as * core->bytes;
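The net effect on the encode path: mStack holds MemChunk handles captured at encode time, and every scheduled lambda resolves a concrete address with ptr() only when it runs. A plausible motivation, given the BufferAllocator rework in this sync, is that chunk base addresses need not be final until execution:

// encode time: record handles, not raw addresses
mStack = {AT, BT, CT};
// execute time, inside a scheduled lambda (AT here is a MatrixInfo):
auto aHost = mStack[AT.stackIndex].ptr() + AT.offsetBytes;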

View File

@ -10,6 +10,7 @@
#define StrassenMatmulComputor_hpp #define StrassenMatmulComputor_hpp
#include <functional> #include <functional>
#include "core/BufferAllocator.hpp"
#include "core/Backend.hpp" #include "core/Backend.hpp"
namespace MNN { namespace MNN {
/** /**
@ -53,8 +54,9 @@ public:
*/ */
ErrorCode onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const std::vector<float>& postParameters = {}, int l = 0, int h = 0); ErrorCode onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const std::vector<float>& postParameters = {}, int l = 0, int h = 0);
ErrorCode onEncode(int e, int l, int h, int as, int bs, int cs, const uint8_t* AT, const uint8_t* BT, uint8_t* CT, bool useBias, const uint8_t* Bias = nullptr, const std::vector<float>& postParameters = {}); ErrorCode onEncode(int e, int l, int h, int as, int bs, int cs, const MemChunk AT, const MemChunk BT, MemChunk CT, bool useBias, const MemChunk Bias = MemChunk(), const std::vector<float>& postParameters = {});
// ErrorCode onEncode(int e, int l, int h, int as, int bs, int cs, const uint8_t* AT, const uint8_t* BT, uint8_t* CT, bool useBias, const uint8_t* Bias = nullptr, const std::vector<float>& postParameters = {});
void onExecute(const uint8_t* AT = nullptr, const uint8_t* BT = nullptr, const uint8_t* COT = nullptr, uint8_t* CT = nullptr); void onExecute(const uint8_t* AT = nullptr, const uint8_t* BT = nullptr, const uint8_t* COT = nullptr, uint8_t* CT = nullptr);
void onReset(); void onReset();
@ -79,7 +81,7 @@ private:
Backend* mBackend; Backend* mBackend;
std::vector<uint8_t*> mStack; std::vector<MemChunk> mStack;
}; };
} // namespace MNN } // namespace MNN

View File

@ -124,6 +124,7 @@ void MNNInt8FunctionInit() {
auto core = MNN::MNNGetInt8CoreFunctions(); auto core = MNN::MNNGetInt8CoreFunctions();
core->MNNAvgPoolInt8 = MNNAvgPoolUint8; core->MNNAvgPoolInt8 = MNNAvgPoolUint8;
core->MNNMaxPoolInt8 = MNNMaxPoolInt8_; core->MNNMaxPoolInt8 = MNNMaxPoolInt8_;
core->MNNNormInt8 = _SSE_MNNNormInt8;
if (cpuFlags & libyuv::kCpuHasSSE41) { if (cpuFlags & libyuv::kCpuHasSSE41) {
core->MNNFloat2Int8 = _SSE_MNNFloat2Int8; core->MNNFloat2Int8 = _SSE_MNNFloat2Int8;
core->MNNInt8ScaleToFloat = _SSE_MNNInt8ScaleToFloat; core->MNNInt8ScaleToFloat = _SSE_MNNInt8ScaleToFloat;

View File

@ -75,6 +75,7 @@ void _AVX_WinogradInit(void* functions);
void _AVX_MNNGelu(float *dst, const float *src, size_t size, float* parameters); void _AVX_MNNGelu(float *dst, const float *src, size_t size, float* parameters);
void _AVX_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size); void _AVX_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size);
void _AVX_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);
void _AVX_MNNGetSparseMatMulPackMode(int* eP, int *lP, int* hP); void _AVX_MNNGetSparseMatMulPackMode(int* eP, int *lP, int* hP);
void _AVX_MNNPackedSparseMatMulEpx1EFMA(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, unsigned int* NNZMap, int* dataOffsetMap); void _AVX_MNNPackedSparseMatMulEpx1EFMA(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, unsigned int* NNZMap, int* dataOffsetMap);

View File

@ -754,4 +754,7 @@ void _AVX_MNNInt8FunctionInit(void* functions) {
// conv depthwise // conv depthwise
gAVX2CoreInt8Functions->ConvDepthwiseLineInt8 = _AVX_MNNLineDepthWiseInt8AddBiasScaleUnit; gAVX2CoreInt8Functions->ConvDepthwiseLineInt8 = _AVX_MNNLineDepthWiseInt8AddBiasScaleUnit;
// Norm
gAVX2CoreInt8Functions->MNNNormInt8 = _AVX_MNNNormInt8;
} }

View File

@ -202,7 +202,7 @@ void _AVX_MNNSoftmax(float* dest, const float* source, size_t size) {
void _AVX_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size) { void _AVX_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size) {
float tmpfloat8[8]; float tmpfloat8[8];
int count = size / 8; int count = static_cast<int32_t>(size / 8);
int remain = count * 8; int remain = count * 8;
// step 1: get sum // step 1: get sum
float sum = 0.f; float sum = 0.f;
@ -263,4 +263,79 @@ void _AVX_MNNNorm(float *dst, const float *src, const float *gamma, const float
dst[i] = (src[i] - mean) * variable; dst[i] = (src[i] - mean) * variable;
} }
} }
}
void _AVX_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params) {
float tmpfloat8[8];
int count = static_cast<int32_t>(size / 8);
int remain = count * 8;
std::vector<float> inpf(size);
std::vector<float> outf(size);
std::vector<float> inpScale(4, params->inputScale[0]);
std::vector<float> outScale(4, params->outputScale[0]);
float* srcf = inpf.data();
float* dstf = outf.data();
// step 0: Int8 -> Float
_AVX_MNNInt8ScaleToFloat(inpf.data(), src, inpScale.data(), size / 4, params->inputZeroPoint[0]);
// step 1: get sum
float sum = 0.f;
if (count > 0) {
auto sumVal = _mm256_set1_ps(0.f);
for (int i = 0; i < count; i++) {
sumVal = _mm256_add_ps(sumVal, _mm256_loadu_ps(srcf + i * 8));
}
_mm256_storeu_ps(tmpfloat8, sumVal);
for (int i = 0; i < 8; i++) {
sum += tmpfloat8[i];
}
}
for (int i = remain; i < size; i++) {
sum += srcf[i];
}
// step 2: get square_sum
float mean = sum / size;
float square_sum = 0.f;
auto meanVal = _mm256_set1_ps(mean);
if (count > 0) {
auto sumVal = _mm256_set1_ps(0.f);
for (int i = 0; i < count; i++) {
auto x = _mm256_sub_ps(_mm256_loadu_ps(srcf + i * 8), meanVal);
sumVal = _mm256_add_ps(sumVal, _mm256_mul_ps(x, x));
}
_mm256_storeu_ps(tmpfloat8, sumVal);
for (int i = 0; i < 8; i++) {
square_sum += tmpfloat8[i];
}
}
for (int i = remain; i < size; i++) {
float x = (srcf[i] - mean);
square_sum += x * x;
}
// step 3: get result
float variable = square_sum / size;
variable = 1.f / sqrt(variable + epsilon);
auto variableVal = _mm256_set1_ps(variable);
if (gamma && beta) {
for (int i = 0; i < count; i++) {
auto x = _mm256_sub_ps(_mm256_loadu_ps(srcf + i * 8), meanVal);
auto g = _mm256_loadu_ps(gamma + i * 8);
auto b = _mm256_loadu_ps(beta + i * 8);
auto y = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(x, g), variableVal), b);
_mm256_storeu_ps(dstf + i * 8, y);
}
for (int i = remain; i < size; i++) {
dstf[i] = (srcf[i] - mean) * gamma[i] * variable + beta[i];
}
} else {
for (int i = 0; i < count; i++) {
auto x = _mm256_sub_ps(_mm256_loadu_ps(srcf + i * 8), meanVal);
auto y = _mm256_mul_ps(x, variableVal);
_mm256_storeu_ps(dstf + i * 8, y);
}
for (int i = remain; i < size; i++) {
dstf[i] = (srcf[i] - mean) * variable;
}
}
// step 4: Float -> Int8
_AVX_MNNFloat2Int8(dstf, dst, size / 4, outScale.data(), params->minValue, params->maxValue, params->outputZeroPoint[0]);
} }

View File

@ -79,6 +79,7 @@ void _SSE_MNNSoftmax(float* dest, const float* source, size_t size);
void _SSE_ExtraInit(void* functions); void _SSE_ExtraInit(void* functions);
void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size); void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size);
void _SSE_ImageProcessInit(void* functions, int cpuFlags); void _SSE_ImageProcessInit(void* functions, int cpuFlags);
void _SSE_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);
/* Image process functions */ /* Image process functions */
void _SSE_MNNRGBAToBGRA(const unsigned char* source, unsigned char* dest, size_t count); void _SSE_MNNRGBAToBGRA(const unsigned char* source, unsigned char* dest, size_t count);

View File

@ -58,7 +58,7 @@ void _SSE_MNNExpC8(float* dest, const float* source, const float* offset, const
void _SSE_MNNSoftmax(float* dest, const float* source, size_t size) { void _SSE_MNNSoftmax(float* dest, const float* source, size_t size) {
float tmpfloat4[4]; float tmpfloat4[4];
int count = size / 4; int count = static_cast<int32_t>(size / 4);
int remain = count * 4; int remain = count * 4;
// step 1: get maxValue // step 1: get maxValue
float maxValue = source[0]; float maxValue = source[0];
@ -212,7 +212,7 @@ void _SSE_MNNHardSwish(float* dst, const float* src, size_t size) {
void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size) { void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size) {
float tmpfloat4[4]; float tmpfloat4[4];
int count = size / 4; int count = static_cast<int32_t>(size / 4);
int remain = count * 4; int remain = count * 4;
// step 1: get sum // step 1: get sum
float sum = 0.f; float sum = 0.f;
@ -270,3 +270,74 @@ void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float
} }
} }
} }
void _SSE_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params) {
float tmpfloat4[4];
int count = static_cast<int32_t>(size / 4);
int remain = count * 4;
float sum = 0.f;
std::vector<float> inpf(size);
std::vector<float> outf(size);
std::vector<float> inpScale(4, params->inputScale[0]);
std::vector<float> outScale(4, params->outputScale[0]);
float* srcf = inpf.data();
float* dstf = outf.data();
// step 0: Int8 -> Float
_SSE_MNNInt8ScaleToFloat(inpf.data(), src, inpScale.data(), size / 4, params->inputZeroPoint[0]);
// step 1: get sum
if (count > 0) {
auto sumVal = _mm_set1_ps(0.f);
for (int i = 0; i < count; i++) {
sumVal = _mm_add_ps(sumVal, _mm_loadu_ps(srcf + i * 4));
}
_mm_storeu_ps(tmpfloat4, sumVal);
sum += (tmpfloat4[0] + tmpfloat4[1] + tmpfloat4[2] + tmpfloat4[3]);
}
for (int i = remain; i < size; i++) {
sum += srcf[i];
}
// step 2: get square_sum
float mean = sum / size;
float square_sum = 0.f;
auto meanVal = _mm_set1_ps(mean);
if (count > 0) {
auto sumVal = _mm_set1_ps(0.f);
for (int i = 0; i < count; i++) {
auto x = _mm_sub_ps(_mm_loadu_ps(srcf + i * 4), meanVal);
sumVal = _mm_add_ps(sumVal, _mm_mul_ps(x, x));
}
_mm_storeu_ps(tmpfloat4, sumVal);
square_sum += (tmpfloat4[0] + tmpfloat4[1] + tmpfloat4[2] + tmpfloat4[3]);
}
for (int i = remain; i < size; i++) {
float x = (srcf[i] - mean);
square_sum += x * x;
}
// step 3: get result
float variable = square_sum / size;
variable = 1.f / sqrt(variable + epsilon);
auto variableVal = _mm_set1_ps(variable);
if (gamma && beta) {
for (int i = 0; i < count; i++) {
auto x = _mm_sub_ps(_mm_loadu_ps(srcf + i * 4), meanVal);
auto g = _mm_loadu_ps(gamma + i * 4);
auto b = _mm_loadu_ps(beta + i * 4);
auto y = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(x, g), variableVal), b);
_mm_storeu_ps(dstf + i * 4, y);
}
for (int i = remain; i < size; i++) {
dstf[i] = (srcf[i] - mean) * gamma[i] * variable + beta[i];
}
} else {
for (int i = 0; i < count; i++) {
auto x = _mm_sub_ps(_mm_loadu_ps(srcf + i * 4), meanVal);
auto y = _mm_mul_ps(x, variableVal);
_mm_storeu_ps(dstf + i * 4, y);
}
for (int i = remain; i < size; i++) {
dstf[i] = (srcf[i] - mean) * variable;
}
}
// step 4: Float -> Int8
_SSE_MNNFloat2Int8(dstf, dst, size / 4, outScale.data(), params->minValue, params->maxValue, params->outputZeroPoint[0]);
}

View File

@ -37,10 +37,10 @@ public:
// Do nothing // Do nothing
} }
virtual ~ CUDARuntimeAllocator() = default; virtual ~ CUDARuntimeAllocator() = default;
virtual std::pair<void*, size_t> onAlloc(size_t size, size_t align) override { virtual MemChunk onAlloc(size_t size, size_t align) override {
return std::make_pair(mRuntime->alloc(size), 0); return MemChunk(mRuntime->alloc(size), 0);
} }
virtual void onRelease(std::pair<void*, size_t> ptr) override { virtual void onRelease(MemChunk ptr) override {
mRuntime->free(ptr.first); mRuntime->free(ptr.first);
} }
private: private:
@ -58,7 +58,7 @@ CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, B
return; return;
} }
std::shared_ptr<BufferAllocator::Allocator> allocator(new CUDARuntimeAllocator(mCUDARuntime.get())); std::shared_ptr<BufferAllocator::Allocator> allocator(new CUDARuntimeAllocator(mCUDARuntime.get()));
mBufferPool.reset(new BufferAllocator(allocator)); mBufferPool.reset(new EagerBufferAllocator(allocator));
} }
mDefaultPrecision = precision; mDefaultPrecision = precision;
} }
@ -103,7 +103,7 @@ CUDABackend::CUDABackend(std::shared_ptr<BufferAllocator> st,
#ifdef LOG_VERBOSE #ifdef LOG_VERBOSE
MNN_PRINT("cuda backend create\n"); MNN_PRINT("cuda backend create\n");
#endif #endif
mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(st.get()))); mBufferPool.reset(new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(st.get())));
mStaticBufferPool = st; mStaticBufferPool = st;
mCUDARuntime = rt; mCUDARuntime = rt;
mUseFp16AsFp32 = (precision == 2); mUseFp16AsFp32 = (precision == 2);
@ -139,16 +139,19 @@ int CUDABackend::getPrecision() const {
class CUDAMemObj : public Backend::MemObj { class CUDAMemObj : public Backend::MemObj {
public: public:
CUDAMemObj(BufferAllocator* allocator, std::pair<void*, int> points) { CUDAMemObj(BufferAllocator* allocator, MemChunk points) {
mPoint = std::move(points); mPoint = std::move(points);
mAllocator = allocator; mAllocator = allocator;
} }
virtual ~ CUDAMemObj() { virtual ~ CUDAMemObj() {
mAllocator->free(mPoint); mAllocator->free(mPoint);
} }
MemChunk chunk() override {
return mPoint;
}
private: private:
BufferAllocator* mAllocator; BufferAllocator* mAllocator;
std::pair<void*, int> mPoint; MemChunk mPoint;
}; };
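The new chunk() accessor gives executors a backend-agnostic way to reach a tensor's backing storage, which is exactly what the Convolution1x1Strassen hunk earlier relies on (weightTensor below is illustrative):

// host or device memory alike, addressed through the same handle
MemChunk w = TensorUtils::getDescribe(weightTensor)->mem->chunk();
auto base = w.ptr(); // base + offset resolved by the chunk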
int CUDABackend::getBytes(const Tensor* tensor) const { int CUDABackend::getBytes(const Tensor* tensor) const {
auto bytes = tensor->getType().bytes(); auto bytes = tensor->getType().bytes();
@ -176,7 +179,7 @@ Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType
auto bytes = getBytes(nativeTensor); auto bytes = getBytes(nativeTensor);
size_t mallocSize = realSize(nativeTensor) * bytes; size_t mallocSize = realSize(nativeTensor) * bytes;
std::pair<void*, int> buffer; MemChunk buffer;
if (storageType == DYNAMIC_SEPERATE) { if (storageType == DYNAMIC_SEPERATE) {
buffer = mBufferPool->alloc(mallocSize, true); buffer = mBufferPool->alloc(mallocSize, true);
allocator = mBufferPool.get(); allocator = mBufferPool.get();
@ -191,7 +194,7 @@ Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType
if(nullptr == buffer.first) { if(nullptr == buffer.first) {
return nullptr; return nullptr;
}; };
auto host = (uint8_t*)buffer.first + buffer.second; auto host = buffer.ptr();
((Tensor*)nativeTensor)->buffer().device = (uint64_t)host; ((Tensor*)nativeTensor)->buffer().device = (uint64_t)host;
auto des = TensorUtils::getDescribe(nativeTensor); auto des = TensorUtils::getDescribe(nativeTensor);
des->extra.offset = buffer.second; des->extra.offset = buffer.second;
@ -380,7 +383,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
auto dstDevice = (dstTensor->deviceId() != 0 && dstTensor->deviceId() != 1); auto dstDevice = (dstTensor->deviceId() != 0 && dstTensor->deviceId() != 1);
MNN_ASSERT(srcDevice || dstDevice); MNN_ASSERT(srcDevice || dstDevice);
uint8_t* srcPtr = nullptr; uint8_t* srcPtr = nullptr;
std::pair<void*, int> tempSrcStorage; MemChunk tempSrcStorage;
auto bytes = getBytes(srcTensor); auto bytes = getBytes(srcTensor);
auto type = srcTensor->getType(); auto type = srcTensor->getType();
@ -434,18 +437,18 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
if (!srcDevice) { if (!srcDevice) {
auto cpuSize = srcTensor->size(); auto cpuSize = srcTensor->size();
tempSrcStorage = mStaticBufferPool->alloc(cpuSize); tempSrcStorage = mStaticBufferPool->alloc(cpuSize);
srcPtr = (uint8_t*)tempSrcStorage.first + tempSrcStorage.second; srcPtr = tempSrcStorage.ptr();
mCUDARuntime->memcpy(srcPtr, srcTensor->host<void>(), cpuSize, MNNMemcpyHostToDevice, mCUDARuntime->memcpy(srcPtr, srcTensor->host<void>(), cpuSize, MNNMemcpyHostToDevice,
true); true);
} else { } else {
srcPtr = (uint8_t*)srcTensor->deviceId(); srcPtr = (uint8_t*)srcTensor->deviceId();
} }
uint8_t* dstPtr = nullptr; uint8_t* dstPtr = nullptr;
std::pair<void*, int> tempDstStorage; MemChunk tempDstStorage;
if (!dstDevice) { if (!dstDevice) {
auto cpuSize = dstTensor->size(); auto cpuSize = dstTensor->size();
tempDstStorage = mStaticBufferPool->alloc(cpuSize); tempDstStorage = mStaticBufferPool->alloc(cpuSize);
dstPtr = (uint8_t*)tempDstStorage.first + tempDstStorage.second; dstPtr = tempDstStorage.ptr();
} else { } else {
dstPtr = (uint8_t*)dstTensor->deviceId(); dstPtr = (uint8_t*)dstTensor->deviceId();
} }
@ -462,7 +465,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
// MNN_PRINT("oncopybuffer dateType:%d->%d format:%d->%d\n", getDataType(srcTensor), getDataType(dstTensor), srcDimensionFormat, dstDimensionFormat); // MNN_PRINT("oncopybuffer dateType:%d->%d format:%d->%d\n", getDataType(srcTensor), getDataType(dstTensor), srcDimensionFormat, dstDimensionFormat);
std::unique_ptr<Tensor> wrapTensor; std::unique_ptr<Tensor> wrapTensor;
std::pair<void*, int> wrapSrcStorage; MemChunk wrapSrcStorage;
if (getDataType(srcTensor) != getDataType(dstTensor)) { if (getDataType(srcTensor) != getDataType(dstTensor)) {
auto dimType = Tensor::CAFFE; auto dimType = Tensor::CAFFE;
switch (TensorUtils::getDescribe(srcTensor)->dimensionFormat) { switch (TensorUtils::getDescribe(srcTensor)->dimensionFormat) {
@ -486,7 +489,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
wrapTensor.reset(Tensor::createDevice(srcTensor->shape(), dstTensor->getType(), dimType)); wrapTensor.reset(Tensor::createDevice(srcTensor->shape(), dstTensor->getType(), dimType));
wrapSrcStorage = mStaticBufferPool->alloc(realSize(wrapTensor.get()) * getBytes(dstTensor)); wrapSrcStorage = mStaticBufferPool->alloc(realSize(wrapTensor.get()) * getBytes(dstTensor));
// MNN_PRINT("warp:%d %d %d %d\n", realSize(wrapTensor.get()), getBytes(dstTensor), dstTensor->getType(), srcTensor->getDimensionType()); // MNN_PRINT("warp:%d %d %d %d\n", realSize(wrapTensor.get()), getBytes(dstTensor), dstTensor->getType(), srcTensor->getDimensionType());
wrapTensor.get()->buffer().device = (uint64_t)((uint8_t*)wrapSrcStorage.first + wrapSrcStorage.second); wrapTensor.get()->buffer().device = (uint64_t)(wrapSrcStorage.ptr());
auto dstType = getDataType(dstTensor); auto dstType = getDataType(dstTensor);
if (dstType != DataType_DT_FLOAT) { if (dstType != DataType_DT_FLOAT) {

View File

@ -41,7 +41,7 @@ public:
virtual float onGetMemoryInMB() override; virtual float onGetMemoryInMB() override;
private: private:
std::shared_ptr<BufferAllocator> mBufferPool; std::shared_ptr<EagerBufferAllocator> mBufferPool;
std::shared_ptr<CUDARuntime> mCUDARuntime; std::shared_ptr<CUDARuntime> mCUDARuntime;
bool mIsCreateError{false}; bool mIsCreateError{false};
BackendConfig::PrecisionMode mDefaultPrecision; BackendConfig::PrecisionMode mDefaultPrecision;

View File

@ -118,9 +118,9 @@ ErrorCode ArgMaxExecution::onResize(const std::vector<Tensor *> &inputs, const s
if(mSplitKernel) { if(mSplitKernel) {
mSecondArgLen = (mDim + ARG_REDUCE_NUM - 1) / ARG_REDUCE_NUM; mSecondArgLen = (mDim + ARG_REDUCE_NUM - 1) / ARG_REDUCE_NUM;
auto buffer_data = pool->alloc(mOutside * mInside * mSecondArgLen * bytes); auto buffer_data = pool->alloc(mOutside * mInside * mSecondArgLen * bytes);
mTempDataBuffer = (void*)((uint8_t*)buffer_data.first + buffer_data.second); mTempDataBuffer = (void*)(buffer_data.ptr());
auto buffer_index = pool->alloc(mOutside * mInside * mSecondArgLen * sizeof(int32_t)); auto buffer_index = pool->alloc(mOutside * mInside * mSecondArgLen * sizeof(int32_t));
mTempIndexBuffer = (void*)((uint8_t*)buffer_index.first + buffer_index.second); mTempIndexBuffer = (void*)(buffer_index.ptr());
pool->free(buffer_data); pool->free(buffer_data);
pool->free(buffer_index); pool->free(buffer_index);
} }

View File

@ -45,7 +45,7 @@ public:
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override; virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
protected: protected:
std::pair<void*, int> mConstBuffer; MemChunk mConstBuffer;
const Op *mOp; const Op *mOp;
int mTotalCount; int mTotalCount;
constBuffer parameters; constBuffer parameters;

View File

@ -155,7 +155,7 @@ ErrorCode DeconvSingleInputExecution::onResize(const std::vector<Tensor*> &input
// Alloc temp cuda memory // Alloc temp cuda memory
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool(); auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
std::pair<void*, size_t> buffer_input, buffer_im2col; MemChunk buffer_input, buffer_im2col;
if(mFp16Fp32MixInfer) { if(mFp16Fp32MixInfer) {
buffer_input = pool->alloc(sizeof(__half) * mGemmInfo.elhPad[1] * mGemmInfo.elh[2]); buffer_input = pool->alloc(sizeof(__half) * mGemmInfo.elhPad[1] * mGemmInfo.elh[2]);
mInputBuffer = (void*)((uint8_t*)buffer_input.first + buffer_input.second); mInputBuffer = (void*)((uint8_t*)buffer_input.first + buffer_input.second);

View File

@ -31,12 +31,23 @@ public:
// Do nothing // Do nothing
} }
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override { virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
mMaxFuseBufferSize = 0;
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(outputs[0]);
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
if (1 == mLoop->commands()->size()) { if (1 == mLoop->commands()->size()) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0); auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
auto op = cmd->op(); auto op = cmd->op();
if (OpType_MatMul == op->type() && mLoop->parallel() && mLoop->loopNumber() > 1) { if (OpType_MatMul == op->type() && mLoop->parallel() && mLoop->loopNumber() > 1) {
auto step = cmd->steps()->data(); auto step = cmd->steps()->data();
if (inputs.size() <= 3) { if (inputs.size() <= 3) {
if (cmd->fuse() >= 0) {
// Make Temp output buffer
auto size = cmd->size()->data();
mMaxFuseBufferSize = bytes * size[0] * size[2];
auto buffer = pool->alloc(mMaxFuseBufferSize);
mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);
pool->free(buffer);
}
auto& unit = mExecutions[0]; auto& unit = mExecutions[0];
int as = 1, bs = 1, cs = 1; int as = 1, bs = 1, cs = 1;
if (step[1] == 0) { if (step[1] == 0) {
@ -77,11 +88,28 @@ public:
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0); auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
auto op = cmd->op(); auto op = cmd->op();
if (OpType_UnaryOp == op->type() && nullptr == op->main()) { if (OpType_UnaryOp == op->type() && nullptr == op->main()) {
if (cmd->fuse() >= 0) {
// Make Temp output buffer
auto size = cmd->size()->data();
mMaxFuseBufferSize = mLoop->loopNumber() * bytes * size[0] * size[1] * size[2];
auto buffer = pool->alloc(mMaxFuseBufferSize);
mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);
pool->free(buffer);
}
return NO_ERROR; return NO_ERROR;
} }
} }
for (int i=0; i<mLoop->commands()->size(); ++i) { for (int i=0; i<mLoop->commands()->size(); ++i) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(i); auto cmd = mLoop->commands()->GetAs<RegionCommand>(i);
if (cmd->fuse() >= 0) {
// Make Temp output buffer
auto size = cmd->size()->data();
if (cmd->op()->type() == OpType_MatMul) {
mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[2]);
} else {
mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[1] * size[2]);
}
}
auto op = cmd->op(); auto op = cmd->op();
auto& unit = mExecutions[i]; auto& unit = mExecutions[i];
// Find indice and copy to cpu // Find indice and copy to cpu
@ -141,6 +169,11 @@ public:
continue; continue;
} }
} }
if(mMaxFuseBufferSize > 0) {
auto buffer = pool->alloc(mMaxFuseBufferSize);
mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);
pool->free(buffer);
}
return NO_ERROR; return NO_ERROR;
} }
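
The three fuse-sizing branches above all end with the same trick: size the worst-case temp buffer during onResize, record its address, and free the chunk straight back to the pool so the eager allocator can overlap it with later resize-time requests while keeping the address usable at execute time. Condensed into one pass (a sketch stitched from the hunks above; it relies on the dynamic pool recycling freed chunks rather than unmapping them):

    mMaxFuseBufferSize = 0;
    for (int i = 0; i < mLoop->commands()->size(); ++i) {
        auto cmd = mLoop->commands()->GetAs<RegionCommand>(i);
        if (cmd->fuse() < 0) {
            continue;                      // no epilogue op, no temp buffer
        }
        auto size = cmd->size()->data();
        if (cmd->op()->type() == OpType_MatMul) {
            // MatMul only materializes an e x h output plane
            mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[2]);
        } else {
            mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[1] * size[2]);
        }
    }
    if (mMaxFuseBufferSize > 0) {
        auto buffer = pool->alloc(mMaxFuseBufferSize);
        mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);  // keep the address
        pool->free(buffer);                // hand the chunk back immediately
    }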
@ -161,9 +194,7 @@ public:
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0); auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
auto op = cmd->op(); auto op = cmd->op();
if (OpType_UnaryOp == op->type() && nullptr == op->main() && cmd->fuse() < 0) {
if (OpType_UnaryOp == op->type() && nullptr == op->main()) {
Tensor::InsideDescribe::Region reg; Tensor::InsideDescribe::Region reg;
auto srcView = cmd->view()->GetAs<View>(1); auto srcView = cmd->view()->GetAs<View>(1);
auto dstView = cmd->view()->GetAs<View>(0); auto dstView = cmd->view()->GetAs<View>(0);
@ -187,14 +218,36 @@ public:
if (index1 >= 0) { if (index1 >= 0) {
srcIndice = (int32_t*)originInputs[index1]->deviceId(); srcIndice = (int32_t*)originInputs[index1]->deviceId();
} }
auto src = (uint8_t*)(input->deviceId()) + srcView->offset() * bytes;
auto dstOrigin = (output->deviceId()) + dstView->offset() * bytes;
auto dst = dstOrigin;
if(cmd->fuse() >= 0) {
dst = (uint64_t)mFuseBuffer;
}
BlitWithIndice( BlitWithIndice(
(uint8_t*)(output->deviceId()) + dstView->offset() * bytes, (uint8_t*)dst,
(uint8_t*)(input->deviceId()) + srcView->offset() * bytes, (uint8_t*)src,
dstIndice, srcIndice, index0, index1, dstIndice, srcIndice, index0, index1,
loopNumber, step0, step1, input->elementSize(), loopNumber, step0, step1, input->elementSize(),
reg, bytes, runtime); reg, bytes, runtime);
if(cmd->fuse() >= 0) {
auto opType = cmd->fuse();
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
auto srcStride0 = dstStride;
auto srcStride1 = dstStride;
int32_t tmpSize[3];
::memcpy(tmpSize, cmd->size()->data(), 3 * sizeof(int32_t));
tmpSize[0] *= loopNumber;
auto type = halide_type_of<float>();
if (static_cast<CUDABackend*>(backend())->useFp16()) {
type.bits = 16;
}
// MNN_PRINT("Binary Loop in optype:%d\n", opType);
BinaryBlit((uint8_t*)dstOrigin, (uint8_t*)dstOrigin, (const uint8_t*)dst,
tmpSize, srcStride0, srcStride1, dstStride, type, runtime, opType);
}
return NO_ERROR; return NO_ERROR;
} }
} }
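
The fuse branch just above is a two-step accumulate: the indexed blit first writes into the dense temp buffer, then one extra blit folds that result into whatever the output already holds. Restated compactly (names as in this hunk; cmd->fuse() carries the BinaryOp type):

    if (cmd->fuse() >= 0) {
        auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
        int32_t tmpSize[3];
        ::memcpy(tmpSize, cmd->size()->data(), 3 * sizeof(int32_t));
        tmpSize[0] *= loopNumber;           // one blit covered every iteration
        auto type = halide_type_of<float>();
        if (static_cast<CUDABackend*>(backend())->useFp16()) {
            type.bits = 16;                 // same descriptor, half storage
        }
        // output = fuseOp(output, temp): src0 is the existing output,
        // src1 is the region just written into mFuseBuffer, and dst
        // overwrites the output in place.
        BinaryBlit((uint8_t*)dstOrigin, (uint8_t*)dstOrigin, (const uint8_t*)dst,
                   tmpSize, dstStride, dstStride, dstStride, type, runtime,
                   cmd->fuse());
    }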
@ -220,12 +273,28 @@ public:
offset = offset * cmd->steps()->data()[v] + view->offset(); offset = offset * cmd->steps()->data()[v] + view->offset();
mStackPtr[tensorIndex] = tensor->deviceId() + offset * static_cast<CUDABackend*>(backend())->getBytes(tensor); mStackPtr[tensorIndex] = tensor->deviceId() + offset * static_cast<CUDABackend*>(backend())->getBytes(tensor);
} }
auto dstOrigin = mStackPtr[cmd->indexes()->data()[0]];
auto dst = dstOrigin;
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
int fuseOutputStride[3];
if(cmd->fuse() >= 0) {
dst = (uint64_t)mFuseBuffer;
dstStride = fuseOutputStride;
auto cmdSize = cmd->size()->data();
fuseOutputStride[0] = cmdSize[1] * cmdSize[2];
fuseOutputStride[1] = cmdSize[2];
fuseOutputStride[2] = 1;
}
if (OpType_UnaryOp == op->type()) { if (OpType_UnaryOp == op->type()) {
auto src = (float*)mStackPtr[cmd->indexes()->data()[1]]; auto src = (float*)mStackPtr[cmd->indexes()->data()[1]];
auto dst = (float*)mStackPtr[cmd->indexes()->data()[0]];
int unaryType = op->main_as_UnaryOp()->opType(); int unaryType = op->main_as_UnaryOp()->opType();
auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data(); auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data();
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
UnaryBlit((uint8_t*)dst, (const uint8_t*)src, cmd->size()->data(), srcStride, dstStride, bytes, runtime, unaryType); UnaryBlit((uint8_t*)dst, (const uint8_t*)src, cmd->size()->data(), srcStride, dstStride, bytes, runtime, unaryType);
continue; continue;
} }
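
The temp buffer is dense, so its strides are rebuilt from the command size rather than borrowed from the (possibly strided) output view; a worked instance:

    // With cmd->size() == {4, 3, 5} (hypothetical), element (i, j, k) of the
    // temp buffer lives at offset i*15 + j*5 + k, however strided the real
    // output view may be.
    int fuseOutputStride[3];
    fuseOutputStride[0] = 3 * 5;  // size[1] * size[2] = 15
    fuseOutputStride[1] = 5;      // size[2]
    fuseOutputStride[2] = 1;      // innermost axis is contiguous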
@ -234,13 +303,13 @@ public:
if (3 == size) { if (3 == size) {
unit.inputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[1]]; unit.inputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[1]];
unit.inputs[1]->buffer().device = mStackPtr[cmd->indexes()->data()[2]]; unit.inputs[1]->buffer().device = mStackPtr[cmd->indexes()->data()[2]];
unit.outputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[0]]; unit.outputs[0]->buffer().device = dst;
} else { } else {
MNN_ASSERT(4 == size); MNN_ASSERT(4 == size);
unit.inputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[1]]; unit.inputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[1]];
unit.inputs[1]->buffer().device = mStackPtr[cmd->indexes()->data()[2]]; unit.inputs[1]->buffer().device = mStackPtr[cmd->indexes()->data()[2]];
unit.inputs[2]->buffer().device = mStackPtr[cmd->indexes()->data()[3]]; unit.inputs[2]->buffer().device = mStackPtr[cmd->indexes()->data()[3]];
unit.outputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[0]]; unit.outputs[0]->buffer().device = dst;
} }
unit.exe->onExecute(unit.inputs, unit.outputs); unit.exe->onExecute(unit.inputs, unit.outputs);
continue; continue;
@ -252,16 +321,33 @@ public:
} }
auto src0 = mStackPtr[cmd->indexes()->data()[1]]; auto src0 = mStackPtr[cmd->indexes()->data()[1]];
auto src1 = mStackPtr[cmd->indexes()->data()[2]]; auto src1 = mStackPtr[cmd->indexes()->data()[2]];
auto dst = mStackPtr[cmd->indexes()->data()[0]];
auto opType = op->main_as_BinaryOp()->opType(); auto opType = op->main_as_BinaryOp()->opType();
auto srcStride0 = cmd->view()->GetAs<View>(1)->stride()->data(); auto srcStride0 = cmd->view()->GetAs<View>(1)->stride()->data();
auto srcStride1 = cmd->view()->GetAs<View>(2)->stride()->data(); auto srcStride1 = cmd->view()->GetAs<View>(2)->stride()->data();
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
// MNN_PRINT("Binary Loop in optype:%d\n", opType); // MNN_PRINT("Binary Loop in optype:%d\n", opType);
BinaryBlit((uint8_t*)dst, (const uint8_t*)src0, (const uint8_t*)src1, BinaryBlit((uint8_t*)dst, (const uint8_t*)src0, (const uint8_t*)src1,
cmd->size()->data(), srcStride0, srcStride1, dstStride, type, runtime, opType); cmd->size()->data(), srcStride0, srcStride1, dstStride, type, runtime, opType);
} }
if(cmd->fuse() >= 0) {
auto opType = cmd->fuse();
auto dstOriginStride = cmd->view()->GetAs<View>(0)->stride()->data();
auto type = halide_type_of<float>();
if (static_cast<CUDABackend*>(backend())->useFp16()) {
type.bits = 16;
}
// MNN_PRINT("Binary Loop in optype:%d\n", opType);
int32_t cmdSize[3];
::memcpy(cmdSize, cmd->size()->data(), 3*sizeof(int32_t));
if(OpType_MatMul == op->type()) {
cmdSize[1] = 1;
dstStride = dstOriginStride;
}
BinaryBlit((uint8_t*)dstOrigin, (uint8_t*)dstOrigin, (const uint8_t*)dst,
cmdSize, dstOriginStride, dstStride, dstOriginStride, type, runtime, opType);
}
} }
} }
return NO_ERROR; return NO_ERROR;
@ -274,6 +360,8 @@ private:
std::vector<uint64_t> mStackPtr; std::vector<uint64_t> mStackPtr;
std::map<Tensor*, Tensor*> mIndiceCopy; std::map<Tensor*, Tensor*> mIndiceCopy;
bool mSingleMatMul = false; bool mSingleMatMul = false;
int mMaxFuseBufferSize = 0;
void* mFuseBuffer = nullptr;
}; };
class LoopCreator : public CUDABackend::Creator { class LoopCreator : public CUDABackend::Creator {
@ -283,6 +371,13 @@ public:
if (op->main_type() != OpParameter_LoopParam) { if (op->main_type() != OpParameter_LoopParam) {
return nullptr; return nullptr;
} }
auto mLoop = op->main_as_LoopParam();
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
if(cmd->fuse() >= 0) {
// TODO: support fused loop commands here later
return nullptr;
}
return new CUDALoop(backend, op->main_as_LoopParam()); return new CUDALoop(backend, op->main_as_LoopParam());
} }
}; };
@ -290,4 +385,4 @@ public:
static CUDACreatorRegister<LoopCreator> __init(OpType_While); static CUDACreatorRegister<LoopCreator> __init(OpType_While);
}; };
}; };

View File

@ -848,21 +848,21 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
// MNN_PRINT("trAtrB:%d-%d, tmpAB:%d-%d inps:%d, bwlh:%d-%d-%d-%d\n", mTransposeA, mTransposeB, mNeedATempBuffer, mNeedBTempBuffer, inputs.size(), mBatch, mGemmInfo.elh[0], mGemmInfo.elh[1], mGemmInfo.elh[2]); // MNN_PRINT("trAtrB:%d-%d, tmpAB:%d-%d inps:%d, bwlh:%d-%d-%d-%d\n", mTransposeA, mTransposeB, mNeedATempBuffer, mNeedBTempBuffer, inputs.size(), mBatch, mGemmInfo.elh[0], mGemmInfo.elh[1], mGemmInfo.elh[2]);
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool(); auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
std::pair<void*, size_t> bufferAData, bufferBData; MemChunk bufferAData, bufferBData;
size_t convertBytes = 2; size_t convertBytes = 2;
if(mFp32Infer) { if(mFp32Infer) {
convertBytes = 4; convertBytes = 4;
} }
if((mNeedConvertMatAB && mFp16Fp32MixInfer) || mNeedATempBuffer) { if((mNeedConvertMatAB && mFp16Fp32MixInfer) || mNeedATempBuffer) {
bufferAData = pool->alloc(convertBytes * mBatch * mAs * mGemmInfo.elh[0] * mGemmInfo.elhPad[1]); bufferAData = pool->alloc(convertBytes * mBatch * mAs * mGemmInfo.elh[0] * mGemmInfo.elhPad[1]);
mTempMatA = (void*)((uint8_t*)bufferAData.first + bufferAData.second); mTempMatA = (void*)bufferAData.ptr();
} else { } else {
mTempMatA = (void *)A->deviceId(); mTempMatA = (void *)A->deviceId();
} }
if((mNeedConvertMatAB && mFp16Fp32MixInfer) || mNeedBTempBuffer) { if((mNeedConvertMatAB && mFp16Fp32MixInfer) || mNeedBTempBuffer) {
bufferBData = pool->alloc(convertBytes * mBatch * mBs * mGemmInfo.elh[2] * mGemmInfo.elhPad[1]); bufferBData = pool->alloc(convertBytes * mBatch * mBs * mGemmInfo.elh[2] * mGemmInfo.elhPad[1]);
mTempMatB = (void*)((uint8_t*)bufferBData.first + bufferBData.second); mTempMatB = (void*)bufferBData.ptr();
} else { } else {
mTempMatB = (void *)B->deviceId(); mTempMatB = (void *)B->deviceId();
} }
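
The A-side and B-side temp buffers above follow one pattern: allocate a padded or converted staging buffer only when mixed precision or repacking demands it, and otherwise alias the tensor's device memory directly. For the A matrix (a sketch using the names in this hunk):

    MemChunk bufferAData;
    bool needTempA = (mNeedConvertMatAB && mFp16Fp32MixInfer) || mNeedATempBuffer;
    if (needTempA) {
        // padded to elhPad[1] so the GEMM sees aligned leading dimensions
        bufferAData = pool->alloc(convertBytes * mBatch * mAs
                                  * mGemmInfo.elh[0] * mGemmInfo.elhPad[1]);
        mTempMatA = (void*)bufferAData.ptr();
    } else {
        mTempMatA = (void*)A->deviceId();   // zero-copy alias of the input
    }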

View File

@ -102,10 +102,10 @@ ErrorCode MultiInputConvDepthWiseExecution::onResize(const std::vector<Tensor *>
// prepare mParams.mFilter and mParams.mBias // prepare mParams.mFilter and mParams.mBias
auto pool = static_cast<CUDABackend*>(backend())->getStaticBufferPool(); auto pool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
std::pair<void*, int> bufferFilter = pool->alloc(mParams.numWeightPackTotal * sizeof(half)); auto bufferFilter = pool->alloc(mParams.numWeightPackTotal * sizeof(half));
mParams.mFilter = (void*)((uint8_t*)bufferFilter.first + bufferFilter.second); mParams.mFilter = (void*)((uint8_t*)bufferFilter.first + bufferFilter.second);
std::pair<void*, int> bufferBias = pool->alloc(mParams.numBiasPackTotal * sizeof(half)); auto bufferBias = pool->alloc(mParams.numBiasPackTotal * sizeof(half));
mParams.mBias = (void*)((uint8_t*)bufferBias.first + bufferBias.second); mParams.mBias = (void*)((uint8_t*)bufferBias.first + bufferBias.second);
pool->free(bufferFilter); pool->free(bufferFilter);

View File

@ -82,19 +82,19 @@ ErrorCode MultiInputConvExecution::onResize(const std::vector<Tensor*> &inputs,
elementBytes = 4; elementBytes = 4;
} }
std::pair<void*, int> bufferFilter; MemChunk bufferFilter;
if(mNeedWeightFill) { if(mNeedWeightFill) {
bufferFilter = pool->alloc(elementBytes * (size_t)mGemmInfo.elhPad[1] * (size_t)mGemmInfo.elhPad[2]); bufferFilter = pool->alloc(elementBytes * (size_t)mGemmInfo.elhPad[1] * (size_t)mGemmInfo.elhPad[2]);
mFilterAddr = (void*)((uint8_t*)bufferFilter.first + bufferFilter.second); mFilterAddr = (void*)(bufferFilter.ptr());
} else { } else {
mFilterAddr = (void*)inputs[1]->deviceId(); mFilterAddr = (void*)inputs[1]->deviceId();
} }
// Copy Bias // Copy Bias
std::pair<void*, int> bufferBias; MemChunk bufferBias;
if(mNeedBiasFill) { if(mNeedBiasFill) {
bufferBias = pool->alloc(elementBytes * (size_t)mGemmInfo.elhPad[2]); bufferBias = pool->alloc(elementBytes * (size_t)mGemmInfo.elhPad[2]);
mBiasAddr = (void*)((uint8_t*)bufferBias.first + bufferBias.second); mBiasAddr = (void*)(bufferBias.ptr());
} else { } else {
mBiasAddr = (void*)inputs[2]->deviceId(); mBiasAddr = (void*)inputs[2]->deviceId();
@ -107,10 +107,10 @@ ErrorCode MultiInputConvExecution::onResize(const std::vector<Tensor*> &inputs,
mIm2ColParamter.padX == 0 && mIm2ColParamter.padY == 0); mIm2ColParamter.padX == 0 && mIm2ColParamter.padY == 0);
mNeedIm2Col = !(mIsConv1x1S1D1P0 && (mFp16Infer || mFp32Infer)); mNeedIm2Col = !(mIsConv1x1S1D1P0 && (mFp16Infer || mFp32Infer));
std::pair<void*, int> bufferIm2Col; MemChunk bufferIm2Col;
if(mNeedIm2Col) { if(mNeedIm2Col) {
bufferIm2Col = pool->alloc(elementBytes * (size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elhPad[1]); bufferIm2Col = pool->alloc(elementBytes * (size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elhPad[1]);
mIm2ColBuffer = (void*)((uint8_t*)bufferIm2Col.first + bufferIm2Col.second); mIm2ColBuffer = (void*)(bufferIm2Col.ptr());
} }
// free for Reuse // free for Reuse

View File

@ -84,21 +84,21 @@ ErrorCode MultiInputDeconvExecution::onResize(const std::vector<Tensor*> &inputs
// Alloc temp cuda memory // Alloc temp cuda memory
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool(); auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
std::pair<void*, size_t> buffer_input, buffer_im2col; MemChunk buffer_input, buffer_im2col;
if(mFp16Fp32MixInfer) { if(mFp16Fp32MixInfer) {
buffer_input = pool->alloc(sizeof(__half) * mGemmInfo.elhPad[1] * mGemmInfo.elh[2]); buffer_input = pool->alloc(sizeof(__half) * mGemmInfo.elhPad[1] * mGemmInfo.elh[2]);
mInputBuffer = (void*)((uint8_t*)buffer_input.first + buffer_input.second); mInputBuffer = (void*)buffer_input.ptr();
} else { } else {
mInputBuffer = (void*)input->deviceId(); mInputBuffer = (void*)input->deviceId();
} }
buffer_im2col = pool->alloc(bytes * mGemmInfo.elh[0] * mGemmInfo.elhPad[2]); buffer_im2col = pool->alloc(bytes * mGemmInfo.elh[0] * mGemmInfo.elhPad[2]);
mIm2ColBuffer = (void*)((uint8_t*)buffer_im2col.first + buffer_im2col.second); mIm2ColBuffer = (void*)buffer_im2col.ptr();
mNeedWeightFill = (mGemmInfo.elh[1] != mGemmInfo.elhPad[1]); mNeedWeightFill = (mGemmInfo.elh[1] != mGemmInfo.elhPad[1]);
std::pair<void*, int> buffer_filter; MemChunk buffer_filter;
if(mNeedWeightFill) { if(mNeedWeightFill) {
buffer_filter = pool->alloc(bytes * (size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elhPad[1]); buffer_filter = pool->alloc(bytes * (size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elhPad[1]);
mFilterAddr = (void*)((uint8_t*)buffer_filter.first + buffer_filter.second); mFilterAddr = (void*)buffer_filter.ptr();
} else { } else {
mFilterAddr = (void*)inputs[1]->deviceId(); mFilterAddr = (void*)inputs[1]->deviceId();
} }

View File

@ -31,7 +31,7 @@ private:
int mCount; int mCount;
int mChannel; int mChannel;
int mArea; int mArea;
std::pair<void*, int> mPreluStorage; MemChunk mPreluStorage;
bool mIsChannelShared = false; bool mIsChannelShared = false;
}; };

View File

@ -203,12 +203,14 @@ UNARY_FUNC(GELU_STANDARD, (erf(x*0.7071067932881648f)+1.f)*x*0.5);
void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime) { void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime) {
int count = size[0] * size[1] * size[2]; int count = size[0] * size[1] * size[2];
// MNN_PRINT("blit info size:%d-%d-%d, srcStride:%d-%d-%d, dstStride:%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]); // MNN_PRINT("blit info size:%d-%d-%d, srcStride:%d-%d-%d, dstStride:%d-%d-%d, ptr:%p %p\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2], input, output);
bool isThirdSizeVector = (size[2] % 2 == 0 && srcStride[2] == 1 && dstStride[2] == 1); bool isThirdSizeVector = (size[2] % 2 == 0 && srcStride[2] == 1 && dstStride[2] == 1);
bool isSecondSizeVector = (size[1] % 2 == 0 && srcStride[1] == 1 && dstStride[1] == 1) && (size[2] == 1 && srcStride[2] == 1 && dstStride[2] == 1); bool isSecondSizeVector = (size[1] % 2 == 0 && srcStride[1] == 1 && dstStride[1] == 1) && (size[2] == 1 && srcStride[2] == 1 && dstStride[2] == 1);
bool isFirstSizeVector = (size[0] % 2 == 0 && srcStride[0] == 1 && dstStride[0] == 1) && (size[1] == 1 && srcStride[1] == 1 && dstStride[1] == 1) && (size[2] == 1 && srcStride[2] == 1 && dstStride[2] == 1); bool isFirstSizeVector = (size[0] % 2 == 0 && srcStride[0] == 1 && dstStride[0] == 1) && (size[1] == 1 && srcStride[1] == 1 && dstStride[1] == 1) && (size[2] == 1 && srcStride[2] == 1 && dstStride[2] == 1);
bool isStrideVector = (srcStride[0] % 2 == 0 || srcStride[0] == 1) && (srcStride[1] % 2 == 0 || srcStride[1] == 1) && (srcStride[2] % 2 == 0 || srcStride[2] == 1) && \
(dstStride[0] % 2 == 0 || dstStride[0] == 1) && (dstStride[1] % 2 == 0 || dstStride[1] == 1) && (dstStride[2] % 2 == 0 || dstStride[2] == 1);
bool isSizeVector = isThirdSizeVector || isSecondSizeVector || isFirstSizeVector; bool isSizeVector = isThirdSizeVector || isSecondSizeVector || isFirstSizeVector;
if(count > 16384 && isSizeVector) { if(count > 16384 && isSizeVector && isStrideVector) {
int32_t newSize[3], newSrcStride[3], newDstStride[3]; int32_t newSize[3], newSrcStride[3], newDstStride[3];
newSize[0] = size[0]; newSize[0] = size[0];
newSize[1] = size[1]; newSize[1] = size[1];
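
The new isStrideVector guard exists because the vector path re-types pairs of elements as one 2-wide element; a sketch of the index rewrite it protects (third-axis case, consistent with the newSize/newSrcStride/newDstStride variables above):

    // Pairs of elements become one half2/float2-style element, so the
    // vector axis keeps stride 1 while every other stride, and the size
    // along the vector axis, are halved:
    int32_t newSize[3]      = { size[0],          size[1],          size[2] / 2 };
    int32_t newSrcStride[3] = { srcStride[0] / 2, srcStride[1] / 2, 1 };
    int32_t newDstStride[3] = { dstStride[0] / 2, dstStride[1] / 2, 1 };
    // An odd stride > 1 would split a 2-wide load across non-adjacent
    // elements, which is why isStrideVector demands every stride be even or 1.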

View File

@ -32,7 +32,7 @@ private:
int mCount; int mCount;
int mChannel; int mChannel;
int mArea; int mArea;
std::pair<void*, int> mScaleBiasStorage; MemChunk mScaleBiasStorage;
}; };
} // namespace CUDA } // namespace CUDA

View File

@ -31,7 +31,7 @@ private:
Tensor mStorage; Tensor mStorage;
bool mNeedUnpackC4; bool mNeedUnpackC4;
ReduceParam mCpuParam; ReduceParam mCpuParam;
std::pair<void*, int> mParam; MemChunk mParam;
}; };
} // namespace CUDA } // namespace CUDA

View File

@ -235,23 +235,23 @@ ErrorCode TopKV2Execution::onResize(const std::vector<Tensor *> &inputs, const s
auto pool = static_cast<CUDABackend*>(backend())->getStaticBufferPool(); auto pool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
if (inputTensor->getType().code == halide_type_int && inputTensor->getType().bits == 32) { if (inputTensor->getType().code == halide_type_int && inputTensor->getType().bits == 32) {
std::pair<void*, int> bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int)); auto bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
mParams.mBufferIndices = (void*)((uint8_t*)bufferIndices.first + bufferIndices.second); mParams.mBufferIndices = (void*)((uint8_t*)bufferIndices.first + bufferIndices.second);
std::pair<void*, int> bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int)); auto bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second); mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second);
pool->free(bufferIndices); pool->free(bufferIndices);
pool->free(bufferValues); pool->free(bufferValues);
} else if (static_cast<CUDABackend*>(backend())->useFp16()) { } else if (static_cast<CUDABackend*>(backend())->useFp16()) {
std::pair<void*, int> bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int)); auto bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
mParams.mBufferIndices = (void*)((uint8_t*)bufferIndices.first + bufferIndices.second); mParams.mBufferIndices = (void*)((uint8_t*)bufferIndices.first + bufferIndices.second);
std::pair<void*, int> bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(half)); auto bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(half));
mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second); mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second);
pool->free(bufferIndices); pool->free(bufferIndices);
pool->free(bufferValues); pool->free(bufferValues);
} else { } else {
std::pair<void*, int> bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int)); auto bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
mParams.mBufferIndices = (void*)((uint8_t*)bufferIndices.first + bufferIndices.second); mParams.mBufferIndices = (void*)((uint8_t*)bufferIndices.first + bufferIndices.second);
std::pair<void*, int> bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(float)); auto bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(float));
mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second); mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second);
pool->free(bufferIndices); pool->free(bufferIndices);
pool->free(bufferValues); pool->free(bufferValues);

View File

@ -41,13 +41,13 @@ protected:
const Op* mOp = nullptr; const Op* mOp = nullptr;
ConvolutionCommon::Im2ColParameter mIm2ColParamter; ConvolutionCommon::Im2ColParameter mIm2ColParamter;
std::pair<void*, int> mGpuIm2ColParam; MemChunk mGpuIm2ColParam;
void* mIm2ColBuffer; void* mIm2ColBuffer;
bool mIsConv1x1S1D1P0 = false; bool mIsConv1x1S1D1P0 = false;
bool mNeedIm2Col = true; bool mNeedIm2Col = true;
std::pair<void*, int> mGpuKernelParam; MemChunk mGpuKernelParam;
bool mIsBlock = false; bool mIsBlock = false;
int mBlockNum = 1; int mBlockNum = 1;

View File

@ -71,13 +71,13 @@ private:
CutlassGemmInfo mGemmInfo; CutlassGemmInfo mGemmInfo;
ConvolutionCommon::Im2ColParameter mIm2ColParamter; ConvolutionCommon::Im2ColParameter mIm2ColParamter;
std::pair<void*, int> mGpuIm2ColParam; MemChunk mGpuIm2ColParam;
void* mIm2ColBuffer; void* mIm2ColBuffer;
bool mIsConv1x1S1D1P0 = false; bool mIsConv1x1S1D1P0 = false;
bool mNeedIm2Col = true; bool mNeedIm2Col = true;
std::pair<void*, int> mGpuKernelParam; MemChunk mGpuKernelParam;
bool mIsBlock = false; bool mIsBlock = false;
int mBlockNum = 1; int mBlockNum = 1;

View File

@ -38,7 +38,7 @@ private:
int mChannel; int mChannel;
int mCount; int mCount;
int mArea; int mArea;
std::pair<void*, int> mScaleStorage; MemChunk mScaleStorage;
}; };
} // namespace CUDA } // namespace CUDA

View File

@ -35,7 +35,7 @@ private:
int mChannel; int mChannel;
int mCount; int mCount;
int mArea; int mArea;
std::pair<void*, int> mScaleStorage; MemChunk mScaleStorage;
}; };
} // namespace CUDA } // namespace CUDA

View File

@ -64,7 +64,7 @@ public:
private: private:
MetalRuntime(void* context); MetalRuntime(void* context);
void* mContext = nullptr; void* mContext = nullptr;
std::shared_ptr<BufferAllocator> mStatic; std::shared_ptr<EagerBufferAllocator> mStatic;
MetalTuneLevel mTuneLevel = Wide; MetalTuneLevel mTuneLevel = Wide;
std::map<std::pair<std::string, std::vector<uint32_t>>, std::tuple<std::vector<uint32_t>, std::vector<uint32_t>, uint32_t>> mTunedThreadGroup; std::map<std::pair<std::string, std::vector<uint32_t>>, std::tuple<std::vector<uint32_t>, std::vector<uint32_t>, uint32_t>> mTunedThreadGroup;
@ -76,7 +76,7 @@ private:
}; };
class MetalRuntimeAllocator : public BufferAllocator::Allocator { class MetalRuntimeAllocator : public EagerBufferAllocator::Allocator {
public: public:
class MetalBufferAlloc { class MetalBufferAlloc {
public: public:
@ -95,8 +95,8 @@ public:
// Do nothing // Do nothing
} }
virtual ~ MetalRuntimeAllocator() = default; virtual ~ MetalRuntimeAllocator() = default;
virtual std::pair<void*, size_t> onAlloc(size_t size, size_t align) override; virtual MemChunk onAlloc(size_t size, size_t align) override;
virtual void onRelease(std::pair<void*, size_t> ptr) override; virtual void onRelease(MemChunk ptr) override;
private: private:
id<MTLDevice> mDevice; id<MTLDevice> mDevice;
@ -127,7 +127,7 @@ public:
id<MTLBuffer> getHostBuffer(size_t size) const; id<MTLBuffer> getHostBuffer(size_t size) const;
id<MTLBuffer> getConstBuffer(size_t size) const; id<MTLBuffer> getConstBuffer(size_t size) const;
public: public:
MetalBackend(std::shared_ptr<BufferAllocator> staticMem, const MetalRuntime* runtime); MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime);
virtual ~MetalBackend(); virtual ~MetalBackend();
const MetalRuntime* runtime() const { const MetalRuntime* runtime() const {
return mRuntime; return mRuntime;
@ -169,10 +169,10 @@ public:
bool isCommandEncoderSet(); bool isCommandEncoderSet();
void setOpEncoder() const; void setOpEncoder() const;
BufferAllocator *getBufferPool() const { EagerBufferAllocator *getBufferPool() const {
return mBufferPool.get(); return mBufferPool.get();
} }
BufferAllocator *getStaticBufferPool() const { EagerBufferAllocator *getStaticBufferPool() const {
return mStaticBufferPool.get(); return mStaticBufferPool.get();
} }
@ -190,8 +190,8 @@ private:
std::vector<std::function<void(void)>> mOpEncoders; std::vector<std::function<void(void)>> mOpEncoders;
mutable id<MTLComputeCommandEncoder> mComputeEncoder = nil; mutable id<MTLComputeCommandEncoder> mComputeEncoder = nil;
std::shared_ptr<BufferAllocator> mBufferPool; std::shared_ptr<EagerBufferAllocator> mBufferPool;
std::shared_ptr<BufferAllocator> mStaticBufferPool; std::shared_ptr<EagerBufferAllocator> mStaticBufferPool;
private: private:
mutable id<MTLBuffer> mHostBuffer = nullptr; mutable id<MTLBuffer> mHostBuffer = nullptr;

View File

@ -50,9 +50,9 @@ void MetalBackend::addCreator(OpType t, Creator *c) {
map->insert(std::make_pair(t, c)); map->insert(std::make_pair(t, c));
} }
MetalBackend::MetalBackend(std::shared_ptr<BufferAllocator> staticMem, const MetalRuntime* runtime) : Backend(MNN_FORWARD_METAL) { MetalBackend::MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime) : Backend(MNN_FORWARD_METAL) {
mRuntime = runtime; mRuntime = runtime;
mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(staticMem.get()), 1024)); mBufferPool.reset(new EagerBufferAllocator(EagerBufferAllocator::Allocator::createRecurse(staticMem.get()), 1024));
mStaticBufferPool = staticMem; mStaticBufferPool = staticMem;
mShapeH2D = getConstBuffer(4 * sizeof(int)); mShapeH2D = getConstBuffer(4 * sizeof(int));
mShapeD2H = getConstBuffer(4 * sizeof(int)); mShapeD2H = getConstBuffer(4 * sizeof(int));
@ -67,16 +67,19 @@ void *MetalBackend::context() const {
class MetalMemRelease : public Backend::MemObj { class MetalMemRelease : public Backend::MemObj {
public: public:
MetalMemRelease(std::pair<void*, int> buffer, BufferAllocator* allocator) { MetalMemRelease(MemChunk buffer, EagerBufferAllocator* allocator) {
mBuffer = buffer; mBuffer = buffer;
mAllocator = allocator; mAllocator = allocator;
} }
virtual ~ MetalMemRelease() { virtual ~ MetalMemRelease() {
mAllocator->free(mBuffer); mAllocator->free(mBuffer);
} }
MemChunk chunk() override {
return mBuffer;
}
private: private:
std::pair<void*, int> mBuffer; MemChunk mBuffer;
BufferAllocator* mAllocator; EagerBufferAllocator* mAllocator;
}; };
Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType storageType) { Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType storageType) {
auto tensor = const_cast<Tensor *>(_tensor); auto tensor = const_cast<Tensor *>(_tensor);
@ -115,8 +118,8 @@ Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType stor
} }
// reuse if possible // reuse if possible
std::pair<void*, int> buffer; MemChunk buffer;
BufferAllocator* allocator = nullptr; EagerBufferAllocator* allocator = nullptr;
switch (storageType) { switch (storageType) {
case Backend::STATIC: { case Backend::STATIC: {
buffer = mStaticBufferPool->alloc(size, false); buffer = mStaticBufferPool->alloc(size, false);
@ -656,8 +659,8 @@ MetalRuntime* MetalRuntime::create(const Backend::Info& info, id<MTLDevice> devi
MetalRuntime::MetalRuntime(void* context) { MetalRuntime::MetalRuntime(void* context) {
mContext = context; mContext = context;
auto ctx = (__bridge MNNMetalContext *)mContext; auto ctx = (__bridge MNNMetalContext *)mContext;
std::shared_ptr<BufferAllocator::Allocator> allocator(new MetalRuntimeAllocator([ctx device])); std::shared_ptr<EagerBufferAllocator::Allocator> allocator(new MetalRuntimeAllocator([ctx device]));
mStatic.reset(new BufferAllocator(allocator)); mStatic.reset(new EagerBufferAllocator(allocator));
mTunedInfo = new TunedInfo; mTunedInfo = new TunedInfo;
} }
@ -859,12 +862,12 @@ bool MetalRuntime::onSetCache(const void* buffer, size_t size) {//set Cache
return setCache(std::make_pair(buffer, size)); return setCache(std::make_pair(buffer, size));
} }
std::pair<void*, size_t> MetalRuntimeAllocator::onAlloc(size_t size, size_t align) { MemChunk MetalRuntimeAllocator::onAlloc(size_t size, size_t align) {
auto buffer = [mDevice newBufferWithLength:size options:MTLCPUCacheModeDefaultCache]; auto buffer = [mDevice newBufferWithLength:size options:MTLCPUCacheModeDefaultCache];
auto mMetalBufferAlloc = new MetalBufferAlloc(buffer); auto mMetalBufferAlloc = new MetalBufferAlloc(buffer);
return std::make_pair((void *)mMetalBufferAlloc, 0); return MemChunk((void *)mMetalBufferAlloc, 0);
} }
void MetalRuntimeAllocator::onRelease(std::pair<void*, size_t> ptr) { void MetalRuntimeAllocator::onRelease(MemChunk ptr) {
delete (MetalBufferAlloc *)ptr.first; delete (MetalBufferAlloc *)ptr.first;
} }
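
Unlike the CUDA pools, the Metal allocator's chunk does not point at raw GPU memory: onAlloc heap-allocates a MetalBufferAlloc wrapper owning the id<MTLBuffer> and stores that wrapper in the chunk's pointer slot with offset 0, so onRelease only has to delete the wrapper. The ownership shape in plain C++ (GpuHandle is a hypothetical stand-in for MetalBufferAlloc):

    // Ownership sketch: the chunk carries a heap wrapper, not device memory.
    struct GpuHandle { /* owns the driver allocation, cf. MetalBufferAlloc */ };

    MemChunk onAllocSketch(size_t size) {
        (void)size;                          // real code passes size to newBufferWithLength
        auto* wrapper = new GpuHandle();     // wrapper owns the MTLBuffer
        return MemChunk((void*)wrapper, 0);  // first = wrapper, second = 0
    }

    void onReleaseSketch(MemChunk chunk) {
        delete (GpuHandle*)chunk.first;      // releases the driver allocation
    }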

View File

@ -9,6 +9,7 @@
#include "backend/opencl/core/OpenCLBackend.hpp" #include "backend/opencl/core/OpenCLBackend.hpp"
#include "MNN_generated.h" #include "MNN_generated.h"
#include "core/BufferAllocator.hpp"
#include "core/TensorUtils.hpp" #include "core/TensorUtils.hpp"
#include "shape/SizeComputer.hpp" #include "shape/SizeComputer.hpp"
#include <map> #include <map>
@ -907,25 +908,14 @@ void OpenCLBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTenso
#ifdef LOG_VERBOSE #ifdef LOG_VERBOSE
MNN_PRINT("Start onCopyBuffer !\n"); MNN_PRINT("Start onCopyBuffer !\n");
#endif #endif
//int8 if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) {
if(srcTensor->getType().code == halide_type_int && srcTensor->getType().bits == 8){ copyToDevice(srcTensor, dstTensor);
if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) { }else if(srcTensor->deviceId() != 0 && dstTensor->deviceId() == 0){
copyToDeviceInt8(srcTensor, dstTensor); copyFromDevice(srcTensor, dstTensor);
}else if(srcTensor->deviceId() != 0 && dstTensor->deviceId() == 0){ }else if(srcTensor->deviceId() != 0 && dstTensor->deviceId() != 0){
copyFromDeviceInt8(srcTensor, dstTensor); mCLRuntime->copyBetweenDevice(srcTensor, dstTensor);
}else{
MNN_PRINT("onCopyBuffer int8 error !!! \n");
}
}else{ }else{
if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) { MNN_PRINT("onCopyBuffer float error !!! \n");
copyToDevice(srcTensor, dstTensor);
}else if(srcTensor->deviceId() != 0 && dstTensor->deviceId() == 0){
copyFromDevice(srcTensor, dstTensor);
}else if(srcTensor->deviceId() != 0 && dstTensor->deviceId() != 0){
mCLRuntime->copyBetweenDevice(srcTensor, dstTensor);
}else{
MNN_PRINT("onCopyBuffer float error !!! \n");
}
} }
#ifdef LOG_VERBOSE #ifdef LOG_VERBOSE

View File

@ -0,0 +1,150 @@
//
// ArgMaxBufExecution.cpp
// MNN
//
// Created by MNN on 2023/08/11.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_OPENCL_BUFFER_CLOSED
#include "backend/opencl/execution/buffer/ArgMaxBufExecution.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "backend/opencl/core/OpenCLBackend.hpp"
namespace MNN {
namespace OpenCL {
ArgMaxBufExecution::ArgMaxBufExecution(const std::string &compute, Backend* backend, const int axis) : Execution(backend) {
mBuildOptions.emplace(compute);
mAxis = axis;
// Do nothing
}
ErrorCode ArgMaxBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
auto runtime = openCLBackend->getOpenCLRuntime();
auto input = inputs[0];
auto output = outputs[0];
if(mAxis < 0){
mAxis = input->dimensions() + mAxis;
}
int inside = 1;
int outside = 1;
for(int i = 0; i < mAxis; ++i){
outside *= input->length(i);
}
for(int i = mAxis + 1; i < input->dimensions(); ++i){
inside *= input->length(i);
}
int dim = input->length(mAxis);
std::vector<int> inputShape = tensorShapeFormat(input);
std::vector<int> outputShape = tensorShapeFormat(output);
int batch = inputShape.at(0);
int inputHeight = inputShape.at(1);
int inputWidth = inputShape.at(2);
int inputChannels = inputShape.at(3);
int inputChannelBlocks = (inputChannels + 3) / 4;
int outputBatch = outputShape.at(0);
int outputHeight = outputShape.at(1);
int outputWidth = outputShape.at(2);
int outputChannels = outputShape.at(3);
int outputChannelBlocks = (outputChannels + 3) / 4;
mGlobalWorkSize = {
static_cast<uint32_t>(outputWidth),
static_cast<uint32_t>(outputHeight),
static_cast<uint32_t>(outputBatch * outputChannelBlocks)
};
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
mKernel = runtime->buildKernel("argmax_buf", "argmax_width_buf", mBuildOptions);
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
mKernel = runtime->buildKernel("argmax_buf", "argmax_height_buf", mBuildOptions);
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
if(output->buffer().dimensions == 1){
mKernel = runtime->buildKernel("argmax_buf", "argmax_channel_dim1_buf", mBuildOptions);
}else{
mKernel = runtime->buildKernel("argmax_buf", "argmax_channel_buf", mBuildOptions);
}
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
mKernel = runtime->buildKernel("argmax_buf", "argmax_batch_buf", mBuildOptions);
}
mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
uint32_t idx = 0;
cl_int ret = CL_SUCCESS;
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
ret |= mKernel.setArg(idx++, mGlobalWorkSize[2]);
ret |= mKernel.setArg(idx++, openCLBuffer(input));
ret |= mKernel.setArg(idx++, openCLBuffer(output));
ret |= mKernel.setArg(idx++, inputWidth);
ret |= mKernel.setArg(idx++, inputHeight);
ret |= mKernel.setArg(idx++, inputChannels);
ret |= mKernel.setArg(idx++, batch);
ret |= mKernel.setArg(idx++, inputChannelBlocks);
ret |= mKernel.setArg(idx++, outputWidth);
ret |= mKernel.setArg(idx++, outputHeight);
ret |= mKernel.setArg(idx++, outputChannels);
ret |= mKernel.setArg(idx++, outputChannelBlocks);
MNN_CHECK_CL_SUCCESS(ret, "setArg ArgMaxBufExecution");
std::string kernelName = "gargmax_buf";
mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
return NO_ERROR;
}
ErrorCode ArgMaxBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
#ifdef LOG_VERBOSE
MNN_PRINT("start ArgMaxBufExecution onExecute...");
#endif
auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());
#ifdef ENABLE_OPENCL_TIME_PROFILER
cl::Event event;
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
mOpenCLBackend->getOpenCLRuntime(), &event);
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
MNN_PRINT("kernel cost:%d us ArgMax\n",costTime);
#else
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
mOpenCLBackend->getOpenCLRuntime());
#endif
#ifdef LOG_VERBOSE
MNN_PRINT("end ArgMaxBufExecution onExecute...");
#endif
return NO_ERROR;
}
class ArgMaxBufCreator : public OpenCLBackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
for (int i = 0; i < inputs.size(); ++i) {
TensorUtils::setTensorSupportPack(inputs[i], false);
}
for (int i = 0; i < outputs.size(); ++i) {
TensorUtils::setTensorSupportPack(outputs[i], false);
}
auto inputDimensionFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
if(inputDimensionFormat == MNN_DATA_FORMAT_NC4HW4){
return nullptr;
}
int axis = op->main_as_ArgMax()->axis();
if (op->type() == OpType_ArgMax) {
return new ArgMaxBufExecution("-DARGMAX", backend, axis);
}else{
return new ArgMaxBufExecution("", backend, axis);
}
}
};
OpenCLCreatorRegister<ArgMaxBufCreator> __ArgMaxBuf__(OpType_ArgMax, BUFFER);
OpenCLCreatorRegister<ArgMaxBufCreator> __ArgMinBuf__(OpType_ArgMin, BUFFER);
} // namespace OpenCL
} // namespace MNN
#endif /* MNN_OPENCL_BUFFER_CLOSED */
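
The kernel dispatch in onResize above hinges on factoring the shape around the reduced axis; a worked example with hypothetical dimensions:

    // For an input with dimensions {2, 3, 16, 4} and mAxis == 2:
    //   outside = 2 * 3 = 6      product of dims before the axis
    //   dim     = 16             the reduced axis itself
    //   inside  = 4              product of dims after the axis
    int inside = 1, outside = 1;
    for (int i = 0; i < mAxis; ++i) {
        outside *= input->length(i);
    }
    for (int i = mAxis + 1; i < input->dimensions(); ++i) {
        inside *= input->length(i);
    }
    int dim = input->length(mAxis);
    // onResize then matches (outside, inside, dim) against the NHWC view
    // from tensorShapeFormat to decide which specialized kernel
    // (width / height / channel / batch) can service the reduction.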

View File

@ -0,0 +1,43 @@
//
// ArgMaxBufExecution.hpp
// MNN
//
// Created by MNN on 2023/08/11.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_OPENCL_BUFFER_CLOSED
#ifndef ArgMaxBufExecution_hpp
#define ArgMaxBufExecution_hpp
#include "core/Execution.hpp"
#include <vector>
#include "MNN_generated.h"
#include "backend/opencl/core/OpenCLBackend.hpp"
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
namespace MNN {
namespace OpenCL {
class ArgMaxBufExecution : public Execution {
public:
ArgMaxBufExecution(const std::string &compute, Backend *backend, const int axis);
virtual ~ArgMaxBufExecution() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
cl::Kernel mKernel;
uint32_t mMaxWorkGroupSize;
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
std::vector<uint32_t> mLocalSize = {1, 1, 1};
std::set<std::string> mBuildOptions;
int mAxis;
};
} // namespace OpenCL
} // namespace MNN
#endif /* ArgMaxBufExecution_hpp */
#endif/* MNN_OPENCL_BUFFER_CLOSED */

View File

@ -0,0 +1,161 @@
//
// CastBufExecution.cpp
// MNN
//
// Created by MNN on 2023/08/11.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_OPENCL_BUFFER_CLOSED
#include "backend/opencl/execution/buffer/CastBufExecution.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "backend/opencl/core/OpenCLBackend.hpp"
namespace MNN {
namespace OpenCL {
CastBufExecution::CastBufExecution(const std::string& compute, Backend* backend) : Execution(backend) {
mBuildOptions.emplace(compute);
}
ErrorCode CastBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
Tensor* input = inputs[0];
Tensor* output = outputs[0];
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
auto runtime = openCLBackend->getOpenCLRuntime();
#ifdef MNN_SUPPORT_INTEL_SUBGROUP
if (runtime->isSupportedIntelSubgroup()) {
return SubgrouponResize(inputs, outputs);
}
#endif /* MNN_SUPPORT_INTEL_SUBGROUP */
mKernel = runtime->buildKernel("cast_buf", "cast_buf", mBuildOptions);
mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
std::vector<int> inputShape = tensorShapeFormat(input);
std::vector<int> outputShape = tensorShapeFormat(output);
int batch = outputShape.at(0);
int outputHeight = outputShape.at(1);
int outputWidth = outputShape.at(2);
int channels = outputShape.at(3);
int channelBlocks = (channels + 3) / 4;
mGlobalWorkSize = {
static_cast<uint32_t>(outputWidth),
static_cast<uint32_t>(outputHeight),
static_cast<uint32_t>(batch * channelBlocks),
};
uint32_t idx = 0;
cl_int ret = CL_SUCCESS;
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
ret |= mKernel.setArg(idx++, mGlobalWorkSize[2]);
ret |= mKernel.setArg(idx++, openCLBuffer(input));
ret |= mKernel.setArg(idx++, openCLBuffer(output));
ret |= mKernel.setArg(idx++, outputWidth);
ret |= mKernel.setArg(idx++, outputHeight);
ret |= mKernel.setArg(idx++, channelBlocks);
MNN_CHECK_CL_SUCCESS(ret, "setArg CastBufExecution");
std::string kernelName = "cast_buf";
mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
return NO_ERROR;
}
ErrorCode CastBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
#ifdef LOG_VERBOSE
MNN_PRINT("start CastBufExecution onExecute...");
#endif
auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());
#ifdef ENABLE_OPENCL_TIME_PROFILER
cl::Event event;
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
mOpenCLBackend->getOpenCLRuntime(), &event);
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
MNN_PRINT("kernel cost:%d us Cast\n",costTime);
#else
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
mOpenCLBackend->getOpenCLRuntime());
#endif
#ifdef LOG_VERBOSE
MNN_PRINT("end CastBufExecution onExecute...");
#endif
return NO_ERROR;
}
static DataType _mapDataType(DataType src) {
if (DataType_DT_BOOL == src) {
return DataType_DT_INT32;
}
if (DataType_DT_INT64 == src) {
return DataType_DT_INT32;
}
if (DataType_DT_DOUBLE == src) {
return DataType_DT_FLOAT;
}
return src;
}
class CastBufCreator : public OpenCLBackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
for (int i = 0; i < inputs.size(); ++i) {
TensorUtils::setTensorSupportPack(inputs[i], false);
}
for (int i = 0; i < outputs.size(); ++i) {
TensorUtils::setTensorSupportPack(outputs[i], false);
}
auto cast = op->main_as_CastParam();
// cast param srcT is invalid
// auto srcT = _mapDataType(cast->srcT());
auto dstT = _mapDataType(cast->dstT());
const auto &inputDataType = inputs[0]->getType();
if (inputDataType.bytes() == 4 && cast->dstT() == MNN::DataType_DT_BOOL) {
return new CastBufExecution("-DTO_BOOL", backend);
}
if (inputs[0]->buffer().type == outputs[0]->buffer().type) {
return new CastBufExecution("", backend);
}
if (dstT == MNN::DataType_DT_INT32 && halide_type_of<float>() == inputDataType) {
return new CastBufExecution("", backend);
}
if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int32_t>() == inputDataType) {
return new CastBufExecution("", backend);
}
if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<uint8_t>() == inputDataType) {
return new CastBufExecution("", backend);
}
if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int8_t>() == inputDataType) {
return new CastBufExecution("", backend);
}
if (dstT == MNN::DataType_DT_INT8 && halide_type_of<float>() == inputDataType) {
return new CastBufExecution("", backend);
}
if (dstT == MNN::DataType_DT_UINT8 && halide_type_of<float>() == inputDataType) {
return new CastBufExecution("", backend);
}
if (dstT == MNN::DataType_DT_UINT8 && halide_type_of<int32_t>() == inputDataType) {
return new CastBufExecution("", backend);
}
if (dstT == MNN::DataType_DT_INT32 && halide_type_of<uint8_t>() == inputDataType) {
return new CastBufExecution("", backend);
}
if (dstT == MNN::DataType_DT_INT32 && halide_type_of<int8_t>() == inputDataType) {
return new CastBufExecution("", backend);
}
MNN_PRINT("Don't support cast form %d, %d to %d\n", inputDataType.code, inputDataType.bits, cast->dstT());
return nullptr;
}
};
OpenCLCreatorRegister<CastBufCreator> __CastBuf__(OpType_Cast, BUFFER);
} // namespace OpenCL
} // namespace MNN
#endif /* MNN_OPENCL_BUFFER_CLOSED */
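
_mapDataType above narrows source-side types to what the buffer kernels can actually store; a few concrete mappings (the rationales in the comments are assumptions drawn from the kernel set visible in this diff):

    #include <cassert>
    // Concrete behavior of _mapDataType (sketch):
    //   DataType_DT_BOOL   -> DataType_DT_INT32   no 1-byte bool kernels
    //   DataType_DT_INT64  -> DataType_DT_INT32   no 64-bit integer kernels
    //   DataType_DT_DOUBLE -> DataType_DT_FLOAT   no fp64 kernels
    //   anything else      -> unchanged
    assert(_mapDataType(DataType_DT_BOOL)   == DataType_DT_INT32);
    assert(_mapDataType(DataType_DT_DOUBLE) == DataType_DT_FLOAT);
    assert(_mapDataType(DataType_DT_FLOAT)  == DataType_DT_FLOAT);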

View File

@ -0,0 +1,42 @@
//
// CastBufExecution.hpp
// MNN
//
// Created by MNN on 2023/08/11.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_OPENCL_BUFFER_CLOSED
#ifndef CastBufExecution_hpp
#define CastBufExecution_hpp
#include "core/Execution.hpp"
#include <vector>
#include "MNN_generated.h"
#include "backend/opencl/core/OpenCLBackend.hpp"
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
namespace MNN {
namespace OpenCL {
class CastBufExecution : public Execution {
public:
CastBufExecution(const std::string &compute, Backend *backend);
virtual ~CastBufExecution() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
cl::Kernel mKernel;
uint32_t mMaxWorkGroupSize;
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
std::vector<uint32_t> mLocalSize = {1, 1, 1};
std::set<std::string> mBuildOptions;
};
} // namespace OpenCL
} // namespace MNN
#endif /* CastBufExecution_hpp */
#endif/* MNN_OPENCL_BUFFER_CLOSED */

View File

@ -0,0 +1,110 @@
//
// RangeBufExecution.cpp
// MNN
//
// Created by MNN on 2023/08/11.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_OPENCL_BUFFER_CLOSED
#include "backend/opencl/execution/buffer/RangeBufExecution.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "backend/opencl/core/OpenCLBackend.hpp"
namespace MNN {
namespace OpenCL {
RangeBufExecution::RangeBufExecution(const std::string &compute, Backend* backend) : Execution(backend) {
mBuildOptions.emplace(compute);
// Do nothing
}
ErrorCode RangeBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
auto runtime = openCLBackend->getOpenCLRuntime();
mKernel = runtime->buildKernel("range_buf", "range_buf", mBuildOptions);
mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
std::vector<int> outputShape = tensorShapeFormat(outputs[0]);
int batch = outputShape.at(0);
int outputHeight = outputShape.at(1);
int outputWidth = outputShape.at(2);
int channels = outputShape.at(3);
int channelBlocks = (channels + 3) / 4;
mGlobalWorkSize = {
static_cast<uint32_t>(outputWidth),
static_cast<uint32_t>(outputHeight),
static_cast<uint32_t>(batch * channelBlocks)
};
uint32_t idx = 0;
cl_int ret = CL_SUCCESS;
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
ret |= mKernel.setArg(idx++, mGlobalWorkSize[2]);
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[0]));
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[2]));
ret |= mKernel.setArg(idx++, openCLBuffer(outputs[0]));
ret |= mKernel.setArg(idx++, outputWidth);
ret |= mKernel.setArg(idx++, outputHeight);
ret |= mKernel.setArg(idx++, channels);
ret |= mKernel.setArg(idx++, channelBlocks);
MNN_CHECK_CL_SUCCESS(ret, "setArg RangeBufExecution");
std::string kernelName = "range_buf";
mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
return NO_ERROR;
}
ErrorCode RangeBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
#ifdef LOG_VERBOSE
MNN_PRINT("start RangeBufExecution onExecute...");
#endif
auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());
#ifdef ENABLE_OPENCL_TIME_PROFILER
cl::Event event;
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
mOpenCLBackend->getOpenCLRuntime(), &event);
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
MNN_PRINT("kernel cost:%d us Range\n",costTime);
#else
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
mOpenCLBackend->getOpenCLRuntime());
#endif
#ifdef LOG_VERBOSE
MNN_PRINT("end RangeBufExecution onExecute...");
#endif
return NO_ERROR;
}
class RangeBufCreator : public OpenCLBackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
for (int i = 0; i < inputs.size(); ++i) {
TensorUtils::setTensorSupportPack(inputs[i], false);
}
for (int i = 0; i < outputs.size(); ++i) {
TensorUtils::setTensorSupportPack(outputs[i], false);
}
auto code = inputs[0]->getType().code;
switch (code) {
case halide_type_int:
return new RangeBufExecution("-DUSE_INT", backend);
case halide_type_float:
return new RangeBufExecution("-DUSE_FLOAT", backend);
default:
return nullptr;
}
}
};
OpenCLCreatorRegister<RangeBufCreator> __RangeBuf__(OpType_Range, BUFFER);
} // namespace OpenCL
} // namespace MNN
#endif /* MNN_OPENCL_BUFFER_CLOSED */
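
Note that the kernel above binds inputs[0] and inputs[2] but skips inputs[1]: for Range the three inputs are start, limit, and delta, and once the output shape is resolved the element count already encodes the limit, so it never reaches the GPU. The per-element semantics, as a CPU reference (a sketch, not part of the kernel):

    // output[i] = start + i * delta, for i in [0, elementCount)
    template <typename T>
    T rangeAt(T start, T delta, int i) {
        return start + static_cast<T>(i) * delta;
    }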

View File

@ -0,0 +1,42 @@
//
// RangeBufExecution.hpp
// MNN
//
// Created by MNN on 2023/08/11.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_OPENCL_BUFFER_CLOSED
#ifndef RangeBufExecution_hpp
#define RangeBufExecution_hpp
#include "core/Execution.hpp"
#include <vector>
#include "MNN_generated.h"
#include "backend/opencl/core/OpenCLBackend.hpp"
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
namespace MNN {
namespace OpenCL {
class RangeBufExecution : public Execution {
public:
RangeBufExecution(const std::string &compute, Backend *backend);
virtual ~RangeBufExecution() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
cl::Kernel mKernel;
uint32_t mMaxWorkGroupSize;
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
std::vector<uint32_t> mLocalSize = {1, 1, 1};
std::set<std::string> mBuildOptions;
};
} // namespace OpenCL
} // namespace MNN
#endif /* RangeBufExecution_hpp */
#endif/* MNN_OPENCL_BUFFER_CLOSED */

View File

@ -20,12 +20,7 @@ ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend
MNN_PRINT("start ReductionBufExecution init !\n"); MNN_PRINT("start ReductionBufExecution init !\n");
#endif #endif
mOpenCLBackend = static_cast<OpenCLBackend *>(backend); mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
auto reduct = op->main_as_ReductionParam(); mAxis = op->main_as_ReductionParam()->dim()->data()[0];
if (nullptr != reduct->dim()) {
for (int i = 0; i < reduct->dim()->size(); ++i) {
mAxis.push_back(reduct->dim()->data()[i]);
}
}
switch (op->main_as_ReductionParam()->operation()) { switch (op->main_as_ReductionParam()->operation()) {
case ReductionType_MEAN: case ReductionType_MEAN:
mReductType = 0; mReductType = 0;
@ -51,44 +46,129 @@ ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend
#endif #endif
} }
int ReductionBufExecution::getLocalSize(int size, int maxGroupSize){
int local_size = 1;
while(local_size * 2 <= maxGroupSize && local_size * 2 <= size){
local_size *= 2;
}
return local_size;
}
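For reference, a minimal standalone sketch of what this new helper computes: the largest power of two that exceeds neither the reduction length nor the device's work-group limit. The sample values below are illustrative, not taken from the diff.

```cpp
#include <cstdio>

// Mirrors ReductionBufExecution::getLocalSize: the largest power of two
// that is <= both `size` and `maxGroupSize`.
static int getLocalSize(int size, int maxGroupSize) {
    int localSize = 1;
    while (localSize * 2 <= maxGroupSize && localSize * 2 <= size) {
        localSize *= 2;
    }
    return localSize;
}

int main() {
    printf("%d\n", getLocalSize(65, 256));   // 64: capped by the reduction length
    printf("%d\n", getLocalSize(4096, 256)); // 256: capped by the work-group limit
    return 0;
}
```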
ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) { ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(mAxis.size() == 1);
MNN_ASSERT(mAxis[0] == 1);
auto runtime = mOpenCLBackend->getOpenCLRuntime(); auto openCLBackend = static_cast<OpenCLBackend*>(backend());
auto runtime = openCLBackend->getOpenCLRuntime();
auto input = inputs[0]; auto input = inputs[0];
auto output = outputs[0]; auto output = outputs[0];
std::vector<int> inputShape = tensorShapeFormat(input); if(mAxis < 0){
//N=outside H=axis W=inside C=1 mAxis = input->dimensions() + mAxis;
}
mGlobalWorkSize = {static_cast<uint32_t>(inputShape[0]), static_cast<uint32_t>(inputShape[2])}; int inside = 1;
mLocalWorkSize = {1, 1, 1}; int outside = 1;
for(int i = 0; i < mAxis; ++i){
outside *= input->length(i);
}
for(int i = mAxis + 1; i < input->dimensions(); ++i){
inside *= input->length(i);
}
int dim = input->length(mAxis);
int local_size = 0;
auto MaxWorkItems = runtime->getMaxWorkItemSizes();
if(dim >= 16){
mUseLocal = true;
}
std::vector<int> inputShape = tensorShapeFormat(input);
std::vector<int> outputShape = tensorShapeFormat(output);
int batch = inputShape.at(0);
int inputHeight = inputShape.at(1);
int inputWidth = inputShape.at(2);
int inputChannels = inputShape.at(3);
int inputChannelBlocks = (inputChannels + 3) / 4;
int outputBatch = outputShape.at(0);
int outputHeight = outputShape.at(1);
int outputWidth = outputShape.at(2);
int outputChannels = outputShape.at(3);
int outputChannelBlocks = (outputChannels + 3) / 4;
std::set<std::string> buildOption; std::set<std::string> buildOption;
switch (mReductType) { switch (mReductType) {
case 0: case 0:
buildOption.emplace("-DOPERATE(a,b)=(a+b)"); buildOption.emplace("-DOPERATE(a,b)=(a+b)");
buildOption.emplace("-DGET_AVG"); buildOption.emplace("-DGET_AVG");
buildOption.emplace("-DVALUE=0");
break; break;
case 1: case 1:
buildOption.emplace("-DOPERATE(a,b)=max(a,b)"); buildOption.emplace("-DOPERATE(a,b)=max(a,b)");
buildOption.emplace("-DVALUE=-FLT_MAX");
break; break;
case 2: case 2:
buildOption.emplace("-DOPERATE(a,b)=min(a,b)"); buildOption.emplace("-DOPERATE(a,b)=min(a,b)");
buildOption.emplace("-DVALUE=FLT_MAX");
break; break;
case 3: case 3:
buildOption.emplace("-DOPERATE(a,b)=(a*b)"); buildOption.emplace("-DOPERATE(a,b)=(a*b)");
buildOption.emplace("-DVALUE=1");
break; break;
case 4: case 4:
buildOption.emplace("-DOPERATE(a,b)=(a+b)"); buildOption.emplace("-DOPERATE(a,b)=(a+b)");
buildOption.emplace("-DVALUE=0");
break; break;
default: default:
MNN_ASSERT(false); MNN_ASSERT(false);
break; break;
} }
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_buf", buildOption);
mGlobalWorkSize = {
static_cast<uint32_t>(outputWidth),
static_cast<uint32_t>(outputHeight),
static_cast<uint32_t>(outputBatch * outputChannelBlocks)
};
if(mUseLocal){
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
local_size = getLocalSize(inputWidth, MaxWorkItems[0]);
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_width_buf", buildOption);
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
local_size = getLocalSize(inputHeight, MaxWorkItems[0]);
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_height_buf", buildOption);
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
local_size = getLocalSize(inputChannelBlocks - 1, MaxWorkItems[0]);
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
if(output->buffer().dimensions == 1){
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_dim1_buf", buildOption);
}else{
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_buf", buildOption);
}
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
local_size = getLocalSize(batch, MaxWorkItems[0]);
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_batch_buf", buildOption);
}
mGlobalWorkSize[0] *= local_size;
}else{
buildOption.emplace("-DLOCAL_SIZE=0");
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_width_buf", buildOption);
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_height_buf", buildOption);
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
if(output->buffer().dimensions == 1){
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_dim1_buf", buildOption);
}else{
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_buf", buildOption);
}
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_batch_buf", buildOption);
}
}
//printf("reduce axis:%d , %d %d %d %d, useLocal:%d\n", mAxis[0], inputShape[0], inputShape[1], inputShape[2], inputShape[3], mUseLocal); //printf("reduce axis:%d , %d %d %d %d, useLocal:%d\n", mAxis[0], inputShape[0], inputShape[1], inputShape[2], inputShape[3], mUseLocal);
mUnits.resize(1); mUnits.resize(1);
@ -96,14 +176,27 @@ ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, c
cl_int ret = CL_SUCCESS; cl_int ret = CL_SUCCESS;
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[0]); ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[0]);
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[1]); ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[1]);
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[2]);
ret |= mReduct1DKernel.setArg(idx++, openCLBuffer(input)); ret |= mReduct1DKernel.setArg(idx++, openCLBuffer(input));
ret |= mReduct1DKernel.setArg(idx++, openCLBuffer(output)); ret |= mReduct1DKernel.setArg(idx++, openCLBuffer(output));
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[0])); ret |= mReduct1DKernel.setArg(idx++, inputWidth);
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[1])); ret |= mReduct1DKernel.setArg(idx++, inputHeight);
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[2])); ret |= mReduct1DKernel.setArg(idx++, inputChannels);
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[3])); ret |= mReduct1DKernel.setArg(idx++, batch);
ret |= mReduct1DKernel.setArg(idx++, inputChannelBlocks);
ret |= mReduct1DKernel.setArg(idx++, outputWidth);
ret |= mReduct1DKernel.setArg(idx++, outputHeight);
ret |= mReduct1DKernel.setArg(idx++, outputChannels);
ret |= mReduct1DKernel.setArg(idx++, outputChannelBlocks);
MNN_CHECK_CL_SUCCESS(ret, "setArg ReductionBufExecution"); MNN_CHECK_CL_SUCCESS(ret, "setArg ReductionBufExecution");
if(mUseLocal){
mLocalWorkSize = {static_cast<uint32_t>(local_size), 1, 1};
}else{
auto MaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mReduct1DKernel));
std::string kernelName = "reduct_buf";
mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, MaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mReduct1DKernel).first;
}
return NO_ERROR; return NO_ERROR;
} }
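The resize logic above collapses an arbitrary reduction axis into three extents: `outside` (product of dimensions before the axis), `dim` (the reduced axis itself), and `inside` (product of dimensions after it), and then matches those extents against the width/height/channel/batch kernel variants. A standalone sketch of that decomposition, assuming a plain `std::vector<int>` shape rather than MNN's `Tensor` API:

```cpp
#include <cstdio>
#include <vector>

// Collapse a shape around a (possibly negative) reduction axis into
// outside * dim * inside, the quantities onResize matches against the
// width/height/channel/batch kernel variants.
struct Collapsed { int outside, dim, inside; };

static Collapsed collapse(std::vector<int> shape, int axis) {
    if (axis < 0) {
        axis += (int)shape.size(); // same fixup as `mAxis = dimensions + mAxis`
    }
    Collapsed c{1, shape[axis], 1};
    for (int i = 0; i < axis; ++i) {
        c.outside *= shape[i];
    }
    for (int i = axis + 1; i < (int)shape.size(); ++i) {
        c.inside *= shape[i];
    }
    return c;
}

int main() {
    // NCHW shape {2, 8, 4, 5}, reducing over H (axis 2):
    Collapsed c = collapse({2, 8, 4, 5}, 2);
    printf("outside=%d dim=%d inside=%d\n", c.outside, c.dim, c.inside); // 16 4 5
    return 0;
}
```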
@ -114,12 +207,12 @@ ErrorCode ReductionBufExecution::onExecute(const std::vector<Tensor *> &inputs,
#ifdef ENABLE_OPENCL_TIME_PROFILER #ifdef ENABLE_OPENCL_TIME_PROFILER
cl::Event event; cl::Event event;
runKernel2D(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
mOpenCLBackend->getOpenCLRuntime(), &event); mOpenCLBackend->getOpenCLRuntime(), &event);
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event); int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
MNN_PRINT("kernel cost:%d us Reduct1D\n",costTime); MNN_PRINT("kernel cost:%d us Reduct1D\n",costTime);
#else #else
runKernel2D(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
mOpenCLBackend->getOpenCLRuntime()); mOpenCLBackend->getOpenCLRuntime());
#endif #endif
@ -140,33 +233,31 @@ public:
for (int i = 0; i < outputs.size(); ++i) { for (int i = 0; i < outputs.size(); ++i) {
TensorUtils::setTensorSupportPack(outputs[i], false); TensorUtils::setTensorSupportPack(outputs[i], false);
} }
if (inputs[0]->getDimensionType() == Tensor::TENSORFLOW) {
auto openCLBackend = static_cast<OpenCLBackend *>(backend); auto openCLBackend = static_cast<OpenCLBackend *>(backend);
auto reduct = op->main_as_ReductionParam(); auto reduct = op->main_as_ReductionParam();
if (nullptr == reduct->dim()) { if (nullptr == reduct->dim()) {
return NULL; return NULL;
}
if(reduct->dim()->size() != 1) {
return NULL;
}
switch (op->main_as_ReductionParam()->operation()) {
case ReductionType_MEAN:
break;
case ReductionType_MAXIMUM:
break;
case ReductionType_MINIMUM:
break;
case ReductionType_PROD:
break;
case ReductionType_SUM:
break;
default:
return NULL;
break;
}
return new ReductionBufExecution(op, backend);
} }
return NULL; if(reduct->dim()->size() != 1) {
return NULL;
}
switch (op->main_as_ReductionParam()->operation()) {
case ReductionType_MEAN:
break;
case ReductionType_MAXIMUM:
break;
case ReductionType_MINIMUM:
break;
case ReductionType_PROD:
break;
case ReductionType_SUM:
break;
default:
return NULL;
break;
}
return new ReductionBufExecution(op, backend);
} }
}; };

View File

@ -30,12 +30,13 @@ public:
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override; virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override; virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private: private:
int getLocalSize(int size, int maxGroupSize);
cl::Kernel mReduct1DKernel; cl::Kernel mReduct1DKernel;
std::string mKernelName; std::string mKernelName;
OpenCLBackend *mOpenCLBackend; OpenCLBackend *mOpenCLBackend;
MNN::DataType mdataType; MNN::DataType mdataType;
int mReductType; int mReductType;
std::vector<int> mAxis; int mAxis;
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1}; std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
std::vector<uint32_t> mLocalWorkSize{1, 1, 1}; std::vector<uint32_t> mLocalWorkSize{1, 1, 1};
bool mUseLocal = false; bool mUseLocal = false;

View File

@ -0,0 +1,103 @@
//
// SelectBufExecution.cpp
// MNN
//
// Created by MNN on 2023/08/11.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_OPENCL_BUFFER_CLOSED
#include "backend/opencl/execution/buffer/SelectBufExecution.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "backend/opencl/core/OpenCLBackend.hpp"
namespace MNN {
namespace OpenCL {
SelectBufExecution::SelectBufExecution(Backend* backend) : Execution(backend) {
// Do nothing
}
ErrorCode SelectBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto inSize1 = inputs[1]->elementSize();
auto inSize2 = inputs[2]->elementSize();
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
auto runtime = openCLBackend->getOpenCLRuntime();
if(inSize1 == 1)
mBuildOptions.emplace("-DINSIZE1_EUQAL_1");
if(inSize2 == 1)
mBuildOptions.emplace("-DINSIZE2_EUQAL_1");
mKernel = runtime->buildKernel("select_buf", "select_buf", mBuildOptions);
mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
std::vector<int> outputShape = tensorShapeFormat(outputs[0]);
int batch = outputShape.at(0);
int outputHeight = outputShape.at(1);
int outputWidth = outputShape.at(2);
int channels = outputShape.at(3);
int channelBlocks = (channels + 3) / 4;
int outSize = batch * channelBlocks * outputWidth * outputHeight * 4;
mGlobalWorkSize = {
static_cast<uint32_t>(outSize),
1
};
uint32_t idx = 0;
cl_int ret = CL_SUCCESS;
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[0]));
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[1]));
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[2]));
ret |= mKernel.setArg(idx++, openCLBuffer(outputs[0]));
MNN_CHECK_CL_SUCCESS(ret, "setArg SelectBufExecution");
std::string kernelName = "select_buf";
mLocalSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
return NO_ERROR;
}
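Note that `outSize` above counts elements of the packed NC4HW4 layout (`channelBlocks * 4` lanes, including any channel padding), so the kernel visits every packed scalar. A back-of-the-envelope check with hypothetical shape values (not from the diff):

```cpp
#include <cstdio>

int main() {
    // Hypothetical output shape: N=2, H=3, W=5, C=6.
    const int batch = 2, outputHeight = 3, outputWidth = 5, channels = 6;
    const int channelBlocks = (channels + 3) / 4; // 2 blocks of 4 lanes
    const int outSize = batch * channelBlocks * outputWidth * outputHeight * 4;
    // 240 packed scalars vs. 180 logical ones: the padded lanes are processed too.
    printf("packed=%d logical=%d\n", outSize,
           batch * channels * outputWidth * outputHeight);
    return 0;
}
```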
ErrorCode SelectBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
#ifdef LOG_VERBOSE
MNN_PRINT("start SelectBufExecution onExecute...");
#endif
auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());
#ifdef ENABLE_OPENCL_TIME_PROFILER
cl::Event event;
runKernel2D(mKernel, mGlobalWorkSize, mLocalSize,
mOpenCLBackend->getOpenCLRuntime(), &event);
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
MNN_PRINT("kernel cost:%d us Select\n",costTime);
#else
runKernel2D(mKernel, mGlobalWorkSize, mLocalSize,
mOpenCLBackend->getOpenCLRuntime());
#endif
#ifdef LOG_VERBOSE
MNN_PRINT("end SelectBufExecution onExecute...");
#endif
return NO_ERROR;
}
class SelectBufCreator : public OpenCLBackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
for (int i = 0; i < inputs.size(); ++i) {
TensorUtils::setTensorSupportPack(inputs[i], false);
}
for (int i = 0; i < outputs.size(); ++i) {
TensorUtils::setTensorSupportPack(outputs[i], false);
}
return new SelectBufExecution(backend);
}
};
OpenCLCreatorRegister<SelectBufCreator> __SelectBuf__(OpType_Select, BUFFER);
} // namespace OpenCL
} // namespace MNN
#endif /* MNN_OPENCL_BUFFER_CLOSED */

View File

@ -0,0 +1,42 @@
//
// SelectBufExecution.hpp
// MNN
//
// Created by MNN on 2023/08/11.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_OPENCL_BUFFER_CLOSED
#ifndef SelectBufExecution_hpp
#define SelectBufExecution_hpp
#include "core/Execution.hpp"
#include <vector>
#include "MNN_generated.h"
#include "backend/opencl/core/OpenCLBackend.hpp"
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
namespace MNN {
namespace OpenCL {
class SelectBufExecution : public Execution {
public:
SelectBufExecution(Backend *backend);
virtual ~SelectBufExecution() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
cl::Kernel mKernel;
uint32_t mMaxWorkGroupSize;
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
std::vector<uint32_t> mLocalSize = {1, 1, 1};
std::set<std::string> mBuildOptions;
};
} // namespace OpenCL
} // namespace MNN
#endif /* SelectBufExecution_hpp */
#endif /* MNN_OPENCL_BUFFER_CLOSED */

View File

@ -19,7 +19,6 @@ SoftmaxBufExecution::SoftmaxBufExecution(const std::vector<Tensor *> &inputs, in
: Execution(backend) { : Execution(backend) {
mAxis = axis; mAxis = axis;
mOpenCLBackend = static_cast<OpenCLBackend *>(backend); mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
buildSoftmaxKernel();
} }
bool SoftmaxBufExecution::buildSoftmaxKernel() { bool SoftmaxBufExecution::buildSoftmaxKernel() {
@ -43,10 +42,27 @@ bool SoftmaxBufExecution::buildSoftmaxKernel() {
ErrorCode SoftmaxBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) { ErrorCode SoftmaxBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
Tensor *input = inputs[0]; Tensor *input = inputs[0];
Tensor *output = outputs[0]; Tensor *output = outputs[0];
const auto dims = input->buffer().dimensions;
int inside = 1;
int outside = 1;
int channel = 1;
for (int i = 0; i < mAxis; ++i) {
outside *= input->length(i);
}
channel = input->length(mAxis);
for (int i = mAxis + 1; i < dims; ++i) {
inside *= input->length(i);
}
std::vector<int> inputShape = tensorShapeFormat(input); std::vector<int> inputShape = tensorShapeFormat(input);
std::vector<int> outputShape = tensorShapeFormat(output); std::vector<int> outputShape = tensorShapeFormat(output);
const int inputBatch = inputShape.at(0);
const int inputHeight = inputShape.at(1);
const int inputWidth = inputShape.at(2);
const int inputChannels = inputShape.at(3);
const int outputBatch = outputShape.at(0); const int outputBatch = outputShape.at(0);
const int outputHeight = outputShape.at(1); const int outputHeight = outputShape.at(1);
const int outputWidth = outputShape.at(2); const int outputWidth = outputShape.at(2);
@ -54,9 +70,18 @@ ErrorCode SoftmaxBufExecution::onResize(const std::vector<Tensor *> &inputs, con
const int channelBlocks = UP_DIV(outputChannels, 4); const int channelBlocks = UP_DIV(outputChannels, 4);
const int remainChannels = channelBlocks * 4 - outputChannels; const int remainChannels = channelBlocks * 4 - outputChannels;
if(inputBatch == outside && channel == inputChannels && inside == inputWidth * inputHeight){
mAxis = 1;
}else if(inputBatch * inputChannels == outside && channel == inputHeight && inside == inputWidth){
mAxis = 2;
}else if(inputBatch * inputChannels * inputHeight == outside && channel == inputWidth && inside == 1){
mAxis = 3;
}
buildSoftmaxKernel();
if (mAxis == 1) { if (mAxis == 1) {
mGlobalWorkSize = {static_cast<uint32_t>(channelBlocks), static_cast<uint32_t>(outputWidth), mGlobalWorkSize = {static_cast<uint32_t>(outputWidth),
static_cast<uint32_t>(outputHeight * outputBatch)}; static_cast<uint32_t>(outputHeight * outputBatch), 1};
int shape[] = {outputBatch, channelBlocks, outputHeight, outputWidth}; int shape[] = {outputBatch, channelBlocks, outputHeight, outputWidth};
uint32_t idx = 0; uint32_t idx = 0;
@ -132,10 +157,6 @@ class SoftmaxBufCreator : public OpenCLBackend::Creator {
public: public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const MNN::Op *op, Backend *backend) const override { const MNN::Op *op, Backend *backend) const override {
if(inputs[0]->dimensions() == 3 || outputs[0]->dimensions() == 3){
MNN_PRINT("softmax not support dimensions == 3 \n");
return nullptr;
}
for (int i = 0; i < inputs.size(); ++i) { for (int i = 0; i < inputs.size(); ++i) {
TensorUtils::setTensorSupportPack(inputs[i], false); TensorUtils::setTensorSupportPack(inputs[i], false);
} }

View File

@ -0,0 +1,254 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define GLOBAL_SIZE_3_DIMS \
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
return; \
}
__kernel void argmax_width_buf(GLOBAL_SIZE_3_DIMS
__global const FLOAT* input,
__global FLOAT* output,
__private const int inputWidth,
__private const int inputHeight,
__private const int inputChannel,
__private const int inputBatch,
__private const int inputChannelBlock,
__private const int outputWidth,
__private const int outputHeight,
__private const int outputChannel,
__private const int outputChannelBlock
) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int batch_channel_idx = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
const int batch_idx = batch_channel_idx / outputChannelBlock;
const int channel_idx = batch_channel_idx % outputChannelBlock;
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + 0)*4;
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * outputWidth + 0)*4;
int4 index = 0;
FLOAT4 maxValue = vload4(0, input + offset);
for(int i = 1; i < inputWidth; ++i){
FLOAT4 value = vload4(i, input + offset);
#ifdef ARGMAX
index = maxValue < value ? (int4)i : index;
maxValue = fmax(maxValue, value);
#else
index = maxValue > value ? (int4)i : index;
maxValue = fmin(maxValue, value);
#endif
}
vstore4(CONVERT_FLOAT4(index), 0, output + outputOffset);
}
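The ternary on vector types above is a component-wise select, so each of the four packed channels tracks its own running index; because the comparison is strict, ties keep the earliest index. A scalar C++ analogue of one lane under the `ARGMAX` branch:

```cpp
#include <cstdio>

int main() {
    // One lane of argmax_width_buf: scan a row, keep the index of the maximum.
    const float row[5] = {0.5f, 2.0f, -1.0f, 2.0f, 0.25f};
    int index = 0;
    float maxValue = row[0];
    for (int i = 1; i < 5; ++i) {
        // `index = maxValue < value ? i : index` followed by fmax: a strictly
        // greater value updates the index, so ties keep the first occurrence.
        if (maxValue < row[i]) {
            index = i;
            maxValue = row[i];
        }
    }
    printf("argmax=%d max=%g\n", index, maxValue); // argmax=1 max=2
    return 0;
}
```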
__kernel void argmax_height_buf(GLOBAL_SIZE_3_DIMS
__global const FLOAT* input,
__global FLOAT* output,
__private const int inputWidth,
__private const int inputHeight,
__private const int inputChannel,
__private const int inputBatch,
__private const int inputChannelBlock,
__private const int outputWidth,
__private const int outputHeight,
__private const int outputChannel,
__private const int outputChannelBlock
) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int batch_channel_idx = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
const int batch_idx = batch_channel_idx / outputChannelBlock;
const int channel_idx = batch_channel_idx % outputChannelBlock;
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4;
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * outputWidth + width_idx)*4;
int4 index = 0;
FLOAT4 maxValue = vload4(0, input + offset);
for(int i = 1; i < inputHeight; ++i){
FLOAT4 value = vload4(i * inputWidth, input + offset);
#ifdef ARGMAX
index = maxValue < value ? (int4)i : index;
maxValue = fmax(maxValue, value);
#else
index = maxValue > value ? (int4)i : index;
maxValue = fmin(maxValue, value);
#endif
}
vstore4(CONVERT_FLOAT4(index), 0, output + outputOffset);
}
__kernel void argmax_channel_buf(GLOBAL_SIZE_3_DIMS
__global const FLOAT* input,
__global FLOAT* output,
__private const int inputWidth,
__private const int inputHeight,
__private const int inputChannel,
__private const int inputBatch,
__private const int inputChannelBlock,
__private const int outputWidth,
__private const int outputHeight,
__private const int outputChannel,
__private const int outputChannelBlock
) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int batch_idx = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
const int outputOffset = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * outputWidth + width_idx)*4;
int index = 0;
int remain = inputChannel - (inputChannelBlock - 1) * 4;
#ifdef ARGMAX
FLOAT maxValue = (FLOAT)-FLT_MAX;
#else
FLOAT maxValue = (FLOAT)FLT_MAX;
#endif
FLOAT4 value;
FLOAT *valuePtr = (FLOAT*)&value;
for(int i = 0; i < inputChannelBlock - 1; ++i){
value = vload4(i * inputWidth * inputHeight, input + offset);
for(int j = 0; j < 4; ++j){
#ifdef ARGMAX
if(maxValue < valuePtr[j]){
index = i * 4 + j;
maxValue = valuePtr[j];
}
#else
if(maxValue > valuePtr[j]){
index = i * 4 + j;
maxValue = valuePtr[j];
}
#endif
}
}
value = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
for(int j = 0; j < remain; ++j){
#ifdef ARGMAX
if(maxValue < valuePtr[j]){
index = (inputChannelBlock - 1) * 4 + j;
maxValue = valuePtr[j];
}
#else
if(maxValue > valuePtr[j]){
index = (inputChannelBlock - 1) * 4 + j;
maxValue = valuePtr[j];
}
#endif
}
output[outputOffset] = (FLOAT)index;
}
__kernel void argmax_channel_dim1_buf(GLOBAL_SIZE_3_DIMS
__global const FLOAT* input,
__global FLOAT* output,
__private const int inputWidth,
__private const int inputHeight,
__private const int inputChannel,
__private const int inputBatch,
__private const int inputChannelBlock,
__private const int outputWidth,
__private const int outputHeight,
__private const int outputChannel,
__private const int outputChannelBlock
) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int batch_idx = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
const int outputOffset = ((batch_idx * outputHeight + height_idx) * outputWidth + width_idx);
int index = 0;
int remain = inputChannel - (inputChannelBlock - 1) * 4;
#ifdef ARGMAX
FLOAT maxValue = (FLOAT)-FLT_MAX;
#else
FLOAT maxValue = (FLOAT)FLT_MAX;
#endif
FLOAT4 value;
FLOAT *valuePtr = (FLOAT*)&value;
for(int i = 0; i < inputChannelBlock - 1; ++i){
value = vload4(i * inputWidth * inputHeight, input + offset);
for(int j = 0; j < 4; ++j){
#ifdef ARGMAX
if(maxValue < valuePtr[j]){
index = i * 4 + j;
maxValue = valuePtr[j];
}
#else
if(maxValue > valuePtr[j]){
index = i * 4 + j;
maxValue = valuePtr[j];
}
#endif
}
}
value = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
for(int j = 0; j < remain; ++j){
#ifdef ARGMAX
if(maxValue < valuePtr[j]){
index = (inputChannelBlock - 1) * 4 + j;
maxValue = valuePtr[j];
}
#else
if(maxValue > valuePtr[j]){
index = (inputChannelBlock - 1) * 4 + j;
maxValue = valuePtr[j];
}
#endif
}
output[outputOffset] = (FLOAT)index;
}
__kernel void argmax_batch_buf(GLOBAL_SIZE_3_DIMS
__global const FLOAT* input,
__global FLOAT* output,
__private const int inputWidth,
__private const int inputHeight,
__private const int inputChannel,
__private const int inputBatch,
__private const int inputChannelBlock,
__private const int outputWidth,
__private const int outputHeight,
__private const int outputChannel,
__private const int outputChannelBlock
) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int channel_idx = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, channel_idx);
const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4;
const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * outputWidth + width_idx)*4;
int4 index = 0;
int batchOffset = inputChannelBlock * inputHeight * inputWidth;
FLOAT4 maxValue = vload4(0, input + offset);
for(int i = 1; i < inputBatch; ++i){
FLOAT4 value = vload4(i * batchOffset, input + offset);
#ifdef ARGMAX
index = maxValue < value ? (int4)i : index;
maxValue = fmax(maxValue, value);
#else
index = maxValue > value ? (int4)i : index;
maxValue = fmin(maxValue, value);
#endif
}
vstore4(CONVERT_FLOAT4(index), 0, output + outputOffset);
}

View File

@ -0,0 +1,38 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define GLOBAL_SIZE_3_DIMS \
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
return; \
}
__kernel void cast_buf(GLOBAL_SIZE_3_DIMS
__global FLOAT* input,
__global FLOAT* output,
__private const int width,
__private const int height,
__private const int channelBlock
) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int batch_channel_idx = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
const int batch_idx = batch_channel_idx / channelBlock;
const int channel_idx = batch_channel_idx % channelBlock;
const int inp_offset = ((((batch_idx * channelBlock) + channel_idx) * height + height_idx) * width + width_idx)*4;
#ifdef TO_BOOL
int4 value = convert_int4(vload4(0, input + inp_offset));
value = value == (int4)0 ? (int4)0 : (int4)1;
vstore4(CONVERT_FLOAT4(value), 0, output + inp_offset);
#else
FLOAT4 value = vload4(0, input + inp_offset);
vstore4(value, 0, output + inp_offset);
#endif
}
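A scalar analogue of the `TO_BOOL` branch above: the value is first truncated to int (so magnitudes below 1 become 0) and then mapped to 0 or 1; without `TO_BOOL` the kernel is a plain copy. The sample values are illustrative.

```cpp
#include <cstdio>

int main() {
    // TO_BOOL semantics: truncate to int first (so |x| < 1 becomes 0),
    // then map zero -> 0 and nonzero -> 1, stored back as float.
    const float in[4] = {0.0f, -2.5f, 0.5f, 7.0f};
    float out[4];
    for (int i = 0; i < 4; ++i) {
        out[i] = (float)(((int)in[i]) == 0 ? 0 : 1);
    }
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 0 1 0 1
    return 0;
}
```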

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,40 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define GLOBAL_SIZE_3_DIMS \
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
return; \
}
__kernel void range_buf(GLOBAL_SIZE_3_DIMS
__global const FLOAT* input0,
__global const FLOAT* input2,
__global FLOAT* output,
__private const int width,
__private const int height,
__private const int channel,
__private const int channelBlock
) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int batch_channel_idx = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
const int batch_idx = batch_channel_idx / channelBlock;
const int channel_idx = batch_channel_idx % channelBlock;
const int offset = ((((batch_idx * channelBlock) + channel_idx) * height + height_idx) * width + width_idx)*4;
const int channel4 = channel_idx << 2;
int index = (((batch_idx * channel) + channel4) * height + height_idx) * width + width_idx;
int size = height * width;
int4 index4 = (int4)(index, index + size, index + size * 2, index + size * 3);
FLOAT start = input0[0];
FLOAT step = input2[0];
FLOAT4 value = (FLOAT4)start + CONVERT_FLOAT4(index4) * (FLOAT4)step;
vstore4(value, 0, output + offset);
}
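The `index4` arithmetic above recovers each packed lane's linear NCHW position, where consecutive channels within a block sit `height * width` elements apart, so `start + index * step` yields the right arithmetic-progression value per lane. A host-side sketch of that mapping, with hypothetical extents and work-item coordinates:

```cpp
#include <cstdio>

int main() {
    // Hypothetical tensor extents and work-item coordinates (not from the diff).
    const float start = 1.0f, step = 0.5f;
    const int width = 3, height = 2, channel = 8;
    const int size = height * width;      // lane stride in the NCHW ordering
    const int batchIdx = 0, channelIdx = 1, heightIdx = 1, widthIdx = 2;

    // Linear NCHW index of the first of the four packed lanes.
    const int channel4 = channelIdx << 2; // channel block -> real channel
    const int index = ((batchIdx * channel + channel4) * height + heightIdx) * width + widthIdx;
    for (int lane = 0; lane < 4; ++lane) {
        const float value = start + (float)(index + lane * size) * step;
        printf("lane %d: index=%d value=%g\n", lane, index + lane * size, value);
    }
    return 0;
}
```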

View File

@ -11,308 +11,285 @@
#define GLOBAL_SIZE_2_DIMS \ #define GLOBAL_SIZE_2_DIMS \
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim0, __private const int global_size_dim1,
#define GLOBAL_SIZE_3_DIMS \
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
return; \
}
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void reduct_width(GLOBAL_SIZE_3_DIMS
__kernel void reduct_general_mean(GLOBAL_SIZE_2_DIMS
__read_only image2d_t input, __read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const int batch, __private const int inputWidth,
__private const int height, __private const int inputHeight,
__private const int width, __private const int inputChannel,
__private const int channel __private const int inputBatch,
__private const int inputChannelBlock,
__private const int outputWidth,
__private const int outputHeight,
__private const int outputChannel,
__private const int outputChannelBlock
) { ) {
const int batch_idx = get_global_id(0); const int width_idx = get_global_id(0);
const int width_idx = get_global_id(1); const int height_idx = get_global_id(1);
const int batch_channel_idx = get_global_id(2);
FLOAT4 sum = 0; DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
for (int h = 0; h < height; h++) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h)); const int batch_idx = batch_channel_idx / outputChannelBlock;
sum = sum + in; const int channel_idx = batch_channel_idx % outputChannelBlock;
} const int bh = batch_idx*inputHeight+height_idx;
FLOAT* sum_ptr = (FLOAT*)&sum; const int wc = channel_idx*inputWidth;
for(int i = 1; i < channel; ++i){ FLOAT4 out = (FLOAT4)VALUE;
sum.x += sum_ptr[i];
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x/(height*channel), 0.0, 0.0, 0.0));
}
__kernel void reduct_general_sum(GLOBAL_SIZE_2_DIMS
__read_only image2d_t input,
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
FLOAT4 sum = 0;
for (int h = 0; h < height; h++) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum = sum + in;
}
FLOAT* sum_ptr = (FLOAT*)&sum;
for(int i = 1; i < channel; ++i){
sum.x += sum_ptr[i];
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}
__kernel void reduct_general_max(GLOBAL_SIZE_2_DIMS
__read_only image2d_t input,
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
FLOAT4 sum = (FLOAT4)-MAXFLOAT;
for (int h = 0; h < height; h++) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum = max(sum, in);
}
FLOAT* sum_ptr = (FLOAT*)&sum;
for(int i = 1; i < channel; ++i){
sum.x = max(sum.x, sum_ptr[i]);
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}
__kernel void reduct_general_min(GLOBAL_SIZE_2_DIMS
__read_only image2d_t input,
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
FLOAT4 sum = (FLOAT4)MAXFLOAT;
for (int h = 0; h < height; h++) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum = min(sum, in);
}
FLOAT* sum_ptr = (FLOAT*)&sum;
for(int i = 1; i < channel; ++i){
sum.x = min(sum.x, sum_ptr[i]);
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}
__kernel void reduct_general_mul(GLOBAL_SIZE_2_DIMS
__read_only image2d_t input,
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(0);
const int width_idx = get_global_id(1);
FLOAT4 sum = (FLOAT4)1.0;
for (int h = 0; h < height; h++) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
sum = sum * in;
}
FLOAT* sum_ptr = (FLOAT*)&sum;
for(int i = 1; i < channel; ++i){
sum.x *= sum_ptr[i];
}
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
}
__kernel void reduct_general_mean_local(GLOBAL_SIZE_2_DIMS
__read_only image2d_t input,
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(1);
const int width_idx = get_global_id(2);
const int idx = get_local_id(0); #if LOCAL_SIZE > 0
FLOAT local sum[256]; const int lid = get_local_id(0);
FLOAT4 out = (FLOAT4)0.0; FLOAT4 local sum[LOCAL_SIZE];
const int reduce_num = get_local_size(0); for(int i = lid; i < inputWidth; i+=LOCAL_SIZE){
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc+i, bh));
for (int h = idx; h < height; h+=reduce_num) { out = OPERATE(out, in);
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
out = out + in;
} }
FLOAT* out_ptr = (FLOAT*)&out; sum[lid] = out;
for(int i = 1; i < channel; ++i){
out.x += out_ptr[i];
}
sum[idx] = out.x;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for(int i = reduce_num/2; i > 0; i /= 2){ for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
if (idx < i) if (lid < i)
sum[idx] = sum[idx] + sum[idx + i]; sum[lid] = OPERATE(sum[lid], sum[lid + i]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
if (idx == 0) { out = sum[0];
#else
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0]/(height*channel), 0.0, 0.0, 0.0)); for(int i = 0; i < inputWidth; ++i){
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc+i, bh));
out = OPERATE(out, in);
} }
#endif
#ifdef GET_AVG
out = out / inputWidth;
#endif
WI_F(output, (int2)(channel_idx, bh), out);
} }
__kernel void reduct_general_sum_local(GLOBAL_SIZE_2_DIMS
__kernel void reduct_height(GLOBAL_SIZE_3_DIMS
__read_only image2d_t input, __read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const int batch, __private const int inputWidth,
__private const int height, __private const int inputHeight,
__private const int width, __private const int inputChannel,
__private const int channel __private const int inputBatch,
__private const int inputChannelBlock,
__private const int outputWidth,
__private const int outputHeight,
__private const int outputChannel,
__private const int outputChannelBlock
) { ) {
const int batch_idx = get_global_id(1); #if LOCAL_SIZE > 0
const int width_idx = get_global_id(2); const int width_local_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int batch_channel_idx = get_global_id(2);
const int idx = get_local_id(0); DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_channel_idx);
FLOAT local sum[256];
FLOAT4 out = (FLOAT4)0.0; const int width_idx = get_group_id(0);
const int reduce_num = get_local_size(0); const int batch_idx = batch_channel_idx / outputChannelBlock;
const int channel_idx = batch_channel_idx % outputChannelBlock;
for (int h = idx; h < height; h+=reduce_num) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h)); const int bh = batch_idx*inputHeight;
out = out + in; const int wc = channel_idx*inputWidth+width_idx;
const int lid = get_local_id(0);
FLOAT4 local sum[LOCAL_SIZE];
FLOAT4 out = (FLOAT4)VALUE;
for(int i = lid; i < inputHeight; i+=LOCAL_SIZE){
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, bh+i));
out = OPERATE(out, in);
} }
FLOAT* out_ptr = (FLOAT*)&out; sum[lid] = out;
for(int i = 1; i < channel; ++i){
out.x += out_ptr[i];
}
sum[idx] = out.x;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for(int i = reduce_num/2; i > 0; i /= 2){ for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
if (idx < i) if (lid < i)
sum[idx] = sum[idx] + sum[idx + i]; sum[lid] = OPERATE(sum[lid], sum[lid + i]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
if (idx == 0) { out = sum[0];
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0], 0.0, 0.0, 0.0)); #else
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int batch_channel_idx = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
const int batch_idx = batch_channel_idx / outputChannelBlock;
const int channel_idx = batch_channel_idx % outputChannelBlock;
const int bh = batch_idx*inputHeight;
const int wc = channel_idx*inputWidth+width_idx;
FLOAT4 out = (FLOAT4)VALUE;
for(int i = 0; i < inputHeight; ++i){
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, bh+i));
out = OPERATE(out, in);
} }
#endif
#ifdef GET_AVG
out = out / inputHeight;
#endif
WI_F(output, (int2)(wc, batch_idx), out);
} }
__kernel void reduct_general_max_local(GLOBAL_SIZE_2_DIMS __kernel void reduct_channel(GLOBAL_SIZE_3_DIMS
__read_only image2d_t input, __read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const int batch, __private const int inputWidth,
__private const int height, __private const int inputHeight,
__private const int width, __private const int inputChannel,
__private const int channel __private const int inputBatch,
__private const int inputChannelBlock,
__private const int outputWidth,
__private const int outputHeight,
__private const int outputChannel,
__private const int outputChannelBlock
) { ) {
const int batch_idx = get_global_id(1); #if LOCAL_SIZE > 0
const int width_idx = get_global_id(2); const int width_local_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int idx = get_local_id(0); const int batch_idx = get_global_id(2);
FLOAT local sum[256];
FLOAT4 out = (FLOAT4)(-MAXFLOAT);
const int reduce_num = get_local_size(0);
for (int h = idx; h < height; h+=reduce_num) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
out = max(out, in);
}
FLOAT* out_ptr = (FLOAT*)&out;
for(int i = 1; i < channel; ++i){
out.x = max(out.x, out_ptr[i]);
}
sum[idx] = out.x;
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_idx);
const int width_idx = get_group_id(0);
const int bh = batch_idx*inputHeight+height_idx;
const int wc = width_idx;
int remain = inputChannel - (inputChannelBlock - 1) * 4;
const int lid = get_local_id(0);
FLOAT local sum[LOCAL_SIZE];
FLOAT4 out = (FLOAT4)VALUE;
FLOAT4 in;
FLOAT *inPtr = (FLOAT*)&in;
for(int i = lid; i < inputChannelBlock - 1; i += LOCAL_SIZE){
in = RI_F(input, SAMPLER, (int2)(i*inputWidth+wc, bh));
out = OPERATE(out, in);
}
out.x = OPERATE(out.x, out.y);
out.x = OPERATE(out.x, out.z);
out.x = OPERATE(out.x, out.w);
sum[lid] = out.x;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for(int i = reduce_num/2; i > 0; i /= 2){ for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
if (idx < i) if (lid < i)
sum[idx] = max(sum[idx], sum[idx + i]); sum[lid] = OPERATE(sum[lid], sum[lid + i]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
if (idx == 0) { out.x = sum[0];
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0], 0.0, 0.0, 0.0)); in = RI_F(input, SAMPLER, (int2)((inputChannelBlock - 1)*inputWidth+wc, bh));
for(int j = 0; j < remain; ++j){
out.x = OPERATE(out.x, inPtr[j]);
} }
#ifdef GET_AVG
out.x = out.x / inputChannel;
#endif
WI_F(output, (int2)(wc, bh), (FLOAT4)(out.x, 0, 0, 0));
#else
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int batch_idx = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
const int bh = batch_idx*inputHeight+height_idx;
const int wc = width_idx;
int remain = inputChannel - (inputChannelBlock - 1) * 4;
FLOAT out = (FLOAT)VALUE;
FLOAT4 in;
FLOAT *inPtr = (FLOAT*)&in;
for(int i = 0; i < inputChannelBlock - 1; ++i){
in = RI_F(input, SAMPLER, (int2)(i*inputWidth+wc, bh));
for(int j = 0; j < 4; ++j){
out = OPERATE(out, inPtr[j]);
}
}
in = RI_F(input, SAMPLER, (int2)((inputChannelBlock - 1)*inputWidth+wc, bh));
for(int j = 0; j < remain; ++j){
out = OPERATE(out, inPtr[j]);
}
#ifdef GET_AVG
out = out / inputChannel;
#endif
WI_F(output, (int2)(wc, bh), (FLOAT4)(out, 0, 0, 0));
#endif
} }
__kernel void reduct_general_min_local(GLOBAL_SIZE_2_DIMS __kernel void reduct_batch(GLOBAL_SIZE_3_DIMS
__read_only image2d_t input, __read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const int batch, __private const int inputWidth,
__private const int height, __private const int inputHeight,
__private const int width, __private const int inputChannel,
__private const int channel __private const int inputBatch,
__private const int inputChannelBlock,
__private const int outputWidth,
__private const int outputHeight,
__private const int outputChannel,
__private const int outputChannelBlock
) { ) {
const int batch_idx = get_global_id(1); #if LOCAL_SIZE > 0
const int width_idx = get_global_id(2); const int width_local_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int idx = get_local_id(0); const int channel_idx = get_global_id(2);
FLOAT local sum[256];
FLOAT4 out = (FLOAT4)(MAXFLOAT);
const int reduce_num = get_local_size(0); DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, channel_idx);
const int width_idx = get_group_id(0);
for (int h = idx; h < height; h+=reduce_num) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h)); const int bh = height_idx;
out = min(out, in); const int wc = channel_idx*inputWidth+width_idx;
int batchOffset = inputChannelBlock * inputHeight * inputWidth;
const int lid = get_local_id(0);
FLOAT4 local sum[LOCAL_SIZE];
FLOAT4 out = (FLOAT4)VALUE;
for(int i = lid; i < inputBatch; i+=LOCAL_SIZE){
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, i*inputHeight+bh));
out = OPERATE(out, in);
} }
FLOAT* out_ptr = (FLOAT*)&out; sum[lid] = out;
for(int i = 1; i < channel; ++i){
out.x = min(out.x, out_ptr[i]);
}
sum[idx] = out.x;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for(int i = reduce_num/2; i > 0; i /= 2){ for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
if (idx < i) if (lid < i)
sum[idx] = min(sum[idx], sum[idx + i]); sum[lid] = OPERATE(sum[lid], sum[lid + i]);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
if (idx == 0) { out = sum[0];
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0], 0.0, 0.0, 0.0)); #ifdef GET_AVG
} out = out / inputBatch;
} #endif
WI_F(output, (int2)(wc, bh), out);
#else
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int channel_idx = get_global_id(2);
__kernel void reduct_general_mul_local(GLOBAL_SIZE_2_DIMS DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, channel_idx);
__read_only image2d_t input,
__write_only image2d_t output,
__private const int batch,
__private const int height,
__private const int width,
__private const int channel
) {
const int batch_idx = get_global_id(1);
const int width_idx = get_global_id(2);
const int idx = get_local_id(0);
FLOAT local sum[256];
FLOAT4 out = (FLOAT4)1.0;
const int reduce_num = get_local_size(0);
for (int h = idx; h < height; h+=reduce_num) {
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
out = out * in;
}
FLOAT* out_ptr = (FLOAT*)&out;
for(int i = 1; i < channel; ++i){
out.x *= out_ptr[i];
}
sum[idx] = out.x;
barrier(CLK_LOCAL_MEM_FENCE); const int bh = height_idx;
for(int i = reduce_num/2; i > 0; i /= 2){ const int wc = channel_idx*inputWidth+width_idx;
if (idx < i) int batchOffset = inputChannelBlock * inputHeight * inputWidth;
sum[idx] = sum[idx] * sum[idx + i]; FLOAT4 out = (FLOAT4)VALUE;
barrier(CLK_LOCAL_MEM_FENCE); for(int i = 0; i < inputBatch; ++i){
} FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, i*inputHeight+bh));
if (idx == 0) { out = OPERATE(out, in);
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0], 0.0, 0.0, 0.0));
} }
#ifdef GET_AVG
out = out / inputBatch;
#endif
WI_F(output, (int2)(wc, bh), out);
#endif
} }

View File

@ -9,31 +9,363 @@
#define GLOBAL_SIZE_2_DIMS \ #define GLOBAL_SIZE_2_DIMS \
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim0, __private const int global_size_dim1,
__kernel void reduct_buf(GLOBAL_SIZE_2_DIMS #define GLOBAL_SIZE_3_DIMS \
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
return; \
}
__kernel void reduct_width_buf(GLOBAL_SIZE_3_DIMS
__global const FLOAT* input, __global const FLOAT* input,
__global FLOAT* output, __global FLOAT* output,
__private const int batch, __private const int inputWidth,
__private const int height, __private const int inputHeight,
__private const int width, __private const int inputChannel,
__private const int channel __private const int inputBatch,
__private const int inputChannelBlock,
__private const int outputWidth,
__private const int outputHeight,
__private const int outputChannel,
__private const int outputChannelBlock
) { ) {
const int batch_idx = get_global_id(0); const int width_idx = get_global_id(0);
const int width_idx = get_global_id(1); const int height_idx = get_global_id(1);
const int batch_channel_idx = get_global_id(2);
const int inp_offset = ((batch_idx * height + 0) * width + width_idx)*4; DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
FLOAT4 out = vload4(0, input + inp_offset);
for (int h = 1; h < height; h++) { const int batch_idx = batch_channel_idx / outputChannelBlock;
FLOAT4 in = vload4(0, input + inp_offset + h*width*4); const int channel_idx = batch_channel_idx % outputChannelBlock;
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + 0)*4;
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * outputWidth + 0)*4;
FLOAT4 out = (FLOAT4)VALUE;
#if LOCAL_SIZE > 0
const int lid = get_local_id(0);
FLOAT4 local sum[LOCAL_SIZE];
for(int i = lid; i < inputWidth; i+=LOCAL_SIZE){
FLOAT4 in = vload4(i, input + offset);
out = OPERATE(out, in); out = OPERATE(out, in);
} }
FLOAT* out_ptr = (FLOAT*)&out; sum[lid] = out;
for(int c = 1; c < channel; ++c){ barrier(CLK_LOCAL_MEM_FENCE);
out.x = OPERATE(out.x, out_ptr[c]); for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
barrier(CLK_LOCAL_MEM_FENCE);
} }
out = sum[0];
#ifdef GET_AVG #else
out.x = out.x / (height * channel); for(int i = 0; i < inputWidth; ++i){
#endif FLOAT4 in = vload4(i, input + offset);
const int out_offset = batch_idx * width + width_idx; out = OPERATE(out, in);
vstore4((FLOAT4)(out.x, 0.0, 0.0, 0.0), out_offset, output); }
#endif
#ifdef GET_AVG
out = out / inputWidth;
#endif
vstore4(out, 0, output + outputOffset);
}
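The `LOCAL_SIZE > 0` path above is a standard two-phase work-group reduction: each work-item first accumulates a strided partial result, then the partials are folded in local memory by successive halving, with a barrier between steps. A sequential C++ sketch of the same tree fold for the sum case (a `LOCAL_SIZE` of 8 is illustrative; the kernel receives it via `-DLOCAL_SIZE`):

```cpp
#include <cstdio>

int main() {
    const int LOCAL_SIZE = 8;  // illustrative; set by the -DLOCAL_SIZE build option
    const int inputWidth = 20;
    float sum[LOCAL_SIZE];

    // Phase 1: each "work-item" lid accumulates a strided partial sum.
    for (int lid = 0; lid < LOCAL_SIZE; ++lid) {
        float out = 0.0f;                           // VALUE for the sum/avg case
        for (int i = lid; i < inputWidth; i += LOCAL_SIZE) {
            out += (float)i;                        // OPERATE(a,b) = (a+b)
        }
        sum[lid] = out;
    }
    // Phase 2: tree fold, halving the active range each step
    // (the kernel separates steps with barrier(CLK_LOCAL_MEM_FENCE)).
    for (int i = LOCAL_SIZE / 2; i > 0; i /= 2) {
        for (int lid = 0; lid < i; ++lid) {
            sum[lid] += sum[lid + i];
        }
    }
    printf("%g\n", sum[0]); // 190 = 0 + 1 + ... + 19
    return 0;
}
```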
__kernel void reduct_height_buf(GLOBAL_SIZE_3_DIMS
__global const FLOAT* input,
__global FLOAT* output,
__private const int inputWidth,
__private const int inputHeight,
__private const int inputChannel,
__private const int inputBatch,
__private const int inputChannelBlock,
__private const int outputWidth,
__private const int outputHeight,
__private const int outputChannel,
__private const int outputChannelBlock
) {
#if LOCAL_SIZE > 0
const int width_local_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int batch_channel_idx = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_channel_idx);
const int width_idx = get_group_id(0);
const int batch_idx = batch_channel_idx / outputChannelBlock;
const int channel_idx = batch_channel_idx % outputChannelBlock;
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4;
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * outputWidth + width_idx)*4;
const int lid = get_local_id(0);
FLOAT4 local sum[LOCAL_SIZE];
FLOAT4 out = (FLOAT4)VALUE;
for(int i = lid; i < inputHeight; i+=LOCAL_SIZE){
FLOAT4 in = vload4(i * inputWidth, input + offset);
out = OPERATE(out, in);
}
sum[lid] = out;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
barrier(CLK_LOCAL_MEM_FENCE);
}
out = sum[0];
#else
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int batch_channel_idx = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
const int batch_idx = batch_channel_idx / outputChannelBlock;
const int channel_idx = batch_channel_idx % outputChannelBlock;
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4;
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * outputWidth + width_idx)*4;
FLOAT4 out = (FLOAT4)VALUE;
for(int i = 0; i < inputHeight; ++i){
FLOAT4 in = vload4(i * inputWidth, input + offset);
out = OPERATE(out, in);
}
#endif
#ifdef GET_AVG
out = out / inputHeight;
#endif
vstore4(out, 0, output + outputOffset);
}
__kernel void reduct_channel_buf(GLOBAL_SIZE_3_DIMS
__global const FLOAT* input,
__global FLOAT* output,
__private const int inputWidth,
__private const int inputHeight,
__private const int inputChannel,
__private const int inputBatch,
__private const int inputChannelBlock,
__private const int outputWidth,
__private const int outputHeight,
__private const int outputChannel,
__private const int outputChannelBlock
) {
#if LOCAL_SIZE > 0
const int width_local_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int batch_idx = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_idx);
const int width_idx = get_group_id(0);
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
const int outputOffset = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * outputWidth + width_idx)*4;
int remain = inputChannel - (inputChannelBlock - 1) * 4;
const int lid = get_local_id(0);
FLOAT local sum[LOCAL_SIZE];
FLOAT4 out = (FLOAT4)VALUE;
FLOAT4 in;
FLOAT *inPtr = (FLOAT*)&in;
for(int i = lid; i < inputChannelBlock - 1; i += LOCAL_SIZE){
in = vload4(i * inputWidth * inputHeight, input + offset);
out = OPERATE(out, in);
}
out.x = OPERATE(out.x, out.y);
out.x = OPERATE(out.x, out.z);
out.x = OPERATE(out.x, out.w);
sum[lid] = out.x;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
barrier(CLK_LOCAL_MEM_FENCE);
}
out.x = sum[0];
in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
for(int j = 0; j < remain; ++j){
out.x = OPERATE(out.x, inPtr[j]);
}
#ifdef GET_AVG
out.x = out.x / inputChannel;
#endif
output[outputOffset] = out.x;
#else
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int batch_idx = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
const int outputOffset = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * outputWidth + width_idx)*4;
int remain = inputChannel - (inputChannelBlock - 1) * 4;
FLOAT out = (FLOAT)VALUE;
FLOAT4 in;
FLOAT *inPtr = (FLOAT*)&in;
for(int i = 0; i < inputChannelBlock - 1; ++i){
in = vload4(i * inputWidth * inputHeight, input + offset);
for(int j = 0; j < 4; ++j){
out = OPERATE(out, inPtr[j]);
}
}
in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
for(int j = 0; j < remain; ++j){
out = OPERATE(out, inPtr[j]);
}
#ifdef GET_AVG
out = out / inputChannel;
#endif
output[outputOffset] = out;
#endif
}
__kernel void reduct_channel_dim1_buf(GLOBAL_SIZE_3_DIMS
__global const FLOAT* input,
__global FLOAT* output,
__private const int inputWidth,
__private const int inputHeight,
__private const int inputChannel,
__private const int inputBatch,
__private const int inputChannelBlock,
__private const int outputWidth,
__private const int outputHeight,
__private const int outputChannel,
__private const int outputChannelBlock
) {
#if LOCAL_SIZE > 0
const int width_local_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int batch_idx = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_idx);
const int width_idx = get_group_id(0);
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
const int outputOffset = ((batch_idx * outputHeight + height_idx) * outputWidth + width_idx);
int remain = inputChannel - (inputChannelBlock - 1) * 4;
const int lid = get_local_id(0);
FLOAT local sum[LOCAL_SIZE];
FLOAT4 out = (FLOAT4)VALUE;
FLOAT4 in;
FLOAT *inPtr = (FLOAT*)&in;
for(int i = lid; i < inputChannelBlock - 1; i += LOCAL_SIZE){
in = vload4(i * inputWidth * inputHeight, input + offset);
out = OPERATE(out, in);
}
out.x = OPERATE(out.x, out.y);
out.x = OPERATE(out.x, out.z);
out.x = OPERATE(out.x, out.w);
sum[lid] = out.x;
barrier(CLK_LOCAL_MEM_FENCE);
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
if (lid < i)
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
barrier(CLK_LOCAL_MEM_FENCE);
}
out.x = sum[0];
in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
for(int j = 0; j < remain; ++j){
out.x = OPERATE(out.x, inPtr[j]);
}
#ifdef GET_AVG
out.x = out.x / inputChannel;
#endif
output[outputOffset] = out.x;
#else
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int batch_idx = get_global_id(2);
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
const int outputOffset = ((batch_idx * outputHeight + height_idx) * outputWidth + width_idx);
int remain = inputChannel - (inputChannelBlock - 1) * 4;
FLOAT out = (FLOAT)VALUE;
FLOAT4 in;
FLOAT *inPtr = (FLOAT*)&in;
for(int i = 0; i < inputChannelBlock - 1; ++i){
in = vload4(i * inputWidth * inputHeight, input + offset);
for(int j = 0; j < 4; ++j){
out = OPERATE(out, inPtr[j]);
}
}
in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
for(int j = 0; j < remain; ++j){
out = OPERATE(out, inPtr[j]);
}
#ifdef GET_AVG
out = out / inputChannel;
#endif
output[outputOffset] = out;
#endif
}
__kernel void reduct_batch_buf(GLOBAL_SIZE_3_DIMS
                               __global const FLOAT* input,
                               __global FLOAT* output,
                               __private const int inputWidth,
                               __private const int inputHeight,
                               __private const int inputChannel,
                               __private const int inputBatch,
                               __private const int inputChannelBlock,
                               __private const int oututWidth,
                               __private const int outputHeight,
                               __private const int outputChannel,
                               __private const int outputChannelBlock
                               ) {
#if LOCAL_SIZE > 0
    const int width_local_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int channel_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, channel_idx);
    const int width_idx = get_group_id(0);
    const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4;
    const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + width_idx)*4;
    int batchOffset = inputChannelBlock * inputHeight * inputWidth;
    const int lid = get_local_id(0);
    FLOAT4 local sum[LOCAL_SIZE];
    FLOAT4 out = (FLOAT4)VALUE;
    // each work-item accumulates a strided subset of the batch
    for(int i = lid; i < inputBatch; i += LOCAL_SIZE){
        FLOAT4 in = vload4(i * batchOffset, input + offset);
        out = OPERATE(out, in);
    }
    sum[lid] = out;
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
        if (lid < i)
            sum[lid] = OPERATE(sum[lid], sum[lid + i]);
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    out = sum[0];
#ifdef GET_AVG
    out = out / inputBatch;
#endif
    vstore4(out, 0, output + outputOffset);
#else
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int channel_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, channel_idx);
    const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4;
    const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + width_idx)*4;
    int batchOffset = inputChannelBlock * inputHeight * inputWidth;
    FLOAT4 out = (FLOAT4)VALUE;
    for(int i = 0; i < inputBatch; ++i){
        FLOAT4 in = vload4(i * batchOffset, input + offset);
        out = OPERATE(out, in);
    }
#ifdef GET_AVG
    out = out / inputBatch;
#endif
    vstore4(out, 0, output + outputOffset);
#endif
}
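
All of these buffer kernels address memory with vload4/vstore4, whose first argument counts whole 4-element groups: per the OpenCL spec, vload4(off, p) reads the four values starting at p + off * 4. A stride such as i * inputWidth * inputHeight therefore advances one channel block at a time in the NC4HW4 layout. A small C++ equivalent of that addressing, assuming a plain float array (helper name is mine):

#include <array>
#include <cstddef>

// C++ equivalent of OpenCL vload4: `offset` is in units of 4-element groups,
// so the load starts at p + offset * 4.
std::array<float, 4> vload4_like(size_t offset, const float* p) {
    const float* base = p + offset * 4;
    return {base[0], base[1], base[2], base[3]};
}
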

View File

@ -0,0 +1,36 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define GLOBAL_SIZE_2_DIMS \
    __private const int global_size_dim0, __private const int global_size_dim1,

#define DEAL_NON_UNIFORM_DIM2(input1, input2)                       \
    if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { \
        return;                                                     \
    }

// Elementwise select: output[i] = select[i] ? input0[i] : input1[i].
// INSIZE1_EUQAL_1 / INSIZE2_EUQAL_1 mark a single-element input that is
// broadcast against the mask; the macro names (including the spelling)
// are supplied by the host when it builds this kernel.
__kernel void select_buf(GLOBAL_SIZE_2_DIMS
                         __global const FLOAT* select,
                         __global const FLOAT* input0,
                         __global const FLOAT* input1,
                         __global FLOAT* output
                         ) {
    const int idx = get_global_id(0);
    const int idy = get_global_id(1);

    DEAL_NON_UNIFORM_DIM2(idx, idy);

    if ((int)select[idx]) {
#ifdef INSIZE1_EUQAL_1
        output[idx] = input0[0];
#else
        output[idx] = input0[idx];
#endif
    } else {
#ifdef INSIZE2_EUQAL_1
        output[idx] = input1[0];
#else
        output[idx] = input1[idx];
#endif
    }
}
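
For orientation, launching select_buf from the host follows the same convention as the other MNN buffer kernels: the two global-size scalars come first, then the four buffers. The sketch below deliberately uses the plain OpenCL C++ wrapper API rather than MNN's runtime helpers, and every handle name (queue, kernel, the buffers, the work-group size of 64) is a hypothetical stand-in:

#include <CL/cl2.hpp>  // standard OpenCL C++ bindings, assumed available

// Hypothetical handles created elsewhere: a queue, a kernel built from
// "select_buf", and size-matched buffers of `count` FLOAT values each.
void runSelect(cl::CommandQueue& queue, cl::Kernel& kernel,
               cl::Buffer& selectBuf, cl::Buffer& in0Buf,
               cl::Buffer& in1Buf, cl::Buffer& outBuf, int count) {
    int idx = 0;
    kernel.setArg(idx++, count);  // global_size_dim0
    kernel.setArg(idx++, 1);      // global_size_dim1
    kernel.setArg(idx++, selectBuf);
    kernel.setArg(idx++, in0Buf);
    kernel.setArg(idx++, in1Buf);
    kernel.setArg(idx++, outBuf);
    // round the NDRange up; DEAL_NON_UNIFORM_DIM2 trims the overshoot
    size_t gws0 = (size_t)(count + 63) / 64 * 64;
    queue.enqueueNDRangeKernel(kernel, cl::NullRange,
                               cl::NDRange(gws0, 1), cl::NDRange(64, 1));
}
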

View File

@ -15,90 +15,76 @@ __constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP |
__kernel void softmax_channel(GLOBAL_SIZE_3_DIMS __read_only image2d_t input, __write_only image2d_t output, __private const int output_channels,
                              __private const int remain_channels, __private const int4 shape // NCHW
                              ) {
    const int width_idx = get_global_id(0);
    const int batch_height_idx = get_global_id(1);

    if (width_idx < shape.w && batch_height_idx < shape.x*shape.z) {
        FLOAT4 float_max_value = (FLOAT4)-FLT_MAX;
        FLOAT4 input_data;
        for (short i = 0; i < shape.y - 1; ++i) {
            input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * shape.w, batch_height_idx));
            float_max_value = max(float_max_value, input_data);
        }
        float_max_value.x = max(float_max_value.x, float_max_value.y);
        float_max_value.x = max(float_max_value.x, float_max_value.z);
        float_max_value.x = max(float_max_value.x, float_max_value.w);

        input_data = RI_F(input, SAMPLER, (int2)(width_idx + (shape.y - 1) * shape.w, batch_height_idx));
        if (remain_channels == 0) {
            float_max_value.x = max(float_max_value.x, input_data.x);
            float_max_value.x = max(float_max_value.x, input_data.y);
            float_max_value.x = max(float_max_value.x, input_data.z);
            float_max_value.x = max(float_max_value.x, input_data.w);
        } else if (remain_channels == 1) {
            float_max_value.x = max(float_max_value.x, input_data.z);
            float_max_value.x = max(float_max_value.x, input_data.y);
            float_max_value.x = max(float_max_value.x, input_data.x);
        } else if (remain_channels == 2) {
            float_max_value.x = max(float_max_value.x, input_data.y);
            float_max_value.x = max(float_max_value.x, input_data.x);
        } else if (remain_channels == 3) {
            float_max_value.x = max(float_max_value.x, input_data.x);
        }

        FLOAT4 accum_result = 0;
        for (short i = 0; i < shape.y - 1; ++i) {
            input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * shape.w, batch_height_idx));
            input_data = EXP(input_data - float_max_value.x);
            accum_result += input_data;
        }
        accum_result.x = accum_result.x + accum_result.y + accum_result.z + accum_result.w;
        input_data = RI_F(input, SAMPLER, (int2)(width_idx + (shape.y - 1) * shape.w, batch_height_idx));
        input_data -= float_max_value.x;
        if (remain_channels == 0) {
            accum_result.x += EXP(input_data.w);
            accum_result.x += EXP(input_data.z);
            accum_result.x += EXP(input_data.y);
            accum_result.x += EXP(input_data.x);
        } else if (remain_channels == 1) {
            accum_result.x += EXP(input_data.z);
            accum_result.x += EXP(input_data.y);
            accum_result.x += EXP(input_data.x);
        } else if (remain_channels == 2) {
            accum_result.x += EXP(input_data.y);
            accum_result.x += EXP(input_data.x);
        } else if (remain_channels == 3) {
            accum_result.x += EXP(input_data.x);
        }

        for(int i = 0; i < shape.y; ++i){
            int cur_out_width_pos = mad24(i, shape.w, width_idx);
            input_data = RI_F(input, SAMPLER, (int2)(cur_out_width_pos, batch_height_idx)) - float_max_value.x;
            input_data = EXP(input_data) / accum_result.x;
            WI_F(output, (int2)(cur_out_width_pos, batch_height_idx), input_data);
        }
    }
}
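
The kernel above computes the usual numerically stable softmax: the channel-wise maximum m is subtracted before exponentiating, so softmax(x)_c = exp(x_c - m) / sum_k exp(x_k - m) and EXP never overflows. A scalar C++ reference for what it produces at each (batch, height, width) position (stride 1 here, where the kernel instead walks channel blocks):

#include <algorithm>
#include <cmath>

// Reference softmax over `channels` contiguous values for one pixel.
void softmaxRef(const float* x, float* y, int channels) {
    float m = *std::max_element(x, x + channels);  // subtract max for stability
    float sum = 0.f;
    for (int c = 0; c < channels; ++c) sum += std::exp(x[c] - m);
    for (int c = 0; c < channels; ++c) y[c] = std::exp(x[c] - m) / sum;
}
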
__kernel void softmax_height(__read_only image2d_t input, __write_only image2d_t output,

View File

@ -19,87 +19,74 @@ __kernel void softmax_channel(GLOBAL_SIZE_3_DIMS
                             __private const int remain_channels,
                             __private const int4 shape) { //NCHW
    const int width_idx = get_global_id(0);
    const int batch_height_idx = get_global_id(1);

    if (width_idx < shape.w && batch_height_idx < shape.x*shape.z) {
        const int batch_idx = batch_height_idx / shape.z;
        const int height_idx = batch_height_idx % shape.z;
        const int offset = (((batch_idx*shape.y+0)*shape.z+height_idx)*shape.w+width_idx)*4;
        FLOAT4 float_max_value = (FLOAT4)-FLT_MAX;
        FLOAT4 input_data;
        for (short i = 0; i < shape.y - 1; ++i) {
            input_data = vload4(i*shape.z*shape.w, input+offset);
            float_max_value = max(float_max_value, input_data);
        }
        float_max_value.x = max(float_max_value.x, float_max_value.y);
        float_max_value.x = max(float_max_value.x, float_max_value.z);
        float_max_value.x = max(float_max_value.x, float_max_value.w);

        input_data = vload4((shape.y - 1)*shape.z*shape.w, input+offset);
        if (remain_channels == 0) {
            float_max_value.x = max(float_max_value.x, input_data.x);
            float_max_value.x = max(float_max_value.x, input_data.y);
            float_max_value.x = max(float_max_value.x, input_data.z);
            float_max_value.x = max(float_max_value.x, input_data.w);
        } else if (remain_channels == 1) {
            float_max_value.x = max(float_max_value.x, input_data.z);
            float_max_value.x = max(float_max_value.x, input_data.y);
            float_max_value.x = max(float_max_value.x, input_data.x);
        } else if (remain_channels == 2) {
            float_max_value.x = max(float_max_value.x, input_data.y);
            float_max_value.x = max(float_max_value.x, input_data.x);
        } else if (remain_channels == 3) {
            float_max_value.x = max(float_max_value.x, input_data.x);
        }

        FLOAT4 accum_result = 0;
        for (short i = 0; i < shape.y - 1; ++i) {
            input_data = vload4(i*shape.z*shape.w, input+offset);
            input_data = EXP(input_data - float_max_value.x);
            accum_result += input_data;
        }
        accum_result.x = accum_result.x + accum_result.y + accum_result.z + accum_result.w;
        input_data = vload4((shape.y - 1)*shape.z*shape.w, input+offset);
        input_data -= float_max_value.x;
        if (remain_channels == 0) {
            accum_result.x += EXP(input_data.w);
            accum_result.x += EXP(input_data.z);
            accum_result.x += EXP(input_data.y);
            accum_result.x += EXP(input_data.x);
        } else if (remain_channels == 1) {
            accum_result.x += EXP(input_data.z);
            accum_result.x += EXP(input_data.y);
            accum_result.x += EXP(input_data.x);
        } else if (remain_channels == 2) {
            accum_result.x += EXP(input_data.y);
            accum_result.x += EXP(input_data.x);
        } else if (remain_channels == 3) {
            accum_result.x += EXP(input_data.x);
        }

        for(int i = 0; i < shape.y; ++i){
            input_data = vload4(i*shape.z*shape.w, input+offset) - float_max_value.x;
            input_data = EXP(input_data) / accum_result.x;
            vstore4(input_data, i*shape.z*shape.w, output+offset);
        }
    }
}
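
In both softmax files the channel dimension is processed as shape.y blocks of four, and remain_channels counts the padding lanes in the last block (remain_channels == 0 means the block is full). With 10 channels, for instance, shape.y = 3 and remain_channels = 3*4 - 10 = 2, which is why that branch only folds in the .x and .y lanes. A one-line check of that bookkeeping (function name is mine):

#include <cassert>

// remain_channels as the kernels use it: padding lanes in the last 4-block.
int remainChannels(int channels) {
    int blocks = (channels + 3) / 4;  // shape.y
    return blocks * 4 - channels;     // 0..3
}

int main() {
    assert(remainChannels(10) == 2);  // 3 blocks; .z/.w of the last are padding
    assert(remainChannels(8) == 0);   // exact multiple of 4
    return 0;
}
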

View File

@ -18,12 +18,7 @@ ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : Co
MNN_PRINT("start ReductionExecution init !\n"); MNN_PRINT("start ReductionExecution init !\n");
#endif #endif
mOpenCLBackend = static_cast<OpenCLBackend *>(backend); mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
auto reduct = op->main_as_ReductionParam(); mAxis = op->main_as_ReductionParam()->dim()->data()[0];
if (nullptr != reduct->dim()) {
for (int i = 0; i < reduct->dim()->size(); ++i) {
mAxis.push_back(reduct->dim()->data()[i]);
}
}
switch (op->main_as_ReductionParam()->operation()) { switch (op->main_as_ReductionParam()->operation()) {
case ReductionType_MEAN: case ReductionType_MEAN:
mReductType = 0; mReductType = 0;
@ -49,110 +44,150 @@ ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : Co
#endif
}

int ReductionExecution::getLocalSize(int size, int maxGroupSize){
    int local_size = 1;
    while(local_size * 2 <= maxGroupSize && local_size * 2 <= size){
        local_size *= 2;
    }
    return local_size;
}

ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto runtime = mOpenCLBackend->getOpenCLRuntime();
    startRecord(runtime, mRecording);
    auto input = inputs[0];
    auto output = outputs[0];
    if(mAxis < 0){
        mAxis = input->dimensions() + mAxis;
    }
    int inside = 1;
    int outside = 1;
    for(int i = 0; i < mAxis; ++i){
        outside *= input->length(i);
    }
    for(int i = mAxis + 1; i < input->dimensions(); ++i){
        inside *= input->length(i);
    }
    int dim = input->length(mAxis);
    int local_size = 0;
    auto MaxWorkItems = runtime->getMaxWorkItemSizes();
    if(dim >= 16){
        mUseLocal = true;
    }

    std::vector<int> inputShape  = tensorShapeFormat(input);
    std::vector<int> outputShape = tensorShapeFormat(output);
    int batch               = inputShape.at(0);
    int inputHeight         = inputShape.at(1);
    int inputWidth          = inputShape.at(2);
    int inputChannels       = inputShape.at(3);
    int inputChannelBlocks  = (inputChannels + 3) / 4;
    int outputBatch         = outputShape.at(0);
    int outputHeight        = outputShape.at(1);
    int outputWidth         = outputShape.at(2);
    int outputChannels      = outputShape.at(3);
    int outputChannelBlocks = (outputChannels + 3) / 4;

    std::set<std::string> buildOption;
    switch (mReductType) {
        case 0:
            buildOption.emplace("-DOPERATE(a,b)=(a+b)");
            buildOption.emplace("-DGET_AVG");
            buildOption.emplace("-DVALUE=0");
            break;
        case 1:
            buildOption.emplace("-DOPERATE(a,b)=max(a,b)");
            buildOption.emplace("-DVALUE=-FLT_MAX");
            break;
        case 2:
            buildOption.emplace("-DOPERATE(a,b)=min(a,b)");
            buildOption.emplace("-DVALUE=FLT_MAX");
            break;
        case 3:
            buildOption.emplace("-DOPERATE(a,b)=(a*b)");
            buildOption.emplace("-DVALUE=1");
            break;
        case 4:
            buildOption.emplace("-DOPERATE(a,b)=(a+b)");
            buildOption.emplace("-DVALUE=0");
            break;
        default:
            MNN_ASSERT(false);
            break;
    }
    mGlobalWorkSize = {
        static_cast<uint32_t>(outputWidth),
        static_cast<uint32_t>(outputHeight),
        static_cast<uint32_t>(outputBatch * outputChannelBlocks)
    };
    if(mUseLocal){
        if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
            local_size = getLocalSize(inputWidth, MaxWorkItems[0]);
            buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_width", buildOption);
        }else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
            local_size = getLocalSize(inputHeight, MaxWorkItems[0]);
            buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_height", buildOption);
        }else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
            local_size = getLocalSize(inputChannelBlocks - 1, MaxWorkItems[0]);
            buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_channel", buildOption);
            mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
        }else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
            local_size = getLocalSize(batch, MaxWorkItems[0]);
            buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_batch", buildOption);
        }
        mGlobalWorkSize[0] *= local_size;
    }else{
        buildOption.emplace("-DLOCAL_SIZE=0");
        if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_width", buildOption);
        }else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_height", buildOption);
        }else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_channel", buildOption);
            mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
        }else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_batch", buildOption);
        }
    }

    mUnits.resize(1);
    uint32_t idx = 0;
    cl_int ret = CL_SUCCESS;
    ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[0]);
    ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[1]);
    ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[2]);
    ret |= mReduct1DKernel.setArg(idx++, openCLImage(input));
    ret |= mReduct1DKernel.setArg(idx++, openCLImage(output));
    ret |= mReduct1DKernel.setArg(idx++, inputWidth);
    ret |= mReduct1DKernel.setArg(idx++, inputHeight);
    ret |= mReduct1DKernel.setArg(idx++, inputChannels);
    ret |= mReduct1DKernel.setArg(idx++, batch);
    ret |= mReduct1DKernel.setArg(idx++, inputChannelBlocks);
    ret |= mReduct1DKernel.setArg(idx++, outputWidth);
    ret |= mReduct1DKernel.setArg(idx++, outputHeight);
    ret |= mReduct1DKernel.setArg(idx++, outputChannels);
    ret |= mReduct1DKernel.setArg(idx++, outputChannelBlocks);
    MNN_CHECK_CL_SUCCESS(ret, "setArg ReductionExecution");
    if(mUseLocal){
        mLocalWorkSize = {static_cast<uint32_t>(local_size), 1, 1};
    }else{
        auto MaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mReduct1DKernel));
        std::string kernelName = "reduct";
        mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, MaxWorkGroupSize, runtime, kernelName, mReduct1DKernel).first;
    }
    recordKernel3d(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
    endRecord(runtime, mRecording);
    return NO_ERROR;
}
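
getLocalSize above returns the largest power of two that is at most both the reduction length and the device's work-group limit, which keeps the halving loop in the kernels exact: getLocalSize(300, 256) is 256, getLocalSize(20, 256) is 16. A standalone check of that behavior (a copy of the function under a different name, so it compiles on its own):

#include <cassert>

// Mirrors ReductionExecution::getLocalSize: largest power of two
// <= min(size, maxGroupSize), never less than 1.
int getLocalSizeRef(int size, int maxGroupSize) {
    int local_size = 1;
    while (local_size * 2 <= maxGroupSize && local_size * 2 <= size) {
        local_size *= 2;
    }
    return local_size;
}

int main() {
    assert(getLocalSizeRef(300, 256) == 256);
    assert(getLocalSizeRef(20, 256)  == 16);
    assert(getLocalSizeRef(1, 256)   == 1);
    return 0;
}
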
@ -164,13 +199,7 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
#ifdef ENABLE_OPENCL_TIME_PROFILER
    cl::Event event;
    run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
                       mOpenCLBackend->getOpenCLRuntime(), &event);

    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
    MNN_PRINT("kernel cost:%d us Reduct1D\n",costTime);
#else
@ -182,13 +211,7 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
#endif
        return NO_ERROR;
    }
    run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
                       mOpenCLBackend->getOpenCLRuntime());
#endif

#ifdef LOG_VERBOSE
@ -202,32 +225,36 @@ public:
    virtual ~ReductionCreator() = default;
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const override {
        auto openCLBackend = static_cast<OpenCLBackend *>(backend);
        auto reduct = op->main_as_ReductionParam();
        if (nullptr == reduct->dim()) {
            return NULL;
        }
        if(reduct->dim()->size() != 1) {
            return NULL;
        }
        auto axis = reduct->dim()->data()[0];
        int dim = inputs[0]->length(axis);
        std::vector<int> inputShape = tensorShapeFormat(inputs[0]);
        if(dim == inputShape.at(3) && outputs[0]->buffer().dimensions == 1){
            return NULL;
        }
        switch (op->main_as_ReductionParam()->operation()) {
            case ReductionType_MEAN:
                break;
            case ReductionType_MAXIMUM:
                break;
            case ReductionType_MINIMUM:
                break;
            case ReductionType_PROD:
                break;
            case ReductionType_SUM:
                break;
            default:
                return NULL;
                break;
        }
        return new ReductionExecution(op, backend);
    }
};
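
The kernel dispatch in onResize reduces to matching the (outside, dim, inside) factorization of the tensor against the NC4HW4 layout. For an NCHW input of shape (2, 8, 32, 32) with axis = 1: outside = 2, dim = 8, inside = 32*32 = 1024, which satisfies batch == outside and inputWidth * inputHeight == inside, so reduct_channel is selected. A small sketch of that factorization, assuming plain int shape vectors (struct and function names are mine):

#include <cassert>
#include <vector>

// (outside, dim, inside) split used by ReductionExecution::onResize.
struct Split { int outside, dim, inside; };

Split splitAt(const std::vector<int>& shape, int axis) {
    if (axis < 0) axis += (int)shape.size();  // same negative-axis fixup
    Split s{1, shape[axis], 1};
    for (int i = 0; i < axis; ++i) s.outside *= shape[i];
    for (int i = axis + 1; i < (int)shape.size(); ++i) s.inside *= shape[i];
    return s;
}

int main() {
    Split s = splitAt({2, 8, 32, 32}, 1);  // NCHW, reduce over C
    assert(s.outside == 2 && s.dim == 8 && s.inside == 1024);
    return 0;
}
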

View File

@ -28,11 +28,12 @@ public:
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    int getLocalSize(int size, int maxGroupSize);
    cl::Kernel mReduct1DKernel;
    OpenCLBackend *mOpenCLBackend;
    MNN::DataType mdataType;
    int mReductType;
    int mAxis;
    std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
    std::vector<uint32_t> mLocalWorkSize{1, 1, 1};
    bool mUseLocal = false;

Some files were not shown because too many files have changed in this diff.