mirror of https://github.com/alibaba/MNN.git
Merge pull request #2580 from alibaba/feature/sync
[MNN:Sync] Sync Internal 2.7.0
This commit is contained in:
commit 9e3cc72952
@ -715,9 +715,7 @@ elseif(CMAKE_SYSTEM_NAME MATCHES "^Android")
else()
endif()
if (NOT MNN_BUILD_SHARED_LIBS)
if(APPLE)
set(MNN_DEPS -Wl,-all_load ${MNN_DEPS} -Wl,-noall_load)
elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# Static linking will not replace the thread-related weak symbols in glibc with the strong symbols
# from the pthread library, so we need to use --whole-archive for pthread
# https://stackoverflow.com/questions/35116327/when-g-static-link-pthread-cause-segmentation-fault-why
@ -473,16 +473,23 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
break;
case UnaryOpOperation_LOG1P:
if(mVectorize) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)(log(1.0+(float)" << operand << ".x));\n";
ss << inpName << ".y=(half)(log(1.0+(float)" << operand << ".y))";
} else {
ss << inpName << ".x=(log(1.0+" << operand << ".x));\n";
ss << inpName << ".y=(log(1.0+" << operand << ".y))";
if(mPrecision != BackendConfig::Precision_Low) {
ss << ";\n";
ss << inpName << ".z=(log(1.0+" << operand << ".z));\n";
ss << inpName << ".w=(log(1.0+" << operand << ".w))";
}
} else {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(log((half)1.0+" << operand << "))";
} else {
ss << inpName << "=(log(1.0+" << operand << "))";
}
}
break;
case UnaryOpOperation_FLOOR:
if(mVectorize) {
@ -512,16 +519,23 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
break;
case UnaryOpOperation_SIGMOID:
if(mVectorize) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)(1.0/(1.0+(float)exp(-" << operand << ".x)));\n";
ss << inpName << ".y=(half)(1.0/(1.0+(float)exp(-" << operand << ".y)))";
} else {
ss << inpName << ".x=(1.0/(1.0+exp(-" << operand << ".x)));\n";
ss << inpName << ".y=(1.0/(1.0+exp(-" << operand << ".y)))";
if(mPrecision != BackendConfig::Precision_Low) {
ss << ";\n";
ss << inpName << ".z=(1.0/(1.0+exp(-" << operand << ".z)));\n";
ss << inpName << ".w=(1.0/(1.0+exp(-" << operand << ".w)))";
}
} else {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(half)(1.0/(1.0+(float)exp(-" << operand << ")))";
} else {
ss << inpName << "=(1.0/(1.0+exp(-" << operand << ")))";
}
}
break;
case UnaryOpOperation_TANH:
if(mVectorize) {
@ -538,16 +552,23 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
break;
case UnaryOpOperation_RECIPROCAL:
if(mVectorize) {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << ".x=(half)(1.0/(float)" << operand << ".x);\n";
ss << inpName << ".y=(half)(1.0/(float)" << operand << ".y)";
} else {
ss << inpName << ".x=(1.0/" << operand << ".x);\n";
ss << inpName << ".y=(1.0/" << operand << ".y)";
if(mPrecision != BackendConfig::Precision_Low) {
ss << ";\n";
ss << inpName << ".z=(1.0/" << operand << ".z);\n";
ss << inpName << ".w=(1.0/" << operand << ".w)";
}
} else {
if(mPrecision == BackendConfig::Precision_Low) {
ss << inpName << "=(half)(1.0/(float)" << operand << ")";
} else {
ss << inpName << "=(1.0/" << operand << ")";
}
}
break;
case UnaryOpOperation_LOG:
if(mVectorize) {
@ -564,15 +585,42 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
|
|||
break;
|
||||
case UnaryOpOperation_GELU:
|
||||
if(mVectorize) {
|
||||
ss << inpName << ".x=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".x*" << operand << ".x*" << operand << ".x+" << operand + ".x))) * " << operand << ".x* 0.5f);\n";
|
||||
ss << inpName << ".y=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".y*" << operand << ".y*" << operand << ".y+" << operand + ".y))) * " << operand << ".y* 0.5f)";
|
||||
if(mPrecision != BackendConfig::Precision_Low) {
|
||||
if(mPrecision == BackendConfig::Precision_Low) {
|
||||
ss << inpName << ".x=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << ".x*(float)" << operand << ".x*(float)" << operand << ".x+(float)" << operand + ".x))) * (float)" << operand << ".x* 0.5f);\n";
|
||||
ss << inpName << ".y=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << ".y*(float)" << operand << ".y*(float)" << operand << ".y+(float)" << operand + ".y))) * (float)" << operand << ".y* 0.5f)";
|
||||
} else {
|
||||
ss << inpName << ".x=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".x*" << operand << ".x*" << operand << ".x+" << operand + ".x))) * " << operand << ".x* 0.5f);\n";
|
||||
ss << inpName << ".y=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".y*" << operand << ".y*" << operand << ".y+" << operand + ".y))) * " << operand << ".y* 0.5f)";
|
||||
ss << ";\n";
|
||||
ss << inpName << ".z=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".z*" << operand << ".z*" << operand << ".z+" << operand + ".z))) * " << operand << ".z* 0.5f);\n";
|
||||
ss << inpName << ".w=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".w*" << operand << ".w*" << operand << ".w+" << operand + ".w))) * " << operand << ".w* 0.5f)";
|
||||
ss << inpName << ".z=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".z*" << operand << ".z*" << operand << ".z+" << operand + ".z))) * " << operand << ".z* 0.5f);\n";
|
||||
ss << inpName << ".w=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".w*" << operand << ".w*" << operand << ".w+" << operand + ".w))) * " << operand << ".w* 0.5f)";
|
||||
}
|
||||
} else {
|
||||
ss << inpName << "=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << "*" << operand << "*" << operand << "+" << operand + "))) * " << operand << "* 0.5f)";
|
||||
if(mPrecision == BackendConfig::Precision_Low) {
|
||||
ss << inpName << "=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << "*(float)" << operand << "*(float)" << operand << "+(float)" << operand + "))) * (float)" << operand << "* 0.5f)";
|
||||
} else {
|
||||
ss << inpName << "=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << "*" << operand << "*" << operand << "+" << operand + "))) * " << operand << "* 0.5f)";
|
||||
}
|
||||
}
|
||||
break;
|
||||
case UnaryOpOperation_GELU_STANDARD:
|
||||
if(mVectorize) {
|
||||
if(mPrecision == BackendConfig::Precision_Low) {
|
||||
ss << inpName << ".x=(half)((erf((float)" << operand << ".x*0.7071067932881648f)+1.f)*(float)" << operand << ".x*0.5f);\n";
|
||||
ss << inpName << ".y=(half)((erf((float)" << operand << ".y*0.7071067932881648f)+1.f)*(float)" << operand << ".y*0.5f)";
|
||||
} else {
|
||||
ss << inpName << ".x=((erf(" << operand << ".x*0.7071067932881648f)+1.f)*" << operand << ".x*0.5f);\n";
|
||||
ss << inpName << ".y=((erf(" << operand << ".y*0.7071067932881648f)+1.f)*" << operand << ".y*0.5f)";
|
||||
ss << ";\n";
|
||||
ss << inpName << ".z=((erf(" << operand << ".z*0.7071067932881648f)+1.f)*" << operand << ".z*0.5f);\n";
|
||||
ss << inpName << ".w=((erf(" << operand << ".w*0.7071067932881648f)+1.f)*" << operand << ".w*0.5f)";
|
||||
}
|
||||
} else {
|
||||
if(mPrecision == BackendConfig::Precision_Low) {
|
||||
ss << inpName << "=(half)((erf((float)" << operand << "*0.7071067932881648f)+1.f)*(float)" << operand << "*0.5f)";
|
||||
} else {
|
||||
ss << inpName << "=((erf(" << operand << "*0.7071067932881648f)+1.f)*" << operand << "*0.5f)";
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
|
|
|
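Every unary-op branch in the codegen hunks above follows the same pattern: in Precision_Low the expression is computed in float and cast back to half lane by lane, otherwise it is emitted directly, with the .z/.w lanes appended for full four-wide vectors. Below is a minimal, self-contained sketch of that string assembly for the SIGMOID low-precision branch; the inpName/operand values are hypothetical placeholders, and the snippet only illustrates what the generator emits rather than being part of it.

```cpp
#include <iostream>
#include <sstream>
#include <string>

int main() {
    // Hypothetical stand-ins for the generator's inpName/operand variables.
    std::string inpName = "O0";
    std::string operand = "I0";
    std::stringstream ss;
    // Mirrors the vectorized Precision_Low branch of UnaryOpOperation_SIGMOID:
    // compute in float, cast each lane back to half.
    ss << inpName << ".x=(half)(1.0/(1.0+(float)exp(-" << operand << ".x)));\n";
    ss << inpName << ".y=(half)(1.0/(1.0+(float)exp(-" << operand << ".y)))";
    std::cout << ss.str() << std::endl;
    // Prints the two statements that end up in the generated kernel source:
    //   O0.x=(half)(1.0/(1.0+(float)exp(-I0.x)));
    //   O0.y=(half)(1.0/(1.0+(float)exp(-I0.y)))
    return 0;
}
```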
@ -104,12 +104,9 @@ int main(int argc, char* argv[]) {
for (int i = 0; i < 3; ++i) {
outputs = module->onForward(inputs);
}
globalExecutor->resetProfile();
outputs = module->onForward(inputs);
globalExecutor->dumpProfile();
{
MNN::Timer autoTime;
globalExecutor->resetProfile();
for (int i = 0; i < benchTime; ++i) {
MNN::AutoTime _t(0, "Once time");
// std::cout << i << std::endl;
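The benchmark hunk above uses the common warm-up-then-measure pattern: a few untimed forward passes, one profiled pass, then a timed loop. A standalone sketch of the same pattern with std::chrono (MNN::Timer / AutoTime are deliberately not used here, and the lambda body is a placeholder):

```cpp
#include <chrono>
#include <cstdio>
#include <functional>

// Run `run` warmUp times untimed, then average the wall time of `loops` runs.
static double benchMs(const std::function<void()>& run, int warmUp, int loops) {
    for (int i = 0; i < warmUp; ++i) run();               // stabilize caches/tuning
    auto begin = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < loops; ++i) run();
    auto end = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<double, std::milli>(end - begin).count() / loops;
}

int main() {
    double ms = benchMs([] { /* module->onForward(inputs) would go here */ }, 3, 10);
    std::printf("avg %.3f ms\n", ms);
    return 0;
}
```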
@ -42,9 +42,7 @@ int main(int argc, const char* argv[]) {
for (int i = 0; i < 2; ++i) {
{
AUTOTIME;
Executor::getGlobalExecutor()->resetProfile();
outputs = model->onForward({first, second});
Executor::getGlobalExecutor()->dumpProfile();
}
std::ostringstream fileNameOs;
std::ostringstream dimInfo;
@ -10,7 +10,7 @@
- warm_up_count: number of warm-up runs
- forwardtype: optional, default 0 (CPU); valid values are 0->CPU, 1->Metal, 3->OpenCL, 6->OpenGL, 7->Vulkan
- numberThread: optional, default 4; the number of CPU threads, or the GPU run mode
- precision: optional, default 2 (precision_low)
- precision: optional, default 2; valid values: 0 (Normal), 1 (High), 2 (Low_FP16), 3 (Low_BF16)
- weightSparsity: optional, default 0.0; sparse computation is enabled when weightSparsity > 0.5 and the backend supports it
- weightSparseBlockNumber: optional, default 1; only takes effect when weightSparsity > 0.5. It is the block size used for sparse computation; larger values favor sparse acceleration, and 1, 4, 8, 16 are the usual choices
- testQuantizedModel: optional, default 0, i.e. only the float model is tested; when set to 1, the quantized model is tested after the float model
@ -68,7 +68,7 @@ Avg= 5.570600 ms, OpSum = 7.059200 ms min= 3.863000 ms, max= 11.596001 ms
### Parameters
`./ModuleBasic.out model dir [runMask forwardType runLoops numberThread precision_memory cacheFile]`
- `model:str` path to the model file
- `dir:str` folder with the input/output information, which can be generated with scripts such as fastTestOnnx.py / fastTestTf.py / fastTestTflite.py; see the correctness-verification part of the model-conversion documentation.
- `dir:str` folder with the input/output information, which can be generated with scripts such as testMNNFromTf.py / testMNNFromOnnx.py / testMNNFromTflite.py; see the correctness-verification part of the model-conversion documentation.
- `runMask:int` default 0; a set of feature switches. To enable several features, add the corresponding mask values together (cases that cannot be combined are noted separately); see the runMask description below
- `forwardType:int` the compute device used for inference; valid values are 0 (CPU), 1 (Metal), 2 (CUDA), 3 (OpenCL), 6 (OpenGL), 7 (Vulkan), 9 (TensorRT); optional, default `0`
- `runLoops:int` number of loops for the performance test; optional, default `0`, meaning no performance test is run
@ -456,49 +456,3 @@ Matrix:
0.0000000 0.0000000 1.0000000
```

## winogradGenerateCL.out
### Description
Generates the Winograd transform matrices and the corresponding OpenCL transform code
### Parameters
`./winogradExample.out unit kernelSize`
- `unit:int` tile size
- `kernelSize:int` convolution kernel size
### Example
```bash
$ ./winogradGenerateCL.out 2 2
A
1.0000000 0.0000000
1.0000000 0.5000000
0.0000000 1.0000000
B
1.0000000 0.0000000 -0.0000000
-2.0000000 2.0000000 -0.5000000
0.0000000 0.0000000 1.0000000
G
1.0000000 0.0000000
1.0000000 0.5000000
0.0000000 1.0000000
Generate winogradTransformSource2_2_0.5.cl
Generate winogradTransformDest2_2_0.5.cl
```

## winogradGenerateGLSL.out
### Description
Generates the Winograd transform matrices and the corresponding OpenGL transform code
### Parameters
`./winogradExample.out unit kernelSize`
- `unit:int` tile size
- `kernelSize:int` convolution kernel size
### Example
```bash
$ ./winogradGenerateGLSL.out 1 2
A
1.0000000
B
1.0000000 -0.0000000
0.0000000 1.0000000
G
1.0000000
Generate winogradTransformSource1_2_0.5.comp
Generate winogradTransformDest1_2_0.5.comp
```
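For context, the A/B/G matrices printed above are the Winograd F(m, r) transform matrices; the tiled convolution they implement is the textbook formulation below (g is the r x r kernel tile, d the input tile — this formula is general background, not output of the tool):

```latex
% Standard Winograd convolution F(m, r):
% g: r x r kernel tile, d: (m + r - 1) x (m + r - 1) input tile, \odot: elementwise product
Y = A^{T} \left[ \left( G\, g\, G^{T} \right) \odot \left( B^{T} d\, B \right) \right] A
```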
@ -13,11 +13,7 @@ if(MNN_CUDA_PROFILE)
endif()

file(GLOB_RECURSE MNN_EXPR_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.*")
option(MNN_EXPR_ENABLE_PROFILER "Support profile Expr's op cost" OFF)
option(MNN_EXPR_SHAPE_EAGER "Force compute Expr's shape directly cost" OFF)
IF (MNN_EXPR_ENABLE_PROFILER)
add_definitions(-DMNN_EXPR_ENABLE_PROFILER)
ENDIF()
IF (MNN_EXPR_SHAPE_EAGER)
add_definitions(-DMNN_EXPR_SHAPE_EAGER)
ENDIF()
@ -21,55 +21,9 @@
|
|||
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||
#define MNN_EXPRESS_ERROR_REPORT
|
||||
#endif
|
||||
#define MNN_EXPRESS_OPEN_MEMORY_REUSE
|
||||
|
||||
namespace MNN {
|
||||
namespace Express {
|
||||
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||
class Executor::Profiler {
|
||||
public:
|
||||
void reset();
|
||||
void dump() const;
|
||||
void add(const std::string& opType, float timeInMs);
|
||||
void addFlops(const std::string& opType, float flops);
|
||||
private:
|
||||
std::map<std::string, float> mTimes;
|
||||
std::map<std::string, float> mFlops;
|
||||
};
|
||||
void Executor::Profiler::reset() {
|
||||
mTimes.clear();
|
||||
mFlops.clear();
|
||||
}
|
||||
void Executor::Profiler::dump() const {
|
||||
float sumValue = 0.0f;
|
||||
for (auto iter : mTimes) {
|
||||
MNN_PRINT("%s: %f ms\n", iter.first.c_str(), iter.second);
|
||||
sumValue += iter.second;
|
||||
}
|
||||
MNN_PRINT("Total: %f ms\n", sumValue);
|
||||
sumValue = 0.0f;
|
||||
for (auto iter : mFlops) {
|
||||
MNN_PRINT("%s: %f \n", iter.first.c_str(), iter.second);
|
||||
sumValue += iter.second;
|
||||
}
|
||||
MNN_PRINT("Total flops: %f M\n", sumValue);
|
||||
}
|
||||
void Executor::Profiler::add(const std::string& opType, float timeInMs) {
|
||||
auto iter = mTimes.find(opType);
|
||||
if (iter == mTimes.end()) {
|
||||
mTimes[opType] = timeInMs;
|
||||
return;
|
||||
}
|
||||
iter->second += timeInMs;
|
||||
}
|
||||
void Executor::Profiler::addFlops(const std::string& opType, float flops) {
|
||||
auto iter = mFlops.find(opType);
|
||||
if (iter == mFlops.end()) {
|
||||
mFlops[opType] = flops;
|
||||
return;
|
||||
}
|
||||
iter->second += flops;
|
||||
}
|
||||
#endif
|
||||
|
||||
void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& config, int numberThread) {
|
||||
std::lock_guard<std::mutex> _l(mMutex);
|
||||
|
@ -648,36 +602,12 @@ void Executor::makeCache(const std::vector<EXPRP>& expr, bool forceCPU) {
|
|||
//FUNC_PRINT(mCaches.size());
|
||||
_makeCache(expr, forceCPU);
|
||||
}
|
||||
void Executor::addOpCostTime(int op, float costTime) {
|
||||
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||
auto opType = MNN::EnumNameOpType((OpType)op);
|
||||
if (nullptr == opType) {
|
||||
return;
|
||||
}
|
||||
mProfiler->add(opType, costTime);
|
||||
#endif
|
||||
}
|
||||
void Executor::addOpCostTime(const std::string& type, float costTime) {
|
||||
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||
mProfiler->add(type, costTime);
|
||||
#endif
|
||||
}
|
||||
void Executor::addOpFlops(const std::string& type, float flops) {
|
||||
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||
mProfiler->addFlops(type, flops);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void Executor::resetProfile() {
|
||||
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||
mProfiler->reset();
|
||||
#endif
|
||||
// Deprecated
|
||||
}
|
||||
void Executor::dumpProfile() {
|
||||
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||
mProfiler->dump();
|
||||
#endif
|
||||
// Deprecated
|
||||
}
|
||||
|
||||
bool Executor::registerSubGraph(const std::string& submoduleName, VARPS outputs, VARPS inputs) {
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "NMSModule.hpp"
|
||||
#include "Utils.hpp"
|
||||
#include "core/Backend.hpp"
|
||||
#include "core/WrapExecution.hpp"
|
||||
#include "utils/InitNet.hpp"
|
||||
#include "RuntimeAttr.hpp"
|
||||
#include "geometry/GeometryComputer.hpp"
|
||||
|
@ -490,7 +491,15 @@ static std::vector<SubModuleInfo> _createSubModuleInfo(std::shared_ptr<BufferSto
|
|||
return submodule;
|
||||
}
|
||||
|
||||
static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, const SubModuleInfo& info, const std::map<std::string, SubGraph>& subs, std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Module::Config& config, std::shared_ptr<Schedule::ScheduleInfo> sharedConst, bool needGeometry) {
|
||||
struct ModuleRuntimeConfig {
|
||||
bool needGeometry;
|
||||
RuntimeInfo rt;
|
||||
Backend::Info compute;
|
||||
const BackendConfig* userConfig = nullptr;
|
||||
Session::ModeGroup modes;
|
||||
};
|
||||
|
||||
static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, const SubModuleInfo& info, const std::map<std::string, SubGraph>& subs, std::shared_ptr<Schedule::ScheduleInfo> sharedConst, const Module::Config& config, const ModuleRuntimeConfig& runtimeConfig) {
|
||||
auto net = flatbuffers::GetRoot<Net>(bufferStorage->buffer());
|
||||
if (1 == info.opList.size()) {
|
||||
auto op = net->oplists()->GetAs<Op>(info.opList[0]);
|
||||
|
@ -506,9 +515,8 @@ static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, co
|
|||
// MNN_ASSERT(false);
|
||||
}
|
||||
Schedule::ScheduleInfo scheduleInfo;
|
||||
RuntimeInfo rt;
|
||||
Session::ModeGroup modes;
|
||||
scheduleInfo.defaultBackend = sharedConst->defaultBackend;
|
||||
scheduleInfo.constReplaceBackend = sharedConst->constReplaceBackend;
|
||||
scheduleInfo.allTensors = sharedConst->allTensors;
|
||||
initTensors(scheduleInfo.allTensors, net);
|
||||
std::vector<Schedule::OpCacheInfo> oplists;
|
||||
|
@ -522,34 +530,19 @@ static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, co
|
|||
if (breakIndex >= 0) {
|
||||
scheduleInfo.needInputContentForShape = true;
|
||||
}
|
||||
Backend::Info compute;
|
||||
const BackendConfig* userConfig = nullptr;
|
||||
if (nullptr == rtMgr) {
|
||||
rt = Executor::getRuntime();
|
||||
auto glo = ExecutorScope::Current();
|
||||
compute.type = glo->getAttr()->firstType.first;
|
||||
compute.numThread = glo->getAttr()->firstType.second;
|
||||
} else {
|
||||
modes = rtMgr->getInside()->modes;
|
||||
rt = rtMgr->getInside()->mRuntime;
|
||||
userConfig = &rtMgr->getInside()->mConfig;
|
||||
compute.type = rt.first.begin()->first;
|
||||
compute.numThread = 1;
|
||||
// set external file info
|
||||
if (!rtMgr->getInside()->mExternalFile.empty()) {
|
||||
rt.first.begin()->second->setExternalFile(rtMgr->getInside()->mExternalFile);
|
||||
rt.second->setExternalFile(rtMgr->getInside()->mExternalFile);
|
||||
}
|
||||
}
|
||||
auto rt = runtimeConfig.rt;
|
||||
auto modes = runtimeConfig.modes;
|
||||
Schedule::BackendCache bnCache;
|
||||
if (nullptr != userConfig) {
|
||||
bnCache.config = *userConfig;
|
||||
Backend::Info compute = runtimeConfig.compute;
|
||||
if (nullptr != runtimeConfig.userConfig) {
|
||||
bnCache.config = *runtimeConfig.userConfig;
|
||||
compute.user = &bnCache.config;
|
||||
} else {
|
||||
compute.user = nullptr;
|
||||
}
|
||||
bnCache.info = std::move(compute);
|
||||
bnCache.needComputeGeometry = needGeometry;
|
||||
bnCache.needComputeGeometry = runtimeConfig.needGeometry;
|
||||
|
||||
scheduleInfo.pipelineInfo.emplace_back(std::make_pair(std::move(bnCache), std::move(oplists)));
|
||||
|
||||
std::vector<std::shared_ptr<BufferStorage>> buffers = {bufferStorage};
|
||||
|
@ -588,13 +581,38 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
|
|||
// Extra Const Tensors
|
||||
sharedConst.reset(new Schedule::ScheduleInfo);
|
||||
auto curExe = ExecutorScope::Current();
|
||||
bool permitCodeGen = false;
|
||||
if (rtMgr && !rtMgr->getInside()->mExternalFile.empty()) {
|
||||
curExe->getRuntime().second->setExternalFile(rtMgr->getInside()->mExternalFile);
|
||||
permitCodeGen = rtMgr->getInside()->modes.codegenMode == Interpreter::Session_Codegen_Enable;
|
||||
}
|
||||
std::shared_ptr<Backend> defaultBackend = curExe->getAttr()->constantBackend;
|
||||
std::vector<std::shared_ptr<Tensor>> allTensors;
|
||||
sharedConst->allTensors.resize(net->tensorName()->size());
|
||||
sharedConst->defaultBackend = defaultBackend;
|
||||
std::shared_ptr<ModuleRuntimeConfig> modRuntimeCfgPtr(new ModuleRuntimeConfig);
|
||||
ModuleRuntimeConfig& modRuntime = *modRuntimeCfgPtr;
|
||||
modRuntime.needGeometry = needGeometry;
|
||||
if (nullptr == rtMgr) {
|
||||
modRuntime.rt = Executor::getRuntime();
|
||||
auto glo = ExecutorScope::Current();
|
||||
modRuntime.compute.type = glo->getAttr()->firstType.first;
|
||||
modRuntime.compute.numThread = glo->getAttr()->firstType.second;
|
||||
} else {
|
||||
modRuntime.modes = rtMgr->getInside()->modes;
|
||||
modRuntime.rt = rtMgr->getInside()->mRuntime;
|
||||
modRuntime.userConfig = &rtMgr->getInside()->mConfig;
|
||||
modRuntime.compute.type = modRuntime.rt.first.begin()->first;
|
||||
modRuntime.compute.numThread = 1;
|
||||
// set external file info
|
||||
if (!rtMgr->getInside()->mExternalFile.empty()) {
|
||||
modRuntime.rt.first.begin()->second->setExternalFile(rtMgr->getInside()->mExternalFile);
|
||||
modRuntime.rt.second->setExternalFile(rtMgr->getInside()->mExternalFile);
|
||||
}
|
||||
}
|
||||
auto& rt = modRuntime.rt;
|
||||
auto firstRt = rt.first[modRuntime.compute.type];
|
||||
sharedConst->constReplaceBackend.reset(firstRt->onCreate(modRuntime.userConfig));
|
||||
ErrorCode code = NO_ERROR;
|
||||
std::set<int> noneedComputeIndexes;
|
||||
initConstTensors(sharedConst->allTensors, net, defaultBackend.get(), code);
|
||||
|
@ -646,7 +664,7 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
|
|||
auto subModulesInfo = _createSubModuleInfo(bufferStorage, inputIndexes, outputIndexes, noneedComputeIndexes, sharedConst);
|
||||
std::vector<std::shared_ptr<Module>> subModules(subModulesInfo.size());
|
||||
for (int i=0; i<subModulesInfo.size(); ++i) {
|
||||
subModules[i].reset(_createSubModule(bufferStorage, subModulesInfo[i], subGraphMap, rtMgr, *config, sharedConst, needGeometry));
|
||||
subModules[i].reset(_createSubModule(bufferStorage, subModulesInfo[i], subGraphMap, sharedConst, *config, modRuntime));
|
||||
}
|
||||
auto result = new PipelineModule;
|
||||
result->mInputSize = inputs.size();
|
||||
|
@ -702,8 +720,45 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
|
|||
}
|
||||
result->registerModel(subModules);
|
||||
result->mSharedConst = sharedConst;
|
||||
if (!permitCodeGen) {
|
||||
// Prereplace const tensor
|
||||
auto curBackend = sharedConst->constReplaceBackend.get();
|
||||
if (sharedConst->constReplaceBackend->type() != sharedConst->defaultBackend->type()) {
|
||||
for (auto& t : sharedConst->allTensors) {
|
||||
if (nullptr == t.get()) {
|
||||
continue;
|
||||
}
|
||||
auto des = TensorUtils::getDescribe(t.get());
|
||||
if (des->isMutable) {
|
||||
continue;
|
||||
}
|
||||
if (!WrapExecution::needWrap(t.get(), curBackend)) {
|
||||
continue;
|
||||
}
|
||||
if (des->stageMask & Tensor::InsideDescribe::GEOMETRY_STAGE) {
|
||||
continue;
|
||||
}
|
||||
if (des->stageMask & Tensor::InsideDescribe::CONVERTED_STAGE) {
|
||||
continue;
|
||||
}
|
||||
std::shared_ptr<Tensor> wrapTensor = WrapExecution::makeCopyTensor(t.get(), curBackend);
|
||||
auto outDes = TensorUtils::getDescribe(wrapTensor.get());
|
||||
outDes->usage = des->usage;
|
||||
auto tempRes = curBackend->onAcquireBuffer(wrapTensor.get(), Backend::STATIC);
|
||||
if (!tempRes) {
|
||||
continue;
|
||||
}
|
||||
outDes->setBackend(curBackend);
|
||||
curBackend->onCopyBuffer(t.get(), wrapTensor.get());
|
||||
outDes->stageMask |= Tensor::InsideDescribe::CONVERTED_STAGE;
|
||||
TensorUtils::getDescribeOrigin(t.get())->mContent = TensorUtils::getDescribeOrigin(wrapTensor.get())->mContent;
|
||||
t->buffer().host = wrapTensor->buffer().host;
|
||||
t->buffer().device = wrapTensor->buffer().device;
|
||||
t->buffer().dim = TensorUtils::getDescribe(wrapTensor.get())->dims;
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
Module* PipelineModule::clone(CloneContext* ctx) const {
|
||||
|
|
|
@ -430,6 +430,8 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
|
|||
outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.second;
|
||||
} else if (backend == mResource->mSharedConst->defaultBackend.get()) {
|
||||
outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = mResource->mSharedConst->defaultBackend;
|
||||
} else if (backend == mResource->mSharedConst->constReplaceBackend.get()) {
|
||||
outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = mResource->mSharedConst->constReplaceBackend;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -195,6 +195,7 @@ public:
|
|||
MAX_TUNING_NUMBER = 0,
|
||||
// Strictly check the model file or not, default 1. If set to 0, the model file will not be checked for validity
|
||||
STRICT_CHECK_MODEL = 1,
|
||||
MEM_ALLOCATOR_TYPE = 2,
|
||||
};
|
||||
/**
|
||||
* @brief The API should be called before creating a session.
|
||||
|
|
|
@ -68,7 +68,7 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
|
|||
#define STR_IMP(x) #x
|
||||
#define STR(x) STR_IMP(x)
|
||||
#define MNN_VERSION_MAJOR 2
|
||||
#define MNN_VERSION_MINOR 6
|
||||
#define MNN_VERSION_PATCH 3
|
||||
#define MNN_VERSION_MINOR 7
|
||||
#define MNN_VERSION_PATCH 0
|
||||
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
|
||||
#endif /* MNNDefine_h */
|
||||
|
|
|
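The version string in the hunk above is produced by the usual two-step stringification trick. A self-contained illustration, using DEMO_* placeholders instead of the real MNN_VERSION_* macros:

```cpp
#include <cstdio>

// Two-level expansion: STR(DEMO_MAJOR) -> STR_IMP(2) -> "2", so the
// adjacent string literals concatenate into "major.minor.patch".
#define STR_IMP(x) #x
#define STR(x) STR_IMP(x)
#define DEMO_MAJOR 2
#define DEMO_MINOR 7
#define DEMO_PATCH 0
#define DEMO_VERSION STR(DEMO_MAJOR) "." STR(DEMO_MINOR) "." STR(DEMO_PATCH)

int main() {
    std::printf("%s\n", DEMO_VERSION); // prints 2.7.0
    return 0;
}
```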
@ -68,11 +68,6 @@ public:
|
|||
struct SubGraph;
|
||||
bool registerSubGraph(const std::string& submoduleName, VARPS outputs, VARPS inputs);
|
||||
std::shared_ptr<SubGraph> findSubGraph(const std::string& submoduleName);
|
||||
/**Internal Usage Begin*/
|
||||
void addOpCostTime(int op, float costTime);
|
||||
void addOpCostTime(const std::string& type, float costTime);
|
||||
void addOpFlops(const std::string& type, float flops);
|
||||
/**Internal Usage End*/
|
||||
static RuntimeInfo getRuntime();
|
||||
void setCallBack(TensorCallBackWithInfo&& before, TensorCallBackWithInfo&& after);
|
||||
const DebugTools* getDebugTools() const {
|
||||
|
|
|
@ -50,7 +50,7 @@ ErrorCode CastWrapExecution::onExecute(const std::vector<Tensor*>& inputs, const
|
|||
}
|
||||
|
||||
CPURuntime::CPURuntime(const Backend::Info& info) {
|
||||
mStaticAllocator.reset(new BufferAllocator(BufferAllocator::Allocator::createDefault()));
|
||||
mStaticAllocator.reset(new EagerBufferAllocator(BufferAllocator::Allocator::createDefault()));
|
||||
mThreadNumber = info.numThread;
|
||||
mThreadNumber = std::max(1, mThreadNumber);
|
||||
mThreadNumber = std::min(mThreadNumber, MAX_THREAD_NUMBER);
|
||||
|
@ -64,6 +64,7 @@ CPURuntime::CPURuntime(const Backend::Info& info) {
|
|||
mMemory = info.user->memory;
|
||||
mFlags = info.user->flags;
|
||||
}
|
||||
mAllocator = info.allocator;
|
||||
|
||||
#ifdef _OPENMP
|
||||
switch (mPower) {
|
||||
|
@ -218,7 +219,11 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
|
|||
mMemory = memory;
|
||||
mRuntime = const_cast<CPURuntime*>(runtime);
|
||||
std::shared_ptr<BufferAllocator::Allocator> defaultAlloc(BufferAllocator::Allocator::createRecurse(runtime->mStaticAllocator.get()));
|
||||
mDynamicAllocator.reset(new BufferAllocator(defaultAlloc));
|
||||
if (mRuntime->getAllocatorType() == Runtime::Allocator_Defer) {
|
||||
mDynamicAllocator.reset(new DeferBufferAllocator(defaultAlloc));
|
||||
} else {
|
||||
mDynamicAllocator.reset(new EagerBufferAllocator(defaultAlloc));
|
||||
}
|
||||
mStaticAllocator = runtime->mStaticAllocator;
|
||||
mPrecisionMode = precision;
|
||||
mCoreFunctions = MNNGetCoreFunctions();
|
||||
|
@ -238,24 +243,14 @@ void CPUBackend::onExecuteEnd() const {
|
|||
mRuntime->onConcurrencyEnd();
|
||||
}
|
||||
|
||||
class CPUMemObj : public Backend::MemObj {
|
||||
public:
|
||||
CPUMemObj(BufferAllocator* allocator, std::pair<void*, int> points, int size) {
|
||||
mPoint = std::move(points);
|
||||
mAllocator = allocator;
|
||||
mSize = size;
|
||||
void CPUBackend::onResizeBegin() {
|
||||
mDynamicAllocator->reset();
|
||||
}
|
||||
virtual ~ CPUMemObj() {
|
||||
mAllocator->free(mPoint);
|
||||
|
||||
void CPUBackend::onResizeEnd() {
|
||||
getCache()->release();
|
||||
mDynamicAllocator->compute();
|
||||
}
|
||||
inline int getSize() const {
|
||||
return mSize;
|
||||
}
|
||||
private:
|
||||
BufferAllocator* mAllocator;
|
||||
std::pair<void*, int> mPoint;
|
||||
int mSize;
|
||||
};
|
||||
|
||||
Backend::MemObj* CPUBackend::allocBuffer(int size, Tensor* dest, StorageType storageType) {
|
||||
auto originMem = TensorUtils::getDescribe(dest)->mem.get();
|
||||
|
@ -277,35 +272,41 @@ Backend::MemObj* CPUBackend::allocBuffer(int size, Tensor* dest, StorageType sto
|
|||
// }
|
||||
auto& buffer = dest->buffer();
|
||||
auto des = TensorUtils::getDescribe(dest);
|
||||
std::pair<void*, int> points;
|
||||
MemChunk chunk;
|
||||
switch (storageType) {
|
||||
case STATIC: {
|
||||
points = mStaticAllocator->alloc(size, false);
|
||||
chunk = mStaticAllocator->alloc(size, false);
|
||||
break;
|
||||
}
|
||||
case DYNAMIC: {
|
||||
points = mDynamicAllocator->alloc(size, false);
|
||||
chunk = mDynamicAllocator->alloc(size, false);
|
||||
break;
|
||||
}
|
||||
case DYNAMIC_SEPERATE: {
|
||||
points = mDynamicAllocator->alloc(size, true);
|
||||
chunk = mDynamicAllocator->alloc(size, true);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
MNN_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
if (nullptr == points.first) {
|
||||
|
||||
if (chunk.invalid()) {
|
||||
MNN_ERROR("Alloc buffer error for cpu backend\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Backend::MemObj* res = nullptr;
|
||||
|
||||
if (storageType == STATIC) {
|
||||
res = new CPUMemObj(mStaticAllocator.get(), points, size);
|
||||
res = new CPUMemObj(mStaticAllocator.get(), chunk, size);
|
||||
} else {
|
||||
res = new CPUMemObj(mDynamicAllocator.get(), points, size);
|
||||
res = new CPUMemObj(mDynamicAllocator.get(), chunk, size);
|
||||
chunk.attach(dest);
|
||||
}
|
||||
if (chunk.ptr()) {
|
||||
buffer.host = chunk.ptr();
|
||||
}
|
||||
buffer.host = (uint8_t*)points.first + points.second;
|
||||
des->extra.offset = 0;
|
||||
return res;
|
||||
}
|
||||
|
|
|
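The allocBuffer change above swaps the old (void*, offset) pair for a MemChunk handle that is checked with invalid(), dereferenced with ptr(), and attached to the destination tensor. As a rough mental model only (this is a toy, not MNN's MemChunk), such a chunk is a base pointer plus an offset into a pooled block:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Toy stand-in for a chunk handle: base pointer + offset, so an allocator can
// hand out sub-ranges of one pooled block and re-point them when it re-plans.
struct Chunk {
    uint8_t* base = nullptr;
    size_t offset = 0;
    bool invalid() const { return base == nullptr; }
    uint8_t* ptr() const { return base + offset; }
};

int main() {
    std::vector<uint8_t> pool(1024);   // backing storage
    Chunk a{pool.data(), 0};           // first allocation
    Chunk b{pool.data(), 256};         // second allocation at offset 256
    if (!a.invalid() && !b.invalid()) {
        std::printf("a=%p b=%p\n", (void*)a.ptr(), (void*)b.ptr());
    }
    return 0;
}
```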
@ -13,10 +13,10 @@
|
|||
#include <memory>
|
||||
#include "core/Backend.hpp"
|
||||
#include "core/Execution.hpp"
|
||||
#include "core/BufferAllocator.hpp"
|
||||
#include "MNN_generated.h"
|
||||
|
||||
namespace MNN {
|
||||
class BufferAllocator;
|
||||
class CPURuntime : public Runtime {
|
||||
public:
|
||||
friend class CPUBackend;
|
||||
|
@ -35,7 +35,7 @@ public:
|
|||
|
||||
|
||||
private:
|
||||
std::shared_ptr<BufferAllocator> mStaticAllocator;
|
||||
std::shared_ptr<EagerBufferAllocator> mStaticAllocator;
|
||||
int mThreadNumber;
|
||||
mutable int mTaskIndex;
|
||||
BackendConfig::MemoryMode mMemory;
|
||||
|
@ -47,11 +47,31 @@ private:
|
|||
float mFlops = 0.0f;
|
||||
static Backend*(*gExtraCreate)(const Runtime* runtime);
|
||||
size_t mFlags = 0;
|
||||
int mAllocator = 0;
|
||||
};
|
||||
struct CoreFunctions;
|
||||
struct CoreInt8Functions;
|
||||
|
||||
class CPUResizeCache;
|
||||
class CPUMemObj : public Backend::MemObj {
|
||||
public:
|
||||
CPUMemObj(BufferAllocator* allocator, MemChunk chunk, int size) : mAllocator(allocator), mChunk(chunk), mSize(size) {}
|
||||
virtual ~ CPUMemObj() {
|
||||
if (mAllocator) {
|
||||
mAllocator->free(mChunk);
|
||||
}
|
||||
}
|
||||
virtual MemChunk chunk() {
|
||||
return mChunk;
|
||||
}
|
||||
inline int getSize() const {
|
||||
return mSize;
|
||||
}
|
||||
private:
|
||||
BufferAllocator* mAllocator;
|
||||
MemChunk mChunk;
|
||||
int mSize;
|
||||
};
|
||||
class CPUBackend : public Backend {
|
||||
public:
|
||||
CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, BackendConfig::MemoryMode memory, MNNForwardType type = MNN_FORWARD_CPU, size_t flags = 0);
|
||||
|
@ -70,6 +90,9 @@ public:
|
|||
virtual void onExecuteBegin() const override;
|
||||
virtual void onExecuteEnd() const override;
|
||||
|
||||
virtual void onResizeBegin() override;
|
||||
virtual void onResizeEnd() override;
|
||||
|
||||
const CoreFunctions* functions() const {
|
||||
return mCoreFunctions;
|
||||
}
|
||||
|
@ -91,7 +114,7 @@ public:
|
|||
return mRuntime->mThreadNumber;
|
||||
}
|
||||
|
||||
BufferAllocator* getBufferAllocator() const {
|
||||
BufferAllocator* getBufferAllocator(bool defer_allocator = true) const {
|
||||
return mDynamicAllocator.get();
|
||||
}
|
||||
|
||||
|
@ -120,7 +143,7 @@ protected:
|
|||
const CoreFunctions* mCoreFunctions;
|
||||
const CoreInt8Functions* mInt8CoreFunctions;
|
||||
private:
|
||||
std::shared_ptr<BufferAllocator> mStaticAllocator;
|
||||
std::shared_ptr<EagerBufferAllocator> mStaticAllocator;
|
||||
std::shared_ptr<BufferAllocator> mDynamicAllocator;
|
||||
CPURuntime* mRuntime;
|
||||
BackendConfig::PrecisionMode mPrecisionMode;
|
||||
|
|
|
@ -208,9 +208,9 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
|
|||
}
|
||||
}
|
||||
};
|
||||
mExecutor = [=](const uint8_t* srcOrigin, uint8_t* dstOrigin, int tId) {
|
||||
auto biasP = inputs[2]->host<uint8_t>();
|
||||
auto weightP = inputs[1]->host<uint8_t>();
|
||||
mExecutor = [=](const uint8_t* srcOrigin, uint8_t* dstOrigin, int tId) {
|
||||
for (int index = tId; index < total; index += numberThread) {
|
||||
int dz = index / batch;
|
||||
auto dst_z = dstOrigin + dst_z_step * index * bytes;
|
||||
|
|
|
@ -241,6 +241,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
|
|||
CPUDeconvolutionBasic::onResize(inputs, outputs);
|
||||
auto core = static_cast<CPUBackend*>(backend())->functions();
|
||||
auto gcore = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
int bytes = core->bytes;
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
auto oc = output->channel();
|
||||
|
@ -270,6 +271,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
|
|||
mPostFunctions.clear();
|
||||
auto plane = width * height * batch;
|
||||
const int maxDepth = 5;
|
||||
auto allocator = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||
//int zeroPoint = 0;
|
||||
|
||||
auto biasPtr = inputs[2]->host<float>();
|
||||
|
@ -284,6 +286,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
|
|||
auto zeroPoint = outputQuant[1];
|
||||
|
||||
AutoRelease<Tensor> tempInput(Tensor::createDevice<float>({icC4, plane, core->pack}));
|
||||
bool needReleaseTempInput = true;
|
||||
int outi8 = 0;
|
||||
if (CPUBackend::getDataType(output) == DataType_DT_INT8 || output->getType().bytes() == 1) {
|
||||
outi8 = 1;
|
||||
|
@ -306,28 +309,28 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
|
|||
return OUT_OF_MEMORY;
|
||||
}
|
||||
mMatMul.reset(new StrassenMatrixComputor(backend(), true, maxDepth));
|
||||
tempInput->buffer().host = (uint8_t*)inputPtr;
|
||||
// tempInput->buffer().host = (uint8_t*)inputPtr;
|
||||
|
||||
needReleaseTempInput = false;
|
||||
TensorUtils::getDescribe(tempInput.get())->mem.reset(new CPUMemObj(nullptr, TensorUtils::getDescribe(input)->mem->chunk(), 0));
|
||||
mMatMul->onEncode({tempInput.get(), inputs[1]}, {mTempOutput.get()});
|
||||
}
|
||||
auto colBufferPtr = mTempOutput->host<uint8_t>();
|
||||
auto threadNumber = ((CPUBackend*)backend())->threadNumber();
|
||||
std::vector<float> scales(core->pack * src_height * src_width * batch, scale);
|
||||
|
||||
std::shared_ptr<Tensor> OutputFloat(Tensor::createDevice<float>({batch, src_height, src_width, ocC4 * core->pack}));
|
||||
auto res = backend()->onAcquireBuffer(OutputFloat.get(), Backend::DYNAMIC);
|
||||
if (!res) {
|
||||
auto outputFp32Ptr = allocator->alloc(batch * src_height * src_width * ocC4 * core->pack * bytes);
|
||||
if (outputFp32Ptr.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
auto outputFp32Ptr = OutputFloat->host<uint8_t>();
|
||||
|
||||
mPostFunctions.emplace_back(std::make_pair([colBufferPtr, ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY,
|
||||
mPostFunctions.emplace_back(std::make_pair([ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY,
|
||||
strideX, threadNumber, src_width, src_height, plane, biasPtr, this, core, gcore, batch, outi8, scales,
|
||||
minValue, maxValue, zeroPoint, outputFp32Ptr](uint8_t* outputPtr, int tId) {
|
||||
auto colBufferPtr = mTempOutput->host<uint8_t>();
|
||||
auto unitBytes = core->pack * core->bytes;
|
||||
auto tempOutPtr = outputPtr;
|
||||
auto float2Int8_step = src_height * src_width * batch;
|
||||
if (outi8) {
|
||||
tempOutPtr = outputFp32Ptr;
|
||||
tempOutPtr = outputFp32Ptr.ptr();
|
||||
}
|
||||
for (int z = (tId); z < ocC4; z += threadNumber) {
|
||||
auto dstZ = tempOutPtr + z * src_height * src_width * batch * unitBytes;
|
||||
|
@ -367,9 +370,18 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
|
|||
}
|
||||
}
|
||||
}, threadNumber));
|
||||
/*
|
||||
if (TensorUtils::getDescribe(tempInput.get())->mem->chunk().offset() != TensorUtils::getDescribe(input)->mem->chunk().offset()) {
|
||||
backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC);
|
||||
}
|
||||
if (tempInput->host<float>() != inputPtr) {
|
||||
backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC);
|
||||
}
|
||||
*/
|
||||
allocator->free(outputFp32Ptr);
|
||||
if (needReleaseTempInput) {
|
||||
backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC);
|
||||
}
|
||||
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
|
|
@ -7,51 +7,26 @@
|
|||
//
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include "backend/cpu/CPULayerNorm.hpp"
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "backend/cpu/compute/CommonOptFunction.h"
|
||||
#include "core/Execution.hpp"
|
||||
#include "core/Concurrency.h"
|
||||
#include "core/OpCommonUtils.hpp"
|
||||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "backend/cpu/compute/CommonOptFunction.h"
|
||||
#include "MNN_generated.h"
|
||||
|
||||
|
||||
namespace MNN {
|
||||
|
||||
class CPULayerNorm : public Execution {
|
||||
public:
|
||||
explicit CPULayerNorm(const MNN::Op* op, Backend* backend);
|
||||
virtual ~CPULayerNorm();
|
||||
|
||||
ErrorCode onExecute(const std::vector<Tensor*> &inputs, // NOLINT
|
||||
const std::vector<Tensor*> &outputs) override;
|
||||
|
||||
ErrorCode onResize(const std::vector<Tensor*> &inputs, // NOLINT
|
||||
const std::vector<Tensor*> &outputs) override;
|
||||
private:
|
||||
bool allocGammaBeta(int size);
|
||||
private:
|
||||
int axis_size = 0;
|
||||
int inner_size_ = 1;
|
||||
int outter_size_ = 1;
|
||||
int group_ = 1;
|
||||
float epsilon_ = 0.001;
|
||||
|
||||
std::unique_ptr<Tensor> gamma_;
|
||||
std::unique_ptr<Tensor> beta_;
|
||||
bool has_gamma_beta_ = false;
|
||||
};
|
||||
|
||||
bool CPULayerNorm::allocGammaBeta(int size) {
|
||||
has_gamma_beta_ = true;
|
||||
gamma_.reset(Tensor::createDevice<float>({size}));
|
||||
auto status = backend()->onAcquireBuffer(gamma_.get(), Backend::STATIC);
|
||||
mIniGammaBeta = true;
|
||||
mGamma.reset(Tensor::createDevice<float>({size}));
|
||||
auto status = backend()->onAcquireBuffer(mGamma.get(), Backend::STATIC);
|
||||
if (!status) {
|
||||
MNN_ERROR("Out of memory when gamma is acquired in CPULayerNorm.\n");
|
||||
return false;
|
||||
}
|
||||
beta_.reset(Tensor::createDevice<float>({size}));
|
||||
status = backend()->onAcquireBuffer(beta_.get(), Backend::STATIC);
|
||||
mBeta.reset(Tensor::createDevice<float>({size}));
|
||||
status = backend()->onAcquireBuffer(mBeta.get(), Backend::STATIC);
|
||||
if (!status) {
|
||||
MNN_ERROR("Out of memory when beta is acquired in CPULayerNorm.\n");
|
||||
return false;
|
||||
|
@ -59,17 +34,16 @@ bool CPULayerNorm::allocGammaBeta(int size) {
|
|||
return true;
|
||||
}
|
||||
|
||||
CPULayerNorm::CPULayerNorm(const MNN::Op* op, Backend* backend)
|
||||
: Execution(backend) {
|
||||
CPULayerNorm::CPULayerNorm(const MNN::Op* op, Backend* backend) : Execution(backend) {
|
||||
const auto* layer_norm_param = op->main_as_LayerNorm();
|
||||
axis_size = layer_norm_param->axis()->size();
|
||||
group_ = layer_norm_param->group();
|
||||
epsilon_ = layer_norm_param->epsilon();
|
||||
mAxis = layer_norm_param->axis()->size();
|
||||
mGroup = layer_norm_param->group();
|
||||
mEpsilon = layer_norm_param->epsilon();
|
||||
|
||||
if (USE_EXTERNAL_DATA(layer_norm_param)) {
|
||||
auto size = layer_norm_param->external()->Get(1);
|
||||
int32_t size = static_cast<int32_t>(layer_norm_param->external()->Get(1));
|
||||
allocGammaBeta(size);
|
||||
OpCommonUtils::loadExternalDatas(backend, {gamma_->host<char>(), beta_->host<char>()}, layer_norm_param->external()->data());
|
||||
OpCommonUtils::loadExternalDatas(backend, {mGamma->host<char>(), mBeta->host<char>()}, layer_norm_param->external()->data());
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -80,23 +54,44 @@ CPULayerNorm::CPULayerNorm(const MNN::Op* op, Backend* backend)
|
|||
}
|
||||
allocGammaBeta(size);
|
||||
const float* gamma_data = layer_norm_param->gamma()->data();
|
||||
memcpy(gamma_->host<float>(), gamma_data, size * sizeof(float));
|
||||
memcpy(mGamma->host<float>(), gamma_data, size * sizeof(float));
|
||||
const float* beta_data = layer_norm_param->beta()->data();
|
||||
memcpy(beta_->host<float>(), beta_data, size * sizeof(float));
|
||||
memcpy(mBeta->host<float>(), beta_data, size * sizeof(float));
|
||||
}
|
||||
}
|
||||
|
||||
ErrorCode CPULayerNorm::onExecute(const std::vector<Tensor*> &inputs,
|
||||
const std::vector<Tensor*> &outputs) {
|
||||
const float* gamma = has_gamma_beta_ ? gamma_->host<float>() : nullptr;
|
||||
const float* beta = has_gamma_beta_ ? beta_->host<float>() : nullptr;
|
||||
const float* gamma = mIniGammaBeta ? mGamma->host<float>() : nullptr;
|
||||
const float* beta = mIniGammaBeta ? mBeta->host<float>() : nullptr;
|
||||
|
||||
if (mInpZero.data()) {
|
||||
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||
|
||||
const int8_t* input = inputs[0]->host<int8_t>();
|
||||
int8_t* output = outputs[0]->host<int8_t>();
|
||||
MNN_CONCURRENCY_BEGIN(tId, mOutterSize) {
|
||||
QuanPrePostParameters params;
|
||||
params.maxValue = mMaxMinValue[0];
|
||||
params.minValue = mMaxMinValue[1];
|
||||
params.inputScale = mInpScale.data();
|
||||
params.outputScale = mOutScale.data();
|
||||
params.inputZeroPoint = mInpZero.data();
|
||||
params.outputZeroPoint = mOutZero.data();
|
||||
const int8_t* inner_input = input + tId * mInnerSize;
|
||||
int8_t* inner_output = output + tId * mInnerSize;
|
||||
core->MNNNormInt8(inner_output, inner_input, gamma, beta, mEpsilon, mInnerSize, ¶ms);
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
const float* input = inputs.at(0)->host<float>();
|
||||
float* output = outputs.at(0)->host<float>();
|
||||
MNN_CONCURRENCY_BEGIN(tId, outter_size_) {
|
||||
const float* inner_input = input + tId * inner_size_;
|
||||
float* inner_output = output + tId * inner_size_;
|
||||
MNNNorm(inner_output, inner_input, gamma, beta, epsilon_, inner_size_);
|
||||
MNN_CONCURRENCY_BEGIN(tId, mOutterSize) {
|
||||
const float* inner_input = input + tId * mInnerSize;
|
||||
float* inner_output = output + tId * mInnerSize;
|
||||
MNNNorm(inner_output, inner_input, gamma, beta, mEpsilon, mInnerSize);
|
||||
}
|
||||
MNN_CONCURRENCY_END();
|
||||
return NO_ERROR;
|
||||
|
@ -104,40 +99,53 @@ ErrorCode CPULayerNorm::onExecute(const std::vector<Tensor*> &inputs,
|
|||
|
||||
ErrorCode CPULayerNorm::onResize(const std::vector<Tensor*> &inputs,
|
||||
const std::vector<Tensor*> &outputs) {
|
||||
outter_size_ = 1;
|
||||
inner_size_ = 1;
|
||||
mOutterSize = 1;
|
||||
mInnerSize = 1;
|
||||
int rank = inputs.at(0)->dimensions();
|
||||
if (group_ > 1) {
|
||||
outter_size_ = inputs.at(0)->length(0) * group_;
|
||||
if (mGroup > 1) {
|
||||
mOutterSize = inputs.at(0)->length(0) * mGroup;
|
||||
for (int i = 1; i < rank; i++) {
|
||||
inner_size_ *= inputs.at(0)->length(i);
|
||||
mInnerSize *= inputs.at(0)->length(i);
|
||||
}
|
||||
inner_size_ /= group_;
|
||||
mInnerSize /= mGroup;
|
||||
return NO_ERROR;
|
||||
}
|
||||
for (int i = 0; i < rank - axis_size; ++i) {
|
||||
outter_size_ *= inputs.at(0)->length(i);
|
||||
for (int i = 0; i < rank - mAxis; ++i) {
|
||||
mOutterSize *= inputs.at(0)->length(i);
|
||||
}
|
||||
for (int i = rank - axis_size; i < rank; ++i) {
|
||||
inner_size_ *= inputs.at(0)->length(i);
|
||||
for (int i = rank - mAxis; i < rank; ++i) {
|
||||
mInnerSize *= inputs.at(0)->length(i);
|
||||
}
|
||||
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
|
||||
mInpZero.resize(1);
|
||||
mOutZero.resize(1);
|
||||
mInpScale.resize(1);
|
||||
mOutScale.resize(1);
|
||||
mMaxMinValue.resize(2);
|
||||
auto inpQuantAttr = TensorUtils::getDescribe(inputs[0])->quantAttr;
|
||||
auto outQuantAttr = TensorUtils::getDescribe(outputs[0])->quantAttr;
|
||||
mInpZero[0] = inpQuantAttr->zero;
|
||||
mOutZero[0] = outQuantAttr->zero;
|
||||
mInpScale[0] = inpQuantAttr->scale;
|
||||
mOutScale[0] = outQuantAttr->scale == 0.f? 0.f : 1.0f / outQuantAttr->scale;
|
||||
mMaxMinValue[0] = outQuantAttr->max;
|
||||
mMaxMinValue[1] = outQuantAttr->min;
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
CPULayerNorm::~CPULayerNorm() {
|
||||
if (gamma_.get()) {
|
||||
backend()->onReleaseBuffer(gamma_.get(), Backend::STATIC);
|
||||
if (mGamma.get()) {
|
||||
backend()->onReleaseBuffer(mGamma.get(), Backend::STATIC);
|
||||
}
|
||||
if (beta_.get()) {
|
||||
backend()->onReleaseBuffer(beta_.get(), Backend::STATIC);
|
||||
if (mBeta.get()) {
|
||||
backend()->onReleaseBuffer(mBeta.get(), Backend::STATIC);
|
||||
}
|
||||
}
|
||||
|
||||
class CPULayerNormCreator : public CPUBackend::Creator {
|
||||
public:
|
||||
Execution* onCreate(const std::vector<Tensor*>& inputs,
|
||||
const std::vector<Tensor*>& outputs,
|
||||
const MNN::Op* op, Backend* backend) const override {
|
||||
Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op, Backend* backend) const override {
|
||||
return new CPULayerNorm(op, backend);
|
||||
}
|
||||
};
|
||||
|
|
|
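In CPULayerNorm::onResize above, the shape is split at rank - axis: everything before the split multiplies into the number of independent rows (mOutterSize), and everything after multiplies into the per-row length that is normalized (mInnerSize); the group > 1 path instead uses batch * group for the outer size. A small standalone sketch of that split:

```cpp
#include <cstdio>
#include <vector>

// Sketch of the size split in CPULayerNorm::onResize: the last `axis`
// dimensions are normalized together (inner), everything before them is the
// number of independent rows (outer).
static void splitSizes(const std::vector<int>& shape, int axis, int& outer, int& inner) {
    outer = 1;
    inner = 1;
    int rank = (int)shape.size();
    for (int i = 0; i < rank - axis; ++i) outer *= shape[i];
    for (int i = rank - axis; i < rank; ++i) inner *= shape[i];
}

int main() {
    int outer = 0, inner = 0;
    splitSizes({2, 16, 768}, 1, outer, inner); // normalize over the last dimension
    std::printf("outer=%d inner=%d\n", outer, inner); // outer=32 inner=768
    return 0;
}
```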
@ -0,0 +1,41 @@
|
|||
//
|
||||
// CPULayerNorm.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/07/11
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef CPULayerNorm_hpp
|
||||
#define CPULayerNorm_hpp
|
||||
|
||||
#include "core/Execution.hpp"
|
||||
#include "core/Macro.h"
|
||||
namespace MNN {
|
||||
class CPULayerNorm : public Execution {
|
||||
public:
|
||||
explicit CPULayerNorm(const MNN::Op* op, Backend* backend);
|
||||
virtual ~CPULayerNorm();
|
||||
|
||||
ErrorCode onExecute(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
|
||||
ErrorCode onResize(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
|
||||
private:
|
||||
bool allocGammaBeta(int size);
|
||||
private:
|
||||
int mAxis = 0;
|
||||
int mInnerSize = 1;
|
||||
int mOutterSize = 1;
|
||||
int mGroup = 1;
|
||||
float mEpsilon = 0.001;
|
||||
std::unique_ptr<Tensor> mGamma;
|
||||
std::unique_ptr<Tensor> mBeta;
|
||||
bool mIniGammaBeta = false;
|
||||
// LayerNormInt8 parameters.
|
||||
std::vector<float> mInpScale;
|
||||
std::vector<float> mOutScale;
|
||||
std::vector<ssize_t> mInpZero;
|
||||
std::vector<ssize_t> mOutZero;
|
||||
std::vector<ssize_t> mMaxMinValue;
|
||||
};
|
||||
} // namespace MNN
|
||||
#endif /* CPULayerNorm_hpp */
|
|
@ -14,6 +14,7 @@
|
|||
#include "core/Macro.h"
|
||||
#include "core/Concurrency.h"
|
||||
#include "core/BufferAllocator.hpp"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "math/Vec.hpp"
|
||||
|
||||
|
||||
|
@ -94,40 +95,36 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
|
|||
auto ATPtrAlloc = bufferAlloc->alloc(UP_DIV(l, core->pack) * e * core->pack * core->bytes);
|
||||
auto BTPtrAlloc = bufferAlloc->alloc(UP_DIV(h, hP) * UP_DIV(l, lP) * lP * hP * core->bytes);
|
||||
auto CTPtrAlloc = bufferAlloc->alloc(UP_DIV(h, core->pack) * e * core->pack * core->bytes);
|
||||
if (nullptr == ATPtrAlloc.first || nullptr == BTPtrAlloc.first || nullptr == CTPtrAlloc.first) {
|
||||
if (ATPtrAlloc.invalid() || BTPtrAlloc.invalid() || CTPtrAlloc.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
auto BTPtr = (uint8_t*)BTPtrAlloc.first + BTPtrAlloc.second;
|
||||
auto ATPtr = (uint8_t*)ATPtrAlloc.first + ATPtrAlloc.second;
|
||||
auto CTPtr = (uint8_t*)CTPtrAlloc.first + CTPtrAlloc.second;
|
||||
|
||||
float* BTempPtr = (float*)BTPtr;
|
||||
int numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1;
|
||||
mPreFunctions.emplace_back(std::make_pair([BTempPtr, l, h, this, core] (int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
||||
core->MNNPackForMatMul_B(BTempPtr, BPtr, h, l, mTransposeB);
|
||||
mPreFunctions.emplace_back(std::make_pair([BTPtrAlloc, l, h, this, core] (int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
||||
core->MNNPackForMatMul_B((float*)BTPtrAlloc.ptr(), BPtr, h, l, mTransposeB);
|
||||
} , 1));
|
||||
if (mTransposeA) {
|
||||
// l, e -> lC4, e, 4
|
||||
mPreFunctions.emplace_back(std::make_pair([ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
||||
mPreFunctions.emplace_back(std::make_pair([ATPtrAlloc, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
||||
int offset[] = {
|
||||
e, e
|
||||
};
|
||||
core->MNNPackCUnit((float*)ATPtr, APtr, e, l, offset);
|
||||
core->MNNPackCUnit((float*)ATPtrAlloc.ptr(), APtr, e, l, offset);
|
||||
}, 1));
|
||||
} else {
|
||||
// e, l -> lC4, e, 4
|
||||
mPreFunctions.emplace_back(std::make_pair(
|
||||
[ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
||||
[ATPtrAlloc, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
||||
int offset[] = {
|
||||
e, e
|
||||
};
|
||||
core->MNNPackCUnitTranspose((float*)ATPtr, APtr, e, l, offset);
|
||||
core->MNNPackCUnitTranspose((float*)ATPtrAlloc.ptr(), APtr, e, l, offset);
|
||||
}, 1));
|
||||
}
|
||||
bool useBias = false;
|
||||
uint8_t* biasPtr = nullptr;
|
||||
std::vector<float> postParameters;
|
||||
std::pair<void*, int> bdestAlloc = std::make_pair(nullptr, 0);
|
||||
MemChunk bdestAlloc;
|
||||
bool bdestNeedFree = false;
|
||||
if (inputs.size() > 2) {
|
||||
auto bias = inputs[2];
|
||||
useBias = true;
|
||||
|
@ -136,19 +133,20 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
|
|||
mStrassenUseBiasDirectly = false;
|
||||
// Padding to align of 4
|
||||
bdestAlloc = bufferAlloc->alloc(UP_DIV(biasLength, core->pack) * core->pack * core->bytes);
|
||||
if (bdestAlloc.first == nullptr) {
|
||||
bdestNeedFree = true;
|
||||
if (bdestAlloc.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
auto bdest = (float*)((uint8_t*)bdestAlloc.first + bdestAlloc.second);
|
||||
mPreFunctions.emplace_back(std::make_pair(
|
||||
[biasLength, bdest, core](int tId, const float* APtr, const float* BPtr, const float* borigin) {
|
||||
::memset(bdest, 0, UP_DIV(biasLength, core->pack) * core->bytes * core->pack);
|
||||
::memcpy(bdest, borigin, biasLength * core->bytes);
|
||||
[biasLength, bdestAlloc, core](int tId, const float* APtr, const float* BPtr, const float* borigin) {
|
||||
::memset(bdestAlloc.ptr(), 0, UP_DIV(biasLength, core->pack) * core->bytes * core->pack);
|
||||
::memcpy(bdestAlloc.ptr(), borigin, biasLength * core->bytes);
|
||||
}, 1));
|
||||
biasPtr = (uint8_t*)bdest;
|
||||
} else {
|
||||
mStrassenUseBiasDirectly = true;
|
||||
biasPtr = bias->host<uint8_t>();
|
||||
if (TensorUtils::getDescribe(bias)->mem.get()) {
|
||||
bdestAlloc = TensorUtils::getDescribe(bias)->mem->chunk();
|
||||
}
|
||||
}
|
||||
postParameters = {
|
||||
1.0f,
|
||||
|
@ -157,29 +155,29 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
|
|||
std::numeric_limits<float>().max(),
|
||||
};
|
||||
}
|
||||
auto code = mComputer->onEncode(e, l, h, e * core->pack, UP_DIV(l, lP) * lP * hP, e * core->pack, ATPtr, BTPtr, CTPtr, useBias, biasPtr, postParameters);
|
||||
auto code = mComputer->onEncode(e, l, h, e * core->pack, UP_DIV(l, lP) * lP * hP, e * core->pack, ATPtrAlloc, BTPtrAlloc, CTPtrAlloc, useBias, bdestAlloc, postParameters);
|
||||
if (NO_ERROR != code) {
|
||||
return code;
|
||||
}
|
||||
if (bdestAlloc.first != nullptr) {
|
||||
if (bdestNeedFree) {
|
||||
bufferAlloc->free(bdestAlloc);
|
||||
}
|
||||
// hC4, e, 4 -> e, h
|
||||
if (mTransposeC) {
|
||||
mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, core](
|
||||
mPostFunctions.emplace_back(std::make_pair([CTPtrAlloc, e, h, core](
|
||||
int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) {
|
||||
int offset[] = {
|
||||
e, e
|
||||
};
|
||||
core->MNNUnpackCUnitTranspose(CPtr, (float*)CTPtr, e, h, offset);
|
||||
core->MNNUnpackCUnitTranspose(CPtr, (float*)CTPtrAlloc.ptr(), e, h, offset);
|
||||
}, 1));
|
||||
} else {
|
||||
mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, core](
|
||||
mPostFunctions.emplace_back(std::make_pair([CTPtrAlloc, e, h, core](
|
||||
int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) {
|
||||
int offset[] = {
|
||||
e, e
|
||||
};
|
||||
core->MNNUnpackCUnit(CPtr, (float*)CTPtr, e, h, offset);
|
||||
core->MNNUnpackCUnit(CPtr, (float*)CTPtrAlloc.ptr(), e, h, offset);
|
||||
}, 1));
|
||||
}
|
||||
bufferAlloc->free(ATPtrAlloc);
|
||||
|
|
|
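The packed buffer sizes in the CPUMatMul hunk above are all of the form UP_DIV(x, pack) * ... * bytes, i.e. x rounded up to a whole number of packs before multiplying out the element count. Assuming UP_DIV is the usual ceiling division (written here as a plain function for illustration, not MNN's macro):

```cpp
#include <cstdio>

// Ceiling division: round x up to a whole number of y-element packs.
static int upDiv(int x, int y) { return (x + y - 1) / y; }

int main() {
    int l = 70, e = 9, pack = 4, bytes = 4;
    // Size of the packed A buffer: UP_DIV(l, pack) * e * pack * bytes
    int aBytes = upDiv(l, pack) * e * pack * bytes;
    std::printf("packed A buffer: %d bytes\n", aBytes); // 18 * 9 * 4 * 4 = 2592
    return 0;
}
```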
@ -55,8 +55,6 @@ public:
|
|||
padWidth = padHeight = 0;
|
||||
}
|
||||
auto totalDepth = input->batch() * UP_DIV(input->channel(), core->pack);
|
||||
auto inputData = input->host<uint8_t>();
|
||||
auto outputData = output->host<uint8_t>();
|
||||
auto inputPlaneStride = core->pack * input->width() * input->height();
|
||||
auto outputPlaneStride = core->pack * output->width() * output->height();
|
||||
int threadNumber = ((CPUBackend *)backend())->threadNumber();
|
||||
|
@ -67,6 +65,8 @@ public:
|
|||
}
|
||||
mFunction = std::make_pair(threadNumber, [=](int tId) {
|
||||
for (int channel = (int)tId; channel < totalDepth; channel += threadNumber) {
|
||||
auto inputData = input->host<uint8_t>();
|
||||
auto outputData = output->host<uint8_t>();
|
||||
// run
|
||||
mCompute(inputData + channel * inputPlaneStride * mBytes, input->width(), input->height(),
|
||||
outputData + outputPlaneStride * channel * mBytes, output->width(), output->height(), kernelWidth,
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include "backend/cpu/CPUBackend.hpp"
|
||||
#include "core/Concurrency.h"
|
||||
#include "CPUTensorConvert.hpp"
|
||||
#include "core/TensorUtils.hpp"
|
||||
//#define MNN_OPEN_TIME_TRACE
|
||||
#include <MNN/AutoTime.hpp>
|
||||
namespace MNN {
|
||||
|
@ -101,26 +102,30 @@ static void pickBoxes(const std::vector<score_box_t> &boxes, std::vector<long> &
}

ErrorCode CPUProposal::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
// score transform space
auto &score = inputs[0];
memcpy(mScore.buffer().dim, score->buffer().dim, sizeof(halide_dimension_t) * score->buffer().dimensions);
backend()->onAcquireBuffer(&mScore, Backend::DYNAMIC);

auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
mScoreBuffer = bufferAlloc->alloc(TensorUtils::getRawSize(inputs[0]) * inputs[0]->getType().bytes());
if (mScoreBuffer.invalid()) {
return OUT_OF_MEMORY;
}
// release temp buffer space
backend()->onReleaseBuffer(&mScore, Backend::DYNAMIC);
bufferAlloc->free(mScoreBuffer);
return NO_ERROR;
}

auto &imInfo = inputs[2];
ErrorCode CPUProposal::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
// score transform space
auto score = inputs[0];
auto boxes = inputs[1];
auto imInfo = inputs[2];
auto featStride = mProposal->featStride();
auto preNmsTopN = mProposal->preNmsTopN();
auto nmsThreshold = mProposal->nmsThreshold();
auto afterNmsTopN = mProposal->afterNmsTopN();
auto minSize = mProposal->minSize();

auto boxes = inputs[1];

mRun = [=]() {
float* tmpScorePtr = (float*)mScoreBuffer.ptr();
// download
MNNUnpackC4Origin(mScore.host<float>(), score->host<float>(), score->width() * score->height(), score->channel(), score->width() * score->height());
MNNUnpackC4Origin(tmpScorePtr, score->host<float>(), score->width() * score->height(), score->channel(), score->width() * score->height());

auto scrWidth = score->width(), scrHeight = score->height(), scrSize = scrWidth * scrHeight;
auto boxWidth = boxes->width(), boxHeight = boxes->height(), boxSize = boxWidth * boxHeight;
@ -139,7 +144,7 @@ ErrorCode CPUProposal::onResize(const std::vector<Tensor *> &inputs, const std::
{
for (int ah = 0; ah < anchorHeight; ++ah) {
auto boxPtr = boxes->host<float>() + ah * 4 * boxSize;
auto scorePtr = mScore.host<float>() + (ah + anchorHeight) * scrSize;
auto scorePtr = tmpScorePtr + (ah + anchorHeight) * scrSize;

// shifted anchor
const auto anchor = mAnchors.get() + ah * anchorWidth;
@ -220,12 +225,6 @@ ErrorCode CPUProposal::onResize(const std::vector<Tensor *> &inputs, const std::
scoresPtr[0] = box_score(box);
}
}
};
return NO_ERROR;
}

ErrorCode CPUProposal::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
mRun();
return NO_ERROR;
}
@ -12,6 +12,7 @@
#include <functional>
#include "core/AutoStorage.h"
#include "core/Execution.hpp"
#include "core/BufferAllocator.hpp"
#include "MNN_generated.h"

namespace MNN {
@ -26,8 +27,7 @@ public:
private:
const Proposal *mProposal;
AutoStorage<float> mAnchors;
Tensor mScore;
std::function<void()> mRun;
MemChunk mScoreBuffer;
};

} // namespace MNN
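The CPUProposal change above is one instance of a pattern applied throughout this sync: temporary buffers move from ad-hoc Tensors or raw host pointers to pooled MemChunk handles from the backend's BufferAllocator, allocated (and immediately released back to the pool) in onResize and dereferenced with ptr() only in onExecute. A minimal self-contained toy of that pattern follows; ToyChunk and ToyAllocator are invented stand-ins for illustration only, and MNN's real MemChunk/BufferAllocator interfaces are richer than shown.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Toy stand-ins for MNN's MemChunk/BufferAllocator: alloc() during resize reserves a
// (base, offset) handle, free() immediately returns it to the pool so the planner can
// reuse the space, and ptr() resolves the concrete address only when the kernel runs.
class ToyChunk {
public:
    ToyChunk() = default;
    ToyChunk(std::vector<uint8_t>* base, size_t offset) : mBase(base), mOffset(offset) {}
    bool invalid() const { return mBase == nullptr; }
    uint8_t* ptr() const { return mBase->data() + mOffset; }
private:
    std::vector<uint8_t>* mBase = nullptr;
    size_t mOffset = 0;
};

class ToyAllocator {
public:
    ToyChunk alloc(size_t bytes) {
        size_t offset = mUsed;
        mUsed += bytes;
        if (mPool.size() < mUsed) {
            mPool.resize(mUsed);
        }
        return ToyChunk(&mPool, offset);
    }
    void free(const ToyChunk&) { /* a real allocator would recycle the byte range here */ }
private:
    std::vector<uint8_t> mPool;
    size_t mUsed = 0;
};

int main() {
    ToyAllocator allocator;
    ToyChunk scratch = allocator.alloc(256 * sizeof(float)); // "onResize": reserve scratch space
    if (scratch.invalid()) {
        return 1;
    }
    allocator.free(scratch);             // hand it back to the pool right away
    float* data = (float*)scratch.ptr(); // "onExecute": resolve the address lazily
    data[0] = 3.5f;
    std::cout << data[0] << std::endl;   // prints 3.5
    return 0;
}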
@ -68,7 +68,7 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
}
Tensor::InsideDescribe::Region newRegion;
OpCommonUtils::turnToPackRegion(slice, newRegion, output, core->pack, true);
mFastBlit.emplace_back(std::make_pair(slice.origin->host<void>(), std::move(newRegion)));
mFastBlit.emplace_back(std::make_pair(slice.origin, std::move(newRegion)));
}
return NO_ERROR;
}
@ -98,12 +98,12 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
auto origin = slice.origin;
if (nullptr == origin || nullptr == origin->host<void>()) {
if (nullptr == origin /*|| nullptr == origin->host<void>()*/) {
continue;
}
// if tensor is not NC4HW4 or has been merged, don't need deal
if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
mTempInputCopy.emplace_back(std::make_pair(origin->host<void>(), &slice));
mTempInputCopy.emplace_back(std::make_pair(origin, &slice));
continue;
}
// if NC4HW4's C%4 == 0, change convert to transpose and fuse it
@ -132,12 +132,13 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
bool merge = TensorUtils::fuseRegion(regionTmp, *newSlice);
if (merge) {
// cache the merged tensor
mTempInputCopy.emplace_back(std::make_pair(origin->host<void>(), newSlice.get()));
mTempInputCopy.emplace_back(std::make_pair(origin, newSlice.get()));
mCacheRegions.emplace_back(newSlice);
continue;
}
}
auto cache = static_cast<CPUBackend*>(backend())->getCache();
#if 1
auto tempTensor = cache->findCacheTensor(origin, midFormat);
//MNN_ASSERT(CPUBackend::getBytes(backend(), origin) == 4);
if (nullptr == tempTensor) {
@ -159,7 +160,23 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) {
forRelease.emplace_back(tempTensor);
}
mTempInputCopy.emplace_back(std::make_pair(tempTensor->host<void>(), &slice));
#else
std::shared_ptr<Tensor> newTensor(new Tensor);
TensorUtils::copyShape(origin, newTensor.get());
TensorUtils::getDescribe(newTensor.get())->dimensionFormat = midFormat;
TensorUtils::getDescribe(newTensor.get())->quantAttr = TensorUtils::getDescribe(origin)->quantAttr;
newTensor->buffer().type = origin->getType();
TensorUtils::setLinearLayout(newTensor.get());
mTempInput.insert(std::make_pair(origin, newTensor.get()));
auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
auto tempTensor = newTensor.get();
backend()->onReleaseBuffer(tempTensor, Backend::DYNAMIC);
cache->pushCacheTensor(newTensor, origin, midFormat);
#endif
mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice));
}
for (auto t : forRelease) {
backend()->onReleaseBuffer(t, Backend::DYNAMIC);
@ -175,7 +192,7 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
if (region->size[0] * region->size[1] * region->size[2] < thredHold) {
return NO_ERROR;
}
auto ptr = mTempInputCopy[0].first;
auto tensorPtr = mTempInputCopy[0].first;
int pos = -1;
for (int i=0; i<3; ++i) {
if (region->size[i] > 1) {
@ -212,7 +229,7 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
for (int v=pos+1; v<3; ++v) {
cacheReg.size[v] = region->size[v];
}
mTempInputCopy.emplace_back(std::make_pair(ptr, cacheRegPtr.get()));
mTempInputCopy.emplace_back(std::make_pair(tensorPtr, cacheRegPtr.get()));
mCacheRegions.emplace_back(cacheRegPtr);
}
}
@ -318,7 +335,7 @@ void CPURaster::executeFaster(const std::vector<Tensor *> &inputs, const std::ve
auto& iter = mFastBlit[u];
auto& slice = iter.second;
//Offset use byte
auto srcPtr = (uint8_t*)iter.first + slice.src.offset * bytes;
auto srcPtr = iter.first->host<uint8_t>() + slice.src.offset * bytes;
auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes;
if (slice.src.stride[1] == slice.size[2] && slice.dst.stride[1] == slice.size[2] && slice.src.stride[2] == 1) {
for (int z=0; z<slice.size[0]; ++z) {
@ -543,6 +560,11 @@ void CPURaster::tensorConvert(Tensor* input, Tensor* output, int bytes) {

ErrorCode CPURaster::onExecute(const std::vector<Tensor *> &____inputs, const std::vector<Tensor *> &outputs) {
if (nullptr != mTempOutput) {
mOutputPtr = mTempOutput->host<void>();
} else {
mOutputPtr = outputs[0]->host<void>();
}
if (mFast) {
executeFaster(____inputs, outputs);
return NO_ERROR;
@ -607,7 +629,7 @@ ErrorCode CPURaster::onExecute(const std::vector<Tensor *> &____inputs, const st
for (int u=tId; u<mTempInputCopy.size(); u+=threadNum) {
auto& iter = mTempInputCopy[u];
auto& slice = *(iter.second);
auto srcPtr = (uint8_t*)iter.first + slice.src.offset * bytes;
auto srcPtr = iter.first->host<uint8_t>() + slice.src.offset * bytes;
auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes;
_blit(slice, bytes, srcPtr, dstPtr, proc);
}
@ -752,13 +774,12 @@ public:
}
auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
if (mMaxCacheSize > 0 || mMaxFuseBufferSize > 0) {
auto buffer = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(threadNumber * (mMaxCacheSize + mMaxFuseBufferSize));
if (nullptr == buffer.first) {
mCacheBuffer = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(threadNumber * (mMaxCacheSize + mMaxFuseBufferSize));
if (mCacheBuffer.invalid()) {
return OUT_OF_MEMORY;
}
mCacheBuffer = (uint8_t*)buffer.first + buffer.second;
mFuseBuffer = mCacheBuffer + threadNumber * mMaxCacheSize;
static_cast<CPUBackend*>(backend())->getBufferAllocator()->free(buffer);
static_cast<CPUBackend*>(backend())->getBufferAllocator()->free(mCacheBuffer);
}
return NO_ERROR;
}
@ -887,7 +908,7 @@ public:
auto dstOrigin = (uint8_t*)mContainer[tId].stackPtr[cmd->indexes()->data()[0]];
auto dst = dstOrigin;
if (cmd->fuse() >= 0) {
dst = fuseBuffer;
dst = fuseBuffer.ptr();
}
do {
if (OpType_UnaryOp == op->type()) {
@ -921,7 +942,7 @@ public:
}
} else {
// Blit to cache
auto srcCache = mCacheBuffer + mMaxCacheSize * tId;
auto srcCache = mCacheBuffer.ptr() + mMaxCacheSize * tId;
for (int z=0; z<cmd->size()->data()[0]; ++z) {
auto srcZ = src + z * cmd->view()->GetAs<View>(1)->stride()->data()[0] * bytes;
auto dstZ = dst + z * outputStride[0] * bytes;
@ -978,7 +999,7 @@ public:
}
}
} else {
auto cache0 = mCacheBuffer + mMaxCacheSize * tId;
auto cache0 = mCacheBuffer.ptr() + mMaxCacheSize * tId;
auto cache1 = cache0 + cmd->size()->data()[2] * bytes;
for (int z=0; z<cmd->size()->data()[0]; ++z) {
auto src0Z = src0 + z * stride1[0] * bytes;
@ -1080,9 +1101,8 @@ private:
const LoopParam* mLoop;
std::vector<Tensor*> mStack;
std::vector<ThreadContainer> mContainer;
uint8_t* mCacheBuffer = nullptr;
MemChunk mCacheBuffer, mFuseBuffer;
int mMaxCacheSize = 0;
uint8_t* mFuseBuffer = nullptr;
int mMaxFuseBufferSize = 0;
};
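The loop executor above replaces two raw uint8_t* scratch pointers with a single pooled MemChunk and slices it per thread: the block holds threadNumber * (mMaxCacheSize + mMaxFuseBufferSize) bytes, and each thread takes its own cache and fuse regions by offset. A small self-contained sketch of that layout follows; the sizes, names and std::thread usage are illustrative only, not MNN code.

#include <cstdint>
#include <iostream>
#include <thread>
#include <vector>

// One pooled block, sliced into per-thread cache and fuse scratch regions by offset,
// mirroring mCacheBuffer.ptr() + mMaxCacheSize * tId in the loop executor above.
int main() {
    const int threadNumber = 4;
    const size_t cacheSize = 64, fuseSize = 32;          // stand-ins for mMaxCacheSize / mMaxFuseBufferSize
    std::vector<uint8_t> pool(threadNumber * (cacheSize + fuseSize));
    uint8_t* cacheBase = pool.data();
    uint8_t* fuseBase  = cacheBase + threadNumber * cacheSize;

    std::vector<std::thread> workers;
    for (int tId = 0; tId < threadNumber; ++tId) {
        workers.emplace_back([=]() {
            uint8_t* cache = cacheBase + cacheSize * tId; // this thread's cache slice
            uint8_t* fuse  = fuseBase + fuseSize * tId;   // this thread's fuse slice
            cache[0] = static_cast<uint8_t>(tId);
            fuse[0]  = static_cast<uint8_t>(tId);
        });
    }
    for (auto& w : workers) {
        w.join();
    }
    std::cout << int(pool[0]) << " " << int(pool[cacheSize]) << std::endl; // prints "0 1"
    return 0;
}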
@ -28,8 +28,8 @@ public:
void tensorConvert(Tensor* input, Tensor* output, int bytes);
private:
std::map<Tensor*, Tensor*> mTempInput;
std::vector<std::pair<void*, Tensor::InsideDescribe::Region*>> mTempInputCopy;
std::vector<std::pair<void*, Tensor::InsideDescribe::Region>> mFastBlit;
std::vector<std::pair<const Tensor*, Tensor::InsideDescribe::Region*>> mTempInputCopy;
std::vector<std::pair<const Tensor*, Tensor::InsideDescribe::Region>> mFastBlit;
std::shared_ptr<Tensor> mTempOutput;
void* mOutputPtr;
bool mNeedZero = false;
@ -1,4 +1,6 @@
#include "CPUResizeCache.hpp"
#include "../../core/TensorUtils.hpp"

namespace MNN {
Tensor* CPUResizeCache::findCacheTensor(const Tensor* src, MNN_DATA_FORMAT format) const {
auto iter = mFormatCache.find(std::make_pair(src, format));
@ -14,5 +16,9 @@ void CPUResizeCache::pushCacheTensor(std::shared_ptr<Tensor> dst, const Tensor*
void CPUResizeCache::reset() {
mFormatCache.clear();
}

void CPUResizeCache::release() {
for (auto iter : mFormatCache) {
TensorUtils::getDescribe(iter.second.get())->mem.reset(nullptr);
}
}
};
@ -19,6 +19,7 @@ public:
// Return cache tensor
void pushCacheTensor(std::shared_ptr<Tensor> dst, const Tensor* src, MNN_DATA_FORMAT format);
void reset();
void release();
private:
std::map<std::pair<const Tensor*, MNN_DATA_FORMAT>, std::shared_ptr<Tensor>> mFormatCache;
};
@ -647,7 +647,7 @@ L1Loop:
|
|||
ld1 {v4.8b}, [x1], #8 // src: k:6,7
|
||||
ld1 {v4.s}[2], [x1]
|
||||
|
||||
mov v9.4s, v16.4s
|
||||
mov v9.16b, v16.16b
|
||||
sxtl2 v6.8h, v4.16b
|
||||
|
||||
tbl v7.16b, {v2.16b, v3.16b}, v24.16b // src0
|
||||
|
|
|
@ -84,14 +84,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v1.8h
|
||||
scvtf v0.4s, v8.4s
|
||||
scvtf v1.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v15.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v15.16b
|
||||
fmla v8.4s, v0.4s, v12.4s
|
||||
fmla v9.4s, v1.4s, v13.4s
|
||||
scvtf v0.4s, v10.4s
|
||||
scvtf v1.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v15.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v15.16b
|
||||
fmla v10.4s, v0.4s, v12.4s
|
||||
fmla v11.4s, v1.4s, v13.4s
|
||||
ld1 {v0.4s, v1.4s}, [x15], x11
|
||||
|
@ -153,14 +153,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v1.8h
|
||||
scvtf v0.4s, v8.4s
|
||||
scvtf v1.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v15.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v15.16b
|
||||
fmla v8.4s, v0.4s, v12.4s
|
||||
fmla v9.4s, v1.4s, v13.4s
|
||||
scvtf v0.4s, v10.4s
|
||||
scvtf v1.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v15.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v15.16b
|
||||
fmla v10.4s, v0.4s, v12.4s
|
||||
fmla v11.4s, v1.4s, v13.4s
|
||||
ld1 {v0.4s, v1.4s}, [x15], x11
|
||||
|
@ -321,14 +321,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
@ -405,14 +405,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
@ -564,14 +564,14 @@ blt E1
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
|
||||
|
@ -616,14 +616,14 @@ blt E1
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
ld1 {v0.4s}, [x15], x11
|
||||
|
@ -721,7 +721,7 @@ blt E1
|
|||
mvni v9.4s, #6
|
||||
add v3.4s, v3.4s, v9.4s
|
||||
scvtf v3.4s, v3.4s
|
||||
mov v4.4s, v2.4s
|
||||
mov v4.16b, v2.16b
|
||||
fmla v4.4s, v3.4s, v1.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], x11
|
||||
|
@ -756,16 +756,16 @@ blt E1
|
|||
ld1 {v0.4s}, [x15], x11
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
ld1 {v1.4s}, [x15], x11
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
ld1 {v2.4s}, [x15], x11
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
ld1 {v3.4s}, [x15], x11
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
@ -810,7 +810,7 @@ blt E1
|
|||
mvni v9.4s, #6
|
||||
add v3.4s, v3.4s, v9.4s
|
||||
scvtf v3.4s, v3.4s
|
||||
mov v4.4s, v2.4s
|
||||
mov v4.16b, v2.16b
|
||||
fmla v4.4s, v3.4s, v1.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], x11
|
||||
|
@ -840,14 +840,14 @@ blt E1
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
// ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x15], x11
|
||||
|
@ -953,14 +953,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
ld1 {v0.s}[0], [x15], x11
|
||||
|
@ -989,14 +989,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
ld1 {v0.s}[0], [x15], x11
|
||||
|
@ -1059,14 +1059,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
@ -1102,14 +1102,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v15.4s
|
||||
mov v9.4s, v15.4s
|
||||
mov v8.16b, v15.16b
|
||||
mov v9.16b, v15.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v15.4s
|
||||
mov v11.4s, v15.4s
|
||||
mov v10.16b, v15.16b
|
||||
mov v11.16b, v15.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
|
|
@ -74,14 +74,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v1.8h
|
||||
scvtf v0.4s, v8.4s
|
||||
scvtf v1.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v15.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v15.16b
|
||||
fmla v8.4s, v0.4s, v12.4s
|
||||
fmla v9.4s, v1.4s, v13.4s
|
||||
scvtf v0.4s, v10.4s
|
||||
scvtf v1.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v15.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v15.16b
|
||||
fmla v10.4s, v0.4s, v12.4s
|
||||
fmla v11.4s, v1.4s, v13.4s
|
||||
ld1 {v0.4s, v1.4s}, [x15], x11
|
||||
|
@ -137,14 +137,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v1.8h
|
||||
scvtf v0.4s, v8.4s
|
||||
scvtf v1.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v15.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v15.16b
|
||||
fmla v8.4s, v0.4s, v12.4s
|
||||
fmla v9.4s, v1.4s, v13.4s
|
||||
scvtf v0.4s, v10.4s
|
||||
scvtf v1.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v15.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v15.16b
|
||||
fmla v10.4s, v0.4s, v12.4s
|
||||
fmla v11.4s, v1.4s, v13.4s
|
||||
ld1 {v0.4s, v1.4s}, [x15], x11
|
||||
|
@ -294,14 +294,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
@ -371,14 +371,14 @@ LoopE8:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
@ -520,14 +520,14 @@ blt E1
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
// st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0]
|
||||
|
@ -567,14 +567,14 @@ blt E1
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
ld1 {v0.4s}, [x15], x11
|
||||
|
@ -669,16 +669,16 @@ blt E1
|
|||
ld1 {v0.4s}, [x15], x11
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
ld1 {v1.4s}, [x15], x11
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
ld1 {v2.4s}, [x15], x11
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
ld1 {v3.4s}, [x15], x11
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
@ -717,14 +717,14 @@ blt E1
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
// ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x15], x11
|
||||
|
@ -819,14 +819,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
ld1 {v0.s}[0], [x15], x11
|
||||
|
@ -849,14 +849,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v26.4s
|
||||
mov v9.4s, v27.4s
|
||||
mov v8.16b, v26.16b
|
||||
mov v9.16b, v27.16b
|
||||
fmla v8.4s, v12.4s, v24.4s
|
||||
fmla v9.4s, v13.4s, v25.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v26.4s
|
||||
mov v11.4s, v27.4s
|
||||
mov v10.16b, v26.16b
|
||||
mov v11.16b, v27.16b
|
||||
fmla v10.4s, v12.4s, v24.4s
|
||||
fmla v11.4s, v13.4s, v25.4s
|
||||
ld1 {v0.s}[0], [x15], x11
|
||||
|
@ -909,14 +909,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v14.4s
|
||||
mov v9.4s, v14.4s
|
||||
mov v8.16b, v14.16b
|
||||
mov v9.16b, v14.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v14.4s
|
||||
mov v11.4s, v14.4s
|
||||
mov v10.16b, v14.16b
|
||||
mov v11.16b, v14.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
@ -944,14 +944,14 @@ LoopE1:
|
|||
sxtl2 v11.4s, v12.8h
|
||||
scvtf v12.4s, v8.4s
|
||||
scvtf v13.4s, v9.4s
|
||||
mov v8.4s, v15.4s
|
||||
mov v9.4s, v15.4s
|
||||
mov v8.16b, v15.16b
|
||||
mov v9.16b, v15.16b
|
||||
fmla v8.4s, v12.4s, v4.4s
|
||||
fmla v9.4s, v13.4s, v4.4s
|
||||
scvtf v12.4s, v10.4s
|
||||
scvtf v13.4s, v11.4s
|
||||
mov v10.4s, v15.4s
|
||||
mov v11.4s, v15.4s
|
||||
mov v10.16b, v15.16b
|
||||
mov v11.16b, v15.16b
|
||||
fmla v10.4s, v12.4s, v4.4s
|
||||
fmla v11.4s, v13.4s, v4.4s
|
||||
|
||||
|
|
|
@ -68,9 +68,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -108,9 +108,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -164,9 +164,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -204,9 +204,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -386,8 +386,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
|
||||
|
@ -428,8 +428,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
|
||||
|
@ -483,8 +483,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
ld1 {v0.4s, v1.4s, v2.4s}, [x15], #48
|
||||
|
@ -520,8 +520,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
|
||||
|
|
|
@ -59,9 +59,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -99,9 +99,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -145,9 +145,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -185,9 +185,9 @@ LoopH:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v0.4s, v1.4s
|
||||
scvtf v1.4s, v2.4s
|
||||
mov v2.4s, v7.4s
|
||||
mov v2.16b, v7.16b
|
||||
fmla v2.4s, v1.4s, v5.4s
|
||||
mov v1.4s, v6.4s
|
||||
mov v1.16b, v6.16b
|
||||
fmla v1.4s, v0.4s, v4.4s
|
||||
|
||||
ld1 {v0.4s}, [x15], #16
|
||||
|
@ -357,8 +357,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
|
||||
|
@ -399,8 +399,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
|
||||
|
@ -448,8 +448,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
ld1 {v0.4s, v1.4s, v2.4s}, [x15], #48
|
||||
|
@ -485,8 +485,8 @@ LoopHRemain:
|
|||
sxtl2 v2.4s, v0.8h
|
||||
scvtf v1.4s, v1.4s
|
||||
scvtf v2.4s, v2.4s
|
||||
mov v3.4s, v21.4s
|
||||
mov v4.4s, v21.4s
|
||||
mov v3.16b, v21.16b
|
||||
mov v4.16b, v21.16b
|
||||
fmla v3.4s, v1.4s, v20.4s
|
||||
fmla v4.4s, v2.4s, v20.4s
|
||||
|
||||
|
|
|
@ -187,7 +187,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT * mIm2ColCount, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
if (nullptr == mBlitInfo.first) {
if (mBlitInfo.invalid()) {
return OUT_OF_MEMORY;
}
bufferAlloc->free(mBlitInfo);
@ -236,7 +236,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
auto col_buffer_size = col_buffer_unit_size * mIm2ColCount;
auto threadFunction = [&](int tId) {
auto colAddr = im2colPtr + tId * mTempIm2ColBuffer->stride(0);
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);

int32_t info[4];
@ -31,7 +31,7 @@ protected:
std::shared_ptr<Tensor> mTempIm2ColBuffer;
std::shared_ptr<CPUConvolution::ResourceInt8> mResource;
CPUConvolution::MutableResourceInt8 mMutableResource;
std::pair<void*, int> mBlitInfo;
MemChunk mBlitInfo;
std::pair<size_t, size_t> mBlitInfoStride;
int mIm2ColCount;
};
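For reference, the srcPtr / el arithmetic above lays out, per thread, an array of source pointers immediately followed by int32 blit records inside one MemChunk; on my reading of the code, mBlitInfoStride.first is the per-thread byte stride and mBlitInfoStride.second the number of pointer slots. A self-contained toy of that layout, with made-up counts:

#include <cstdint>
#include <iostream>
#include <vector>

// Per-thread slab: [pointerCount source pointers][int32 records], located by offsets only.
int main() {
    const int threadNumber = 2;
    const size_t pointerCount = 8;                       // plays the role of mBlitInfoStride.second
    const size_t perThreadBytes =                        // plays the role of mBlitInfoStride.first
        pointerCount * sizeof(const int8_t*) + 4 * pointerCount * sizeof(int32_t);
    std::vector<uint8_t> blitInfo(threadNumber * perThreadBytes);

    for (int tId = 0; tId < threadNumber; ++tId) {
        auto srcPtr = (const int8_t**)(blitInfo.data() + tId * perThreadBytes);
        auto el = (int32_t*)(srcPtr + pointerCount);     // records start right after the pointers
        srcPtr[0] = nullptr;
        el[0] = 42 + tId;
    }
    auto el0 = (int32_t*)((const int8_t**)blitInfo.data() + pointerCount);
    std::cout << el0[0] << std::endl;                    // prints 42 (thread 0's first record)
    return 0;
}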
@ -193,8 +193,9 @@ ErrorCode ConvInt8Winograd::onResize(const std::vector<Tensor *> &inputs, const
}
for (auto& unit : mUnits) {
int sy = ALIMAX(unit.kyStart - mPadY, 0), sx = ALIMAX(unit.kxStart - mPadX, 0);
auto srcData = input->host<float>() + (sy * iw + sx) * UNIT;
unit.input.reset(Tensor::create<float>({batch, ic, ih - sy, iw - sx}, srcData, Tensor::CAFFE_C4));
auto srcChunk = TensorUtils::getDescribe(input)->mem->chunk() + (sy * iw + sx) * UNIT;
unit.input.reset(Tensor::createDevice<float>({batch, ic, ih - sy, iw - sx}, Tensor::CAFFE_C4));
TensorUtils::getDescribe(unit.input.get())->mem.reset(new CPUMemObj(nullptr, srcChunk, 0));
for (int i = 0; i < input->dimensions(); ++i) {
unit.input->setStride(i, input->stride(i));
}
@ -296,6 +297,7 @@ ErrorCode ConvInt8Winograd::onExecute(const std::vector<Tensor *> &inputs, const
core->MNNInt8ScaleToFloat(mInputFloat->host<float>(), inputs[0]->host<int8_t>(), scale.data(), size / UNIT, inputQuant[1]);
std::vector<Tensor*> tmp_outputs;
for (auto& unit : mUnits) {
unit.input->buffer().host = TensorUtils::getDescribe(unit.input.get())->mem->chunk().ptr();
auto ret = unit.runner->onExecute({unit.input.get()}, {unit.output.get()});
if (ret != NO_ERROR) {
return ret;
@ -14,6 +14,7 @@
#include "ConvOpt.h"
#include "core/Macro.h"
#include "CommonOptFunction.h"
#include "core/TensorUtils.hpp"

namespace MNN {
Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon *common, Backend *b, const float *originWeight,
@ -88,8 +89,9 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
auto matrixSizeE = output->height() * output->width() * input->batch();
auto outputPlane = output->height() * output->width();
mUnits.clear();
auto inputPtr = input->host<uint8_t>();
auto outputPtr = output->host<uint8_t>();
auto inputPtr = TensorUtils::getDescribe(input)->mem->chunk();
auto outputPtr = TensorUtils::getDescribe(output)->mem->chunk();

std::shared_ptr<char> __autoFunction;
auto padY = mPadY;
auto padX = mPadX;
@ -124,9 +126,9 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
int l = ic;
int h = oc;
auto aPtr = inputPtr + core->pack * planeStart * bytes;
auto bPtr = weightTensor->host<uint8_t>();
auto bPtr = TensorUtils::getDescribe(weightTensor)->mem->chunk();;
auto cPtr = outputPtr + core->pack * planeStart * bytes;
auto biasPtr = mResource->mBias->host<uint8_t>();
auto biasPtr = TensorUtils::getDescribe(mResource->mBias.get())->mem->chunk();
memoryPool->beginGroup();
auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters);
if (NO_ERROR != code) {
@ -168,9 +170,9 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
int l = ic;
int h = std::min(ocSize * core->pack, ocWeightSize * hPack);
auto aPtr = inputPtr;
auto bPtr = mResource->mWeight->host<uint8_t>() + hPack * icAlign * ocStartWeight * bytes;
auto bPtr = TensorUtils::getDescribe(mResource->mWeight.get())->mem->chunk() + hPack * icAlign * ocStartWeight * bytes;
auto cPtr = outputPtr + core->pack * matrixSizeE * ocStart * bytes;
auto biasPtr = mResource->mBias->host<uint8_t>() + core->pack * ocStart * bytes;
auto biasPtr = TensorUtils::getDescribe(mResource->mBias.get())->mem->chunk() + core->pack * ocStart * bytes;
memoryPool->beginGroup();
auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters);
if (NO_ERROR != code) {
@ -413,7 +413,6 @@ ErrorCode DeconvolutionWithStride::onResize(const std::vector<Tensor*>& inputs,
if (!res) {
return OUT_OF_MEMORY;
}
::memset(mSrcBuffer->host<float>(), 0, mSrcBuffer->size());
for (auto& unit : mComputeUnits) {
backend()->onReleaseBuffer(unit.dstBuffer.get(), Backend::DYNAMIC);
if (unit.winogradInfo.open) {
@ -469,6 +468,7 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector<Tensor*>& inputs,
auto srcOrigin = input->host<float>();
auto dstOrigin = output->host<float>();

::memset(mSrcBuffer->host<float>(), 0, mSrcBuffer->size());
::memset(dstOrigin, 0, ow * oh * ocDiv4 * 4 * batchSize * sizeof(float));
auto threadFunction = [&](int threadId) {
auto srcTotal = mSrcBuffer->host<float>() + threadId * mSrcBuffer->stride(0);
@ -440,10 +440,8 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
int LRoundupC4 = UP_DIV(LRoundup, unit);
|
||||
auto outputChannel = output->channel();
|
||||
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, output, mPadX, mPadY, core, nullptr);
|
||||
const float *biasPtr = nullptr;
|
||||
if (inputs.size() > 2) {
|
||||
bias = inputs[2];
|
||||
biasPtr = bias->host<float>();
|
||||
}
|
||||
auto kernelSize = mCommon->kernelX() * mCommon->kernelY();
|
||||
|
||||
|
@ -467,7 +465,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
|
||||
auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1;
|
||||
auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
if (nullptr == tempPtr.first) {
|
||||
if (tempPtr.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
|
||||
|
@ -483,10 +481,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
MNN_PRINT("dense conv: n:%d, ih:%d, iw:%d, ic:%d, oh:%d, ow:%d, oc:%d, kh:%d, kw:%d, plane:%d, threadNumberFirst:%d, tileCount:%d, ePack:%d, pack::%d, bytes:%d\n",
|
||||
batch, src_height, src_width, ic, height, width, outputChannel, kernel_width, kernel_height, plane, threadNumberFirst, tileCount, eP, unit, bytes);
|
||||
#endif
|
||||
|
||||
const float* biasPtr = bias ? bias->host<float>() : nullptr;
|
||||
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * 0;
|
||||
auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second +
|
||||
0 * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
auto srcPtr = (float const **)(tempPtr.ptr() + 0 * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
|
||||
auto weightPtr = weight->host<uint8_t>();
|
||||
|
||||
|
@ -614,10 +611,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
|||
batch, src_height, src_width, ic, height, width, outputChannel, kernel_width, kernel_height, plane, tileCount, eP, unit, bytes);
|
||||
}
|
||||
#endif
|
||||
|
||||
const float* biasPtr = bias ? bias->host<float>() : nullptr;
|
||||
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId;
|
||||
auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second +
|
||||
tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
auto srcPtr = (float const **)(tempPtr.ptr() + tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
|
||||
auto weightPtr = weight->host<float>();
|
||||
int32_t info[4];
|
||||
|
|
|
@ -91,7 +91,7 @@ ErrorCode GemmInt8Executor::onResize(const std::vector<Tensor *> &inputs, const
|
|||
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
|
||||
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
||||
if (nullptr == mBlitInfo.first) {
|
||||
if (mBlitInfo.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
bufferAlloc->free(mBlitInfo);
|
||||
|
@ -147,7 +147,7 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector<Tensor *> &inputs, const
|
|||
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
|
||||
info[2] = DST_XUNIT;
|
||||
info[3] = mIm2ColParamter.strideX;
|
||||
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
|
||||
auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
|
||||
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
||||
|
||||
for (int tIndex = tId; tIndex < mTileCnt; tIndex += mThreadNums) {
|
||||
|
|
|
@ -31,7 +31,7 @@ protected:
|
|||
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
|
||||
CPUConvolution::MutableResourceInt8 mMutableResource;
|
||||
decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel;
|
||||
std::pair<void*, int> mBlitInfo;
|
||||
MemChunk mBlitInfo;
|
||||
std::pair<size_t, size_t> mBlitInfoStride;
|
||||
};
|
||||
} // namespace MNN
|
||||
|
|
|
@ -130,7 +130,7 @@ ErrorCode IdstConvolutionInt8::onResize(const std::vector<Tensor*>& inputs, cons
|
|||
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, number);
|
||||
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
||||
if (nullptr == mBlitInfo.first) {
|
||||
if (mBlitInfo.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
bufferAlloc->free(mBlitInfo);
|
||||
|
@ -199,7 +199,7 @@ ErrorCode IdstConvolutionInt8::onExecute(const std::vector<Tensor*>& inputs, con
|
|||
auto outputOrigin = output->host<float>() + batchIndex * output->stride(0);
|
||||
auto threadFunction = [&](int tId) {
|
||||
auto colAddr = mTempBuffer.host<int8_t>() + tId * mTempBuffer.buffer().dim[0].stride;
|
||||
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
|
||||
auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
|
||||
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
||||
|
||||
int32_t info[4];
|
||||
|
|
|
@ -40,7 +40,7 @@ private:
|
|||
std::vector<float> mPostParameters;
|
||||
// mFakeBias used by GemmKernel
|
||||
std::shared_ptr<Tensor> mFakeBias;
|
||||
std::pair<void*, int> mBlitInfo;
|
||||
MemChunk mBlitInfo;
|
||||
std::pair<size_t, size_t> mBlitInfoStride;
|
||||
};
|
||||
} // namespace MNN
|
||||
|
|
|
@ -142,6 +142,55 @@ static void _MNNPackC4Int8ForMatMul_ASparse(int8_t* destOrigin, int8_t const** s
}
}

void MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params) {
#ifdef MNN_USE_SSE
uint8_t* srcPtr = (uint8_t*)src;
uint8_t* dstPtr = (uint8_t*)dst;
int offset = 128;
#else
const int8_t* srcPtr = src;
int8_t* dstPtr = dst;
int offset = 0;
#endif
int inpZero = static_cast<int>(params->inputZeroPoint[0]);
int outZero = static_cast<int>(params->outputZeroPoint[0]);
float inpScale = params->inputScale[0];
float outScale = params->outputScale[0];
float sum = 0.f;
int max_ = static_cast<int>(params->maxValue);
int min_ = static_cast<int>(params->minValue);
for (int j = 0; j < size; ++j) {
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
sum += fx;
}
float mean = sum / size;
float square_sum = 0.f;
for (int j = 0; j < size; ++j) {
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
square_sum += (fx - mean) * (fx - mean);
}
float variable = square_sum / size;
variable = 1.f / std::sqrt(variable + epsilon);

if (gamma && beta) {
for (int j = 0; j < size; ++j) {
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
float fy = (fx - mean) * variable * gamma[j] + beta[j];
int sy = fy * outScale + outZero;
sy = ALIMAX(min_, ALIMIN(sy, max_));
dstPtr[j] = sy + offset;
}
} else {
for (int j = 0; j < size; ++j) {
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
float fy = (fx - mean) * variable;
int sy = roundf(fy * outScale) + outZero;
sy = ALIMAX(min_, ALIMIN(sy, max_));
dstPtr[j] = sy + offset;
}
}
}

|
||||
|
||||
void MNNPackedSparseQuantMatMulEpx1(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap) {
|
||||
|
@ -2057,6 +2106,9 @@ void MNNCoreInt8FunctionInit() {
|
|||
gCoreFunc->MNNAvgPoolInt8 = MNNAvgPoolInt8;
|
||||
gCoreFunc->MNNMaxPoolInt8 = MNNMaxPoolInt8;
|
||||
|
||||
// Norm
|
||||
gCoreFunc->MNNNormInt8 = MNNNormInt8;
|
||||
|
||||
#if defined(__aarch64__)
|
||||
auto core = MNNGetCoreFunctions();
|
||||
if (core->supportSDot) {
|
||||
|
|
|
@ -68,6 +68,7 @@ void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast);
void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast);
void MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, ssize_t minValue, ssize_t maxValue, int8_t* inputZeroPoint, int8_t* outputZeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack = 4);
void MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);
#ifdef __cplusplus
}
#endif
@ -103,6 +104,8 @@ struct CoreInt8Functions {

void (*MNNAvgPoolInt8)(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx, ssize_t paddingx, ssize_t factor);

// Norm
void (*MNNNormInt8)(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);
};
void MNNCoreInt8FunctionInit();
CoreInt8Functions* MNNGetInt8CoreFunctions();
@ -144,7 +144,7 @@ ErrorCode SparseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inpu
|
|||
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
|
||||
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
||||
if (nullptr == mBlitInfo.first) {
|
||||
if (mBlitInfo.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
bufferAlloc->free(mBlitInfo);
|
||||
|
@ -193,7 +193,7 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inp
|
|||
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
|
||||
info[2] = (int)mSparseQuantParam.eP;
|
||||
info[3] = mIm2ColParamter.strideX;
|
||||
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
|
||||
auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
|
||||
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
||||
|
||||
for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) {
|
||||
|
|
|
@ -309,7 +309,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
|
|||
auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
|
||||
auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1;
|
||||
auto tempPtr = bufferAlloc->alloc(ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParameters.ow, mIm2ColParameters.kernelX * mIm2ColParameters.kernelY, threadNumber).first);
|
||||
if (nullptr == tempPtr.first) {
|
||||
if (tempPtr.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
|
||||
|
@ -320,8 +320,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
|
|||
|
||||
mFunction.second = [=](int tId) {
|
||||
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId;
|
||||
auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second +
|
||||
tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
auto srcPtr = (float const **)(tempPtr.ptr() + tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
|
||||
|
||||
int32_t info[4];
|
||||
|
|
|
@ -14,6 +14,7 @@
#include "core/AutoStorage.h"
#include "core/Macro.h"
#include "core/Concurrency.h"
#include "core/TensorUtils.hpp"
//#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp>
#include "math/Vec.hpp"
@ -28,15 +29,15 @@ public:
mAllocator = allocator;
}
~ AutoMemory() {
if (nullptr != mContent.first) {
if (!mContent.invalid()) {
mAllocator->free(mContent);
}
}
const std::pair<void*, int>& get() const {
const MemChunk& get() const {
return mContent;
}
private:
std::pair<void*, int> mContent;
MemChunk mContent;
BufferAllocator* mAllocator;
};
@ -62,15 +63,15 @@ ErrorCode StrassenMatrixComputor::_generateTrivalMatMul(int e, int l, int h, con
|
|||
auto bExtraStride = bStride - UP_DIV(l, lP)*lP*hP * core->bytes;
|
||||
MNN_ASSERT(bExtraStride >= 0);
|
||||
auto tileBufferBasic = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(numberThread * UP_DIV(l, lP) * eP * lP * bytes);
|
||||
if (nullptr == tileBufferBasic.first) {
|
||||
if (tileBufferBasic.invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
auto tileHostOrigin = (uint8_t*)tileBufferBasic.first + tileBufferBasic.second;
|
||||
|
||||
int unitNumber = e / eP;
|
||||
int xCount = e - unitNumber * eP;
|
||||
auto eReal = aStride / core->bytes / core->pack;
|
||||
mFunctions.emplace_back(
|
||||
std::make_pair([cStride, l, h, xCount, AT, BT, CT, COT, tileHostOrigin, unitNumber, bExtraStride, numberThread, eReal, eP, active, this](int tId) {
|
||||
std::make_pair([cStride, l, h, xCount, AT, BT, CT, COT, tileBufferBasic, unitNumber, bExtraStride, numberThread, eReal, eP, active, this](int tId) {
|
||||
auto core = static_cast<CPUBackend*>(backend())->functions();
|
||||
size_t parameters[6];
|
||||
parameters[0] = xCount * core->bytes;
|
||||
|
@ -79,17 +80,17 @@ ErrorCode StrassenMatrixComputor::_generateTrivalMatMul(int e, int l, int h, con
|
|||
parameters[3] = cStride;
|
||||
parameters[4] = 0;
|
||||
parameters[5] = bExtraStride;
|
||||
auto tileHost = tileHostOrigin + eP * parameters[1] * tId * core->bytes;
|
||||
auto tileHost = tileBufferBasic.ptr() + eP * parameters[1] * tId * core->bytes;
|
||||
const float* postParametersPtr = nullptr;
|
||||
if (!active.empty()) {
|
||||
postParametersPtr = active.data();
|
||||
}
|
||||
auto aHost = mStack[AT.stackIndex] + AT.offsetBytes;
|
||||
auto bHost = mStack[BT.stackIndex] + BT.offsetBytes;
|
||||
auto cHost = mStack[CT.stackIndex] + CT.offsetBytes;
|
||||
auto aHost = mStack[AT.stackIndex].ptr() + AT.offsetBytes;
|
||||
auto bHost = mStack[BT.stackIndex].ptr() + BT.offsetBytes;
|
||||
auto cHost = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
|
||||
const uint8_t* biasPtr = nullptr;
|
||||
if (-1 != COT.stackIndex) {
|
||||
biasPtr = mStack[COT.stackIndex] + COT.offsetBytes;
|
||||
biasPtr = mStack[COT.stackIndex].ptr() + COT.offsetBytes;
|
||||
}
|
||||
auto packUnit = core->bytes * core->pack;
|
||||
int32_t info[4];
|
||||
|
@ -166,7 +167,7 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
|
|||
CTemp.stackIndex = (int)mStack.size();
|
||||
CTemp.offsetBytes = 0;
|
||||
CTemp.lineStrideBytes = e * core->bytes * core->pack;
|
||||
mStack.emplace_back((uint8_t*)CAddr.get().first + CAddr.get().second);
|
||||
mStack.emplace_back(CAddr.get());
|
||||
|
||||
MatrixInfo Empty;
|
||||
Empty.stackIndex = -1;
|
||||
|
@ -197,8 +198,8 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
|
|||
}
|
||||
// Add CTemp to C
|
||||
auto f1 = [CT, CTemp, e, cHeight, numberThread, core, this](int tId) {
|
||||
auto c11Ptr = mStack[CT.stackIndex] + CT.offsetBytes;
|
||||
auto xAddr = mStack[CTemp.stackIndex] + CTemp.offsetBytes;
|
||||
auto c11Ptr = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
|
||||
auto xAddr = mStack[CTemp.stackIndex].ptr() + CTemp.offsetBytes;
|
||||
MNNMATRIX_ADD_MULTITHREAD(c11Ptr, c11Ptr, xAddr, e, CT.lineStrideBytes, CT.lineStrideBytes, CTemp.lineStrideBytes, cHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f1, numberThread));
|
||||
|
@ -206,10 +207,10 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
|
|||
if (!postParameters.empty() && COT.stackIndex >= 0) {
|
||||
if (1 == numberThread) {
|
||||
auto postFunction = [CT, COT, e, cHeight, numberThread, postParameters, core, this](int tId) {
|
||||
auto biasPtr = (const float*)(mStack[COT.stackIndex] + COT.offsetBytes);
|
||||
auto biasPtr = (const float*)(mStack[COT.stackIndex].ptr() + COT.offsetBytes);
|
||||
auto width = e;
|
||||
auto height = cHeight;
|
||||
auto c11Ptr = mStack[CT.stackIndex] + CT.offsetBytes;
|
||||
auto c11Ptr = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
|
||||
core->MNNAxByClampBroadcastUnit((float*)c11Ptr, (float*)c11Ptr, biasPtr, width, CT.lineStrideBytes / core->bytes, CT.lineStrideBytes / core->bytes, height, postParameters.data());
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(postFunction, 1));
|
||||
|
@ -217,8 +218,8 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
|
|||
auto postFunction = [CT, COT, e, cHeight, numberThread, postParameters, core, this](int tId) {
|
||||
auto width = e;
|
||||
auto height = cHeight;
|
||||
auto c11Ptr = mStack[CT.stackIndex] + CT.offsetBytes;
|
||||
auto biasPtr = mStack[COT.stackIndex] + COT.offsetBytes;
|
||||
auto c11Ptr = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
|
||||
auto biasPtr = mStack[COT.stackIndex].ptr() + COT.offsetBytes;
|
||||
for (int y = tId; y < height; y+=numberThread) {
|
||||
core->MNNAxByClampBroadcastUnit((float*)(c11Ptr + y * CT.lineStrideBytes), (float*)(c11Ptr + y * CT.lineStrideBytes), (const float*)(biasPtr + y * core->bytes * core->pack), width, 0, 0, 1, postParameters.data());
|
||||
}
|
||||
|
@ -278,19 +279,19 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
auto maxlH = std::max(lSub, hSub);
|
||||
AutoMemory YAddr(hSub * lSub * core->bytes, allocator);
|
||||
AutoMemory XAddr(maxlH * eSub * core->bytes, allocator);
|
||||
if (nullptr == XAddr.get().first || nullptr == YAddr.get().first) {
|
||||
if (XAddr.get().invalid() || YAddr.get().invalid()) {
|
||||
return OUT_OF_MEMORY;
|
||||
}
|
||||
MatrixInfo Y;
|
||||
Y.stackIndex = (int)mStack.size();
|
||||
mStack.emplace_back((uint8_t*)YAddr.get().first + YAddr.get().second);
|
||||
mStack.emplace_back(YAddr.get());
|
||||
Y.offsetBytes = 0;
|
||||
Y.lineStrideBytes = lSub * core->bytes * hP;
|
||||
MatrixInfo X;
|
||||
X.stackIndex = (int)mStack.size();
|
||||
X.offsetBytes = 0;
|
||||
X.lineStrideBytes = eSub * core->bytes * core->pack;
|
||||
mStack.emplace_back((uint8_t*)XAddr.get().first + XAddr.get().second);
|
||||
mStack.emplace_back(XAddr.get());
|
||||
|
||||
MatrixInfo CX;
|
||||
CX.stackIndex = X.stackIndex;
|
||||
|
@ -327,12 +328,12 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
{
|
||||
// S3=A11-A21, T3=B22-B12, P7=S3*T3
|
||||
auto f = [a11, a21, b22, b12, X, Y, eSub, lSub, hSub, numberThread, core, hP, this, bWidth, aHeight, bHeight](int tId) {
|
||||
auto xAddr = mStack[X.stackIndex] + X.offsetBytes;
|
||||
auto yAddr = mStack[Y.stackIndex] + Y.offsetBytes;
|
||||
auto a11Ptr = mStack[a11.stackIndex] + a11.offsetBytes;
|
||||
auto a21Ptr = mStack[a21.stackIndex] + a21.offsetBytes;
|
||||
auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
|
||||
auto yAddr = mStack[Y.stackIndex].ptr() + Y.offsetBytes;
|
||||
auto a11Ptr = mStack[a11.stackIndex].ptr() + a11.offsetBytes;
|
||||
auto a21Ptr = mStack[a21.stackIndex].ptr() + a21.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(xAddr, a11Ptr, a21Ptr, eSub, X.lineStrideBytes, a11.lineStrideBytes, a21.lineStrideBytes, aHeight, core);
|
||||
MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex] + b22.offsetBytes, mStack[b12.stackIndex] + b12.offsetBytes, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, b12.lineStrideBytes, bHeight, core);
|
||||
MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex].ptr() + b22.offsetBytes, mStack[b12.stackIndex].ptr() + b12.offsetBytes, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, b12.lineStrideBytes, bHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
||||
auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c21, Empty, currentDepth, {});
|
||||
|
@ -343,8 +344,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
{
|
||||
// S1=A21+A22, T1=B12-B11, P5=S1T1
|
||||
auto f = [a22, a21, b11, b12, X, Y, eSub, lSub, hSub, numberThread, hP, core, this, bWidth, aHeight, bHeight](int tId) {
|
||||
MNNMATRIX_ADD_MULTITHREAD(mStack[X.stackIndex] + X.offsetBytes, mStack[a21.stackIndex] + a21.offsetBytes, mStack[a22.stackIndex] + a22.offsetBytes , eSub, X.lineStrideBytes, a21.lineStrideBytes, a22.lineStrideBytes, aHeight, core);
|
||||
MNNMATRIX_SUB_MULTITHREAD(mStack[Y.stackIndex] + Y.offsetBytes, mStack[b12.stackIndex] + b12.offsetBytes, mStack[b11.stackIndex] + b11.offsetBytes, bWidth, Y.lineStrideBytes, b12.lineStrideBytes, b11.lineStrideBytes, bHeight, core);
|
||||
MNNMATRIX_ADD_MULTITHREAD(mStack[X.stackIndex].ptr() + X.offsetBytes, mStack[a21.stackIndex].ptr() + a21.offsetBytes, mStack[a22.stackIndex].ptr() + a22.offsetBytes , eSub, X.lineStrideBytes, a21.lineStrideBytes, a22.lineStrideBytes, aHeight, core);
|
||||
MNNMATRIX_SUB_MULTITHREAD(mStack[Y.stackIndex].ptr() + Y.offsetBytes, mStack[b12.stackIndex].ptr() + b12.offsetBytes, mStack[b11.stackIndex].ptr() + b11.offsetBytes, bWidth, Y.lineStrideBytes, b12.lineStrideBytes, b11.lineStrideBytes, bHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
||||
auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c22, Empty, currentDepth, {});
|
||||
|
@ -355,10 +356,10 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
{
|
||||
// S2=S1-A11, T2=B22-T1, P6=S2T2
|
||||
auto f = [a11, b22, X, Y, eSub, lSub, hSub, numberThread, hP, core, this, bWidth, aHeight, bHeight](int tId) {
|
||||
auto xAddr = mStack[X.stackIndex] + X.offsetBytes;
|
||||
auto yAddr = mStack[Y.stackIndex] + Y.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(xAddr, xAddr, mStack[a11.stackIndex] + a11.offsetBytes, eSub, X.lineStrideBytes, X.lineStrideBytes, a11.lineStrideBytes, aHeight, core);
|
||||
MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex] + b22.offsetBytes, yAddr, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, Y.lineStrideBytes, bHeight, core);
|
||||
auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
|
||||
auto yAddr = mStack[Y.stackIndex].ptr() + Y.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(xAddr, xAddr, mStack[a11.stackIndex].ptr() + a11.offsetBytes, eSub, X.lineStrideBytes, X.lineStrideBytes, a11.lineStrideBytes, aHeight, core);
|
||||
MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex].ptr() + b22.offsetBytes, yAddr, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, Y.lineStrideBytes, bHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
||||
auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c12, Empty, currentDepth, {});
|
||||
|
@ -369,8 +370,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
{
|
||||
// S4=A12-S2, P3=S4*B22, P1=A11*B11
|
||||
auto f = [a12, X, eSub, aHeight, numberThread, core, this](int tId) {
|
||||
auto xAddr = mStack[X.stackIndex] + X.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(xAddr, mStack[a12.stackIndex] + a12.offsetBytes, xAddr, eSub, X.lineStrideBytes, a12.lineStrideBytes, X.lineStrideBytes, aHeight, core);
|
||||
auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(xAddr, mStack[a12.stackIndex].ptr() + a12.offsetBytes, xAddr, eSub, X.lineStrideBytes, a12.lineStrideBytes, X.lineStrideBytes, aHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
||||
auto code = _generateMatMul(eSub, lSub, hSub, X, b22, c11, Empty, currentDepth, {});
|
||||
|
@ -387,10 +388,10 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
// U5=U4+P3, T4=T2-B21, P4=A22*T4
|
||||
auto f = [c11, c12, c21, c22, b21, X, Y, eSub, bWidth, cHeight, bHeight, numberThread, core, this](int tId) {
|
||||
for (int y = tId; y < cHeight; y+=numberThread) {
|
||||
core->MNNStrassenMergeCFunction((float*)(mStack[c11.stackIndex] + c11.offsetBytes + y * c11.lineStrideBytes), (float*)(mStack[c12.stackIndex] + c12.offsetBytes + y * c12.lineStrideBytes), (float*)(mStack[c21.stackIndex] + c21.offsetBytes + y * c21.lineStrideBytes), (float*)(mStack[c22.stackIndex] + c22.offsetBytes + y * c22.lineStrideBytes), (float*)(mStack[X.stackIndex] + X.offsetBytes + y * X.lineStrideBytes), 0, eSub, 1);
|
||||
core->MNNStrassenMergeCFunction((float*)(mStack[c11.stackIndex].ptr() + c11.offsetBytes + y * c11.lineStrideBytes), (float*)(mStack[c12.stackIndex].ptr() + c12.offsetBytes + y * c12.lineStrideBytes), (float*)(mStack[c21.stackIndex].ptr() + c21.offsetBytes + y * c21.lineStrideBytes), (float*)(mStack[c22.stackIndex].ptr() + c22.offsetBytes + y * c22.lineStrideBytes), (float*)(mStack[X.stackIndex].ptr() + X.offsetBytes + y * X.lineStrideBytes), 0, eSub, 1);
|
||||
}
|
||||
auto yAddr = mStack[Y.stackIndex] + Y.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(yAddr, yAddr, mStack[b21.stackIndex] + b21.offsetBytes, bWidth, Y.lineStrideBytes, Y.lineStrideBytes, b21.lineStrideBytes, bHeight, core);
|
||||
auto yAddr = mStack[Y.stackIndex].ptr() + Y.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(yAddr, yAddr, mStack[b21.stackIndex].ptr() + b21.offsetBytes, bWidth, Y.lineStrideBytes, Y.lineStrideBytes, b21.lineStrideBytes, bHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
||||
auto code = _generateMatMul(eSub, lSub, hSub, a22, Y, c11, Empty, currentDepth, {});
|
||||
|
@ -402,8 +403,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
// U6=U3-P4, P2=A12*B21, U1=P1+P2
|
||||
auto f0 = [c11, c21, eSub, cHeight, numberThread, core, this](int tId) {
|
||||
auto cw = eSub;
|
||||
auto c21Addr = mStack[c21.stackIndex] + c21.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(c21Addr, c21Addr, mStack[c11.stackIndex] + c11.offsetBytes, cw, c21.lineStrideBytes, c21.lineStrideBytes, c11.lineStrideBytes, cHeight, core);
|
||||
auto c21Addr = mStack[c21.stackIndex].ptr() + c21.offsetBytes;
|
||||
MNNMATRIX_SUB_MULTITHREAD(c21Addr, c21Addr, mStack[c11.stackIndex].ptr() + c11.offsetBytes, cw, c21.lineStrideBytes, c21.lineStrideBytes, c11.lineStrideBytes, cHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f0, numberThread));
|
||||
auto code = _generateMatMul(eSub, lSub, hSub, a12, b21, c11, Empty, currentDepth, {});
|
||||
|
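For readers tracing the S/T/P/U names in the comments above: these hunks implement the Winograd arrangement of Strassen's 2x2 block multiplication, which reaches the four output blocks with seven sub-products. A compact recap of the standard formulation (added here for orientation; it is the textbook scheme, not text taken from MNN's sources):

S1 = A21 + A22    T1 = B12 - B11    P1 = A11*B11    P5 = S1*T1
S2 = S1  - A11    T2 = B22 - T1     P2 = A12*B21    P6 = S2*T2
S3 = A11 - A21    T3 = B22 - B12    P3 = S4*B22     P7 = S3*T3
S4 = A12 - S2     T4 = T2  - B21    P4 = A22*T4
U1 = P1 + P2 (= C11)    U2 = P1 + P6    U3 = U2 + P7    U4 = U2 + P5
U5 = U4 + P3 (= C12)    U6 = U3 - P4 (= C21)    U7 = U3 + P5 (= C22)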
@ -412,18 +413,18 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
}
|
||||
auto f1 = [c11, X, eSub, cHeight, numberThread, core, this](int tId) {
|
||||
auto cw = eSub;
|
||||
auto c11Ptr = mStack[c11.stackIndex] + c11.offsetBytes;
|
||||
auto xAddr = mStack[X.stackIndex] + X.offsetBytes;
|
||||
auto c11Ptr = mStack[c11.stackIndex].ptr() + c11.offsetBytes;
|
||||
auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
|
||||
MNNMATRIX_ADD_MULTITHREAD(c11Ptr, c11Ptr, xAddr, cw, c11.lineStrideBytes, c11.lineStrideBytes, X.lineStrideBytes, cHeight, core);
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(f1, numberThread));
|
||||
if (!postParameters.empty() && COT.stackIndex >= 0) {
|
||||
if (1 == numberThread) {
|
||||
auto postFunction = [c11, COT, eSub, cHeight, numberThread, postParameters, core, this](int tId) {
|
||||
auto biasPtr = (const float*)(mStack[COT.stackIndex] + COT.offsetBytes);
|
||||
auto biasPtr = (const float*)(mStack[COT.stackIndex].ptr() + COT.offsetBytes);
|
||||
auto width = eSub * 2;
|
||||
auto height = cHeight * 2;
|
||||
auto c11Ptr = mStack[c11.stackIndex] + c11.offsetBytes;
|
||||
auto c11Ptr = mStack[c11.stackIndex].ptr() + c11.offsetBytes;
|
||||
core->MNNAxByClampBroadcastUnit((float*)c11Ptr, (float*)c11Ptr, biasPtr, width, c11.lineStrideBytes / core->bytes, c11.lineStrideBytes / core->bytes, height, postParameters.data());
|
||||
};
|
||||
mFunctions.emplace_back(std::make_pair(postFunction, numberThread));
|
||||
|
@ -431,8 +432,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
|||
auto postFunction = [c11, COT, eSub, cHeight, numberThread, postParameters, core, this](int tId) {
|
||||
auto width = eSub * 2;
|
||||
auto height = cHeight * 2;
|
||||
auto c11Ptr = mStack[c11.stackIndex] + c11.offsetBytes;
|
||||
auto biasPtr = mStack[COT.stackIndex] + COT.offsetBytes;
|
||||
auto c11Ptr = mStack[c11.stackIndex].ptr() + c11.offsetBytes;
|
||||
auto biasPtr = mStack[COT.stackIndex].ptr() + COT.offsetBytes;
|
||||
for (int y = tId; y < height; y+=numberThread) {
|
||||
core->MNNAxByClampBroadcastUnit((float*)(c11Ptr + y * c11.lineStrideBytes), (float*)(c11Ptr + y * c11.lineStrideBytes), (const float*)(biasPtr + y * core->bytes * core->pack), width, 0, 0, 1, postParameters.data());
|
||||
}
|
||||
|
@ -496,25 +497,25 @@ ErrorCode StrassenMatrixComputor::onEncode(const std::vector<Tensor*>& inputs, c
|
|||
core->MNNGetMatMulPackMode(&eP, &lP, &hP);
|
||||
int bs = UP_DIV(l, lP) * lP * hP;
|
||||
int cs = C->stride(0);
|
||||
uint8_t* bias = nullptr;
|
||||
MemChunk bias;
|
||||
bool useBias = false;
|
||||
if (inputs.size() > 2) {
|
||||
bias = inputs[2]->host<uint8_t>();
|
||||
bias = TensorUtils::getDescribe(inputs[2])->mem->chunk();
|
||||
useBias = true;
|
||||
}
|
||||
return onEncode(e, l, h, as, bs, cs, A->host<uint8_t>(), B->host<uint8_t>(), C->host<uint8_t>(), useBias, bias, postParameters);
|
||||
return onEncode(e, l, h, as, bs, cs, TensorUtils::getDescribe(A)->mem->chunk(), TensorUtils::getDescribe(B)->mem->chunk(), TensorUtils::getDescribe(C)->mem->chunk(), useBias, bias, postParameters);
|
||||
}
|
||||
|
||||
ErrorCode StrassenMatrixComputor::onEncode(int e, int l, int h, int as, int bs, int cs, const uint8_t* AT, const uint8_t* BT, uint8_t* CT, bool useBias, const uint8_t* Bias, const std::vector<float>& postParameters) {
|
||||
ErrorCode StrassenMatrixComputor::onEncode(int e, int l, int h, int as, int bs, int cs, const MemChunk AT, const MemChunk BT, MemChunk CT, bool useBias, const MemChunk Bias, const std::vector<float>& postParameters) {
|
||||
auto core = static_cast<CPUBackend*>(backend())->functions();
|
||||
MatrixInfo a,b,c,bias;
|
||||
bias.stackIndex = -1;
|
||||
mFunctions.clear();
|
||||
mStack = {(uint8_t*)AT, (uint8_t*)BT, CT};
|
||||
mStack = {AT, BT, CT};
|
||||
if (useBias) {
|
||||
bias.stackIndex = 3;
|
||||
bias.offsetBytes = 0;
|
||||
mStack.emplace_back((uint8_t*)Bias);
|
||||
mStack.emplace_back(Bias);
|
||||
}
|
||||
a.stackIndex = 0;
|
||||
a.lineStrideBytes = as * core->bytes;
|
||||
|
|
|
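Throughout this change set, raw uint8_t* and std::pair<void*, size_t> buffer handles give way to MemChunk, and addresses are read through ptr(). The struct below is only a rough approximation of the interface this diff implies (base pointer plus byte offset, with ptr() and invalid() helpers); it is not MNN's actual MemChunk definition.

#include <cstddef>
#include <cstdint>

// Approximation for illustration only; the field names mirror the .first and
// .second accesses that still appear elsewhere in the diff.
struct MemChunkSketch {
    void*  first  = nullptr;   // base of the allocation
    size_t second = 0;         // byte offset inside that allocation
    uint8_t* ptr() const { return static_cast<uint8_t*>(first) + second; }
    bool invalid() const { return first == nullptr; }
};

Under this reading, expressions like (uint8_t*)buffer.first + buffer.second collapse to buffer.ptr(), which is exactly the mechanical rewrite visible in the hunks above and below.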
@ -10,6 +10,7 @@
|
|||
#define StrassenMatmulComputor_hpp
|
||||
|
||||
#include <functional>
|
||||
#include "core/BufferAllocator.hpp"
|
||||
#include "core/Backend.hpp"
|
||||
namespace MNN {
|
||||
/**
|
||||
|
@ -53,7 +54,8 @@ public:
|
|||
*/
|
||||
ErrorCode onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const std::vector<float>& postParameters = {}, int l = 0, int h = 0);
|
||||
|
||||
ErrorCode onEncode(int e, int l, int h, int as, int bs, int cs, const uint8_t* AT, const uint8_t* BT, uint8_t* CT, bool useBias, const uint8_t* Bias = nullptr, const std::vector<float>& postParameters = {});
|
||||
ErrorCode onEncode(int e, int l, int h, int as, int bs, int cs, const MemChunk AT, const MemChunk BT, MemChunk CT, bool useBias, const MemChunk Bias = MemChunk(), const std::vector<float>& postParameters = {});
|
||||
// ErrorCode onEncode(int e, int l, int h, int as, int bs, int cs, const uint8_t* AT, const uint8_t* BT, uint8_t* CT, bool useBias, const uint8_t* Bias = nullptr, const std::vector<float>& postParameters = {});
|
||||
|
||||
void onExecute(const uint8_t* AT = nullptr, const uint8_t* BT = nullptr, const uint8_t* COT = nullptr, uint8_t* CT = nullptr);
|
||||
|
||||
|
@ -79,7 +81,7 @@ private:
|
|||
|
||||
Backend* mBackend;
|
||||
|
||||
std::vector<uint8_t*> mStack;
|
||||
std::vector<MemChunk> mStack;
|
||||
};
|
||||
} // namespace MNN
|
||||
|
||||
|
|
|
@ -124,6 +124,7 @@ void MNNInt8FunctionInit() {
|
|||
auto core = MNN::MNNGetInt8CoreFunctions();
|
||||
core->MNNAvgPoolInt8 = MNNAvgPoolUint8;
|
||||
core->MNNMaxPoolInt8 = MNNMaxPoolInt8_;
|
||||
core->MNNNormInt8 = _SSE_MNNNormInt8;
|
||||
if (cpuFlags & libyuv::kCpuHasSSE41) {
|
||||
core->MNNFloat2Int8 = _SSE_MNNFloat2Int8;
|
||||
core->MNNInt8ScaleToFloat = _SSE_MNNInt8ScaleToFloat;
|
||||
|
|
|
@ -75,6 +75,7 @@ void _AVX_WinogradInit(void* functions);
|
|||
|
||||
void _AVX_MNNGelu(float *dst, const float *src, size_t size, float* parameters);
|
||||
void _AVX_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size);
|
||||
void _AVX_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);
|
||||
|
||||
void _AVX_MNNGetSparseMatMulPackMode(int* eP, int *lP, int* hP);
|
||||
void _AVX_MNNPackedSparseMatMulEpx1EFMA(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, unsigned int* NNZMap, int* dataOffsetMap);
|
||||
|
|
|
@ -754,4 +754,7 @@ void _AVX_MNNInt8FunctionInit(void* functions) {
|
|||
|
||||
// conv depthwise
|
||||
gAVX2CoreInt8Functions->ConvDepthwiseLineInt8 = _AVX_MNNLineDepthWiseInt8AddBiasScaleUnit;
|
||||
|
||||
// Norm
|
||||
gAVX2CoreInt8Functions->MNNNormInt8 = _AVX_MNNNormInt8;
|
||||
}
|
||||
|
|
|
@ -202,7 +202,7 @@ void _AVX_MNNSoftmax(float* dest, const float* source, size_t size) {
|
|||
|
||||
void _AVX_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size) {
|
||||
float tmpfloat8[8];
|
||||
int count = size / 8;
|
||||
int count = static_cast<int32_t>(size / 8);
|
||||
int remain = count * 8;
|
||||
// step 1: get sum
|
||||
float sum = 0.f;
|
||||
|
@ -264,3 +264,78 @@ void _AVX_MNNNorm(float *dst, const float *src, const float *gamma, const float
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
void _AVX_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params) {
|
||||
float tmpfloat8[8];
|
||||
int count = static_cast<int32_t>(size / 8);
|
||||
int remain = count * 8;
|
||||
std::vector<float> inpf(size);
|
||||
std::vector<float> outf(size);
|
||||
std::vector<float> inpScale(4, params->inputScale[0]);
|
||||
std::vector<float> outScale(4, params->outputScale[0]);
|
||||
float* srcf = inpf.data();
|
||||
float* dstf = outf.data();
|
||||
// step 0: Int8 -> Float
|
||||
_AVX_MNNInt8ScaleToFloat(inpf.data(), src, inpScale.data(), size / 4, params->inputZeroPoint[0]);
|
||||
// step 1: get sum
|
||||
float sum = 0.f;
|
||||
if (count > 0) {
|
||||
auto sumVal = _mm256_set1_ps(0.f);
|
||||
for (int i = 0; i < count; i++) {
|
||||
sumVal = _mm256_add_ps(sumVal, _mm256_loadu_ps(srcf + i * 8));
|
||||
}
|
||||
_mm256_storeu_ps(tmpfloat8, sumVal);
|
||||
for (int i = 0; i < 8; i++) {
|
||||
sum += tmpfloat8[i];
|
||||
}
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
sum += srcf[i];
|
||||
}
|
||||
// step 2: get square_sum
|
||||
float mean = sum / size;
|
||||
float square_sum = 0.f;
|
||||
auto meanVal = _mm256_set1_ps(mean);
|
||||
if (count > 0) {
|
||||
auto sumVal = _mm256_set1_ps(0.f);
|
||||
for (int i = 0; i < count; i++) {
|
||||
auto x = _mm256_sub_ps(_mm256_loadu_ps(srcf + i * 8), meanVal);
|
||||
sumVal = _mm256_add_ps(sumVal, _mm256_mul_ps(x, x));
|
||||
}
|
||||
_mm256_storeu_ps(tmpfloat8, sumVal);
|
||||
for (int i = 0; i < 8; i++) {
|
||||
square_sum += tmpfloat8[i];
|
||||
}
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
float x = (srcf[i] - mean);
|
||||
square_sum += x * x;
|
||||
}
|
||||
// step 3: get result
|
||||
float variable = square_sum / size;
|
||||
variable = 1.f / sqrt(variable + epsilon);
|
||||
auto variableVal = _mm256_set1_ps(variable);
|
||||
if (gamma && beta) {
|
||||
for (int i = 0; i < count; i++) {
|
||||
auto x = _mm256_sub_ps(_mm256_loadu_ps(srcf + i * 8), meanVal);
|
||||
auto g = _mm256_loadu_ps(gamma + i * 8);
|
||||
auto b = _mm256_loadu_ps(beta + i * 8);
|
||||
auto y = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(x, g), variableVal), b);
|
||||
_mm256_storeu_ps(dstf + i * 8, y);
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
dstf[i] = (srcf[i] - mean) * gamma[i] * variable + beta[i];
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < count; i++) {
|
||||
auto x = _mm256_sub_ps(_mm256_loadu_ps(srcf + i * 8), meanVal);
|
||||
auto y = _mm256_mul_ps(x, variableVal);
|
||||
_mm256_storeu_ps(dstf + i * 8, y);
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
dstf[i] = (srcf[i] - mean) * variable;
|
||||
}
|
||||
}
|
||||
// step 4: Float -> Int8
|
||||
_AVX_MNNFloat2Int8(dstf, dst, size / 4, outScale.data(), params->minValue, params->maxValue, params->outputZeroPoint[0]);
|
||||
}
|
|
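As a plain scalar reference for what the vectorized _AVX_MNNNormInt8 above (and the SSE version further down) computes, the sketch below walks the same four steps: dequantize int8 to float, take the mean and variance over size values, normalize with optional gamma/beta, then requantize with clamping. The parameter names are illustrative rather than MNN's QuanPrePostParameters fields, and the multiplicative scale convention is an assumption chosen to match the role the inpScale/outScale vectors play in the helper calls.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Scalar sketch of the int8 layer-norm path; not MNN's API.
void normInt8Reference(int8_t* dst, const int8_t* src, const float* gamma,
                       const float* beta, float epsilon, size_t size,
                       float inScale, float outScale, int inZero, int outZero,
                       int minValue, int maxValue) {
    std::vector<float> x(size);
    for (size_t i = 0; i < size; ++i) {
        x[i] = (static_cast<int>(src[i]) - inZero) * inScale;    // step 0: int8 -> float
    }
    float mean = 0.f;
    for (float v : x) mean += v;
    mean /= static_cast<float>(size);                            // step 1: mean
    float var = 0.f;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= static_cast<float>(size);                             // step 2: variance
    const float inv = 1.f / std::sqrt(var + epsilon);
    for (size_t i = 0; i < size; ++i) {                          // steps 3 and 4
        float y = (x[i] - mean) * inv;
        if (gamma && beta) {
            y = y * gamma[i] + beta[i];
        }
        int q = static_cast<int>(std::round(y * outScale)) + outZero;
        dst[i] = static_cast<int8_t>(std::min(std::max(q, minValue), maxValue));
    }
}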
@ -79,6 +79,7 @@ void _SSE_MNNSoftmax(float* dest, const float* source, size_t size);
|
|||
void _SSE_ExtraInit(void* functions);
|
||||
void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size);
|
||||
void _SSE_ImageProcessInit(void* functions, int cpuFlags);
|
||||
void _SSE_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);
|
||||
|
||||
/* Image process functions */
|
||||
void _SSE_MNNRGBAToBGRA(const unsigned char* source, unsigned char* dest, size_t count);
|
||||
|
|
|
@ -58,7 +58,7 @@ void _SSE_MNNExpC8(float* dest, const float* source, const float* offset, const
|
|||
|
||||
void _SSE_MNNSoftmax(float* dest, const float* source, size_t size) {
|
||||
float tmpfloat4[4];
|
||||
int count = size / 4;
|
||||
int count = static_cast<int32_t>(size / 4);
|
||||
int remain = count * 4;
|
||||
// step 1: get maxValue
|
||||
float maxValue = source[0];
|
||||
|
@ -212,7 +212,7 @@ void _SSE_MNNHardSwish(float* dst, const float* src, size_t size) {
|
|||
|
||||
void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size) {
|
||||
float tmpfloat4[4];
|
||||
int count = size / 4;
|
||||
int count = static_cast<int32_t>(size / 4);
|
||||
int remain = count * 4;
|
||||
// step 1: get sum
|
||||
float sum = 0.f;
|
||||
|
@ -270,3 +270,74 @@ void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
void _SSE_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params) {
|
||||
float tmpfloat4[4];
|
||||
int count = static_cast<int32_t>(size / 4);
|
||||
int remain = count * 4;
|
||||
float sum = 0.f;
|
||||
std::vector<float> inpf(size);
|
||||
std::vector<float> outf(size);
|
||||
std::vector<float> inpScale(4, params->inputScale[0]);
|
||||
std::vector<float> outScale(4, params->outputScale[0]);
|
||||
float* srcf = inpf.data();
|
||||
float* dstf = outf.data();
|
||||
// step 0: Int8 -> Float
|
||||
_SSE_MNNInt8ScaleToFloat(inpf.data(), src, inpScale.data(), size / 4, params->inputZeroPoint[0]);
|
||||
// step 1: get sum
|
||||
if (count > 0) {
|
||||
auto sumVal = _mm_set1_ps(0.f);
|
||||
for (int i = 0; i < count; i++) {
|
||||
sumVal = _mm_add_ps(sumVal, _mm_loadu_ps(srcf + i * 4));
|
||||
}
|
||||
_mm_storeu_ps(tmpfloat4, sumVal);
|
||||
sum += (tmpfloat4[0] + tmpfloat4[1] + tmpfloat4[2] + tmpfloat4[3]);
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
sum += srcf[i];
|
||||
}
|
||||
// step 2: get square_sum
|
||||
float mean = sum / size;
|
||||
float square_sum = 0.f;
|
||||
auto meanVal = _mm_set1_ps(mean);
|
||||
if (count > 0) {
|
||||
auto sumVal = _mm_set1_ps(0.f);
|
||||
for (int i = 0; i < count; i++) {
|
||||
auto x = _mm_sub_ps(_mm_loadu_ps(srcf + i * 4), meanVal);
|
||||
sumVal = _mm_add_ps(sumVal, _mm_mul_ps(x, x));
|
||||
}
|
||||
_mm_storeu_ps(tmpfloat4, sumVal);
|
||||
square_sum += (tmpfloat4[0] + tmpfloat4[1] + tmpfloat4[2] + tmpfloat4[3]);
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
float x = (srcf[i] - mean);
|
||||
square_sum += x * x;
|
||||
}
|
||||
// step 3: get result
|
||||
float variable = square_sum / size;
|
||||
variable = 1.f / sqrt(variable + epsilon);
|
||||
auto variableVal = _mm_set1_ps(variable);
|
||||
if (gamma && beta) {
|
||||
for (int i = 0; i < count; i++) {
|
||||
auto x = _mm_sub_ps(_mm_loadu_ps(srcf + i * 4), meanVal);
|
||||
auto g = _mm_loadu_ps(gamma + i * 4);
|
||||
auto b = _mm_loadu_ps(beta + i * 4);
|
||||
auto y = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(x, g), variableVal), b);
|
||||
_mm_storeu_ps(dstf + i * 4, y);
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
dstf[i] = (srcf[i] - mean) * gamma[i] * variable + beta[i];
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < count; i++) {
|
||||
auto x = _mm_sub_ps(_mm_loadu_ps(srcf + i * 4), meanVal);
|
||||
auto y = _mm_mul_ps(x, variableVal);
|
||||
_mm_storeu_ps(dstf + i * 4, y);
|
||||
}
|
||||
for (int i = remain; i < size; i++) {
|
||||
dstf[i] = (srcf[i] - mean) * variable;
|
||||
}
|
||||
}
|
||||
// step 4: Float -> Int8
|
||||
_SSE_MNNFloat2Int8(dstf, dst, size / 4, outScale.data(), params->minValue, params->maxValue, params->outputZeroPoint[0]);
|
||||
}
|
||||
|
|
|
@ -37,10 +37,10 @@ public:
|
|||
// Do nothing
|
||||
}
|
||||
virtual ~ CUDARuntimeAllocator() = default;
|
||||
virtual std::pair<void*, size_t> onAlloc(size_t size, size_t align) override {
|
||||
return std::make_pair(mRuntime->alloc(size), 0);
|
||||
virtual MemChunk onAlloc(size_t size, size_t align) override {
|
||||
return MemChunk(mRuntime->alloc(size), 0);
|
||||
}
|
||||
virtual void onRelease(std::pair<void*, size_t> ptr) override {
|
||||
virtual void onRelease(MemChunk ptr) override {
|
||||
mRuntime->free(ptr.first);
|
||||
}
|
||||
private:
|
||||
|
@ -58,7 +58,7 @@ CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, B
|
|||
return;
|
||||
}
|
||||
std::shared_ptr<BufferAllocator::Allocator> allocator(new CUDARuntimeAllocator(mCUDARuntime.get()));
|
||||
mBufferPool.reset(new BufferAllocator(allocator));
|
||||
mBufferPool.reset(new EagerBufferAllocator(allocator));
|
||||
}
|
||||
mDefaultPrecision = precision;
|
||||
}
|
||||
|
@ -103,7 +103,7 @@ CUDABackend::CUDABackend(std::shared_ptr<BufferAllocator> st,
|
|||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("cuda backend create\n");
|
||||
#endif
|
||||
mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(st.get())));
|
||||
mBufferPool.reset(new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(st.get())));
|
||||
mStaticBufferPool = st;
|
||||
mCUDARuntime = rt;
|
||||
mUseFp16AsFp32 = (precision == 2);
|
||||
|
@ -139,16 +139,19 @@ int CUDABackend::getPrecision() const {
|
|||
|
||||
class CUDAMemObj : public Backend::MemObj {
|
||||
public:
|
||||
CUDAMemObj(BufferAllocator* allocator, std::pair<void*, int> points) {
|
||||
CUDAMemObj(BufferAllocator* allocator, MemChunk points) {
|
||||
mPoint = std::move(points);
|
||||
mAllocator = allocator;
|
||||
}
|
||||
virtual ~ CUDAMemObj() {
|
||||
mAllocator->free(mPoint);
|
||||
}
|
||||
MemChunk chunk() override {
|
||||
return mPoint;
|
||||
}
|
||||
private:
|
||||
BufferAllocator* mAllocator;
|
||||
std::pair<void*, int> mPoint;
|
||||
MemChunk mPoint;
|
||||
};
|
||||
int CUDABackend::getBytes(const Tensor* tensor) const {
|
||||
auto bytes = tensor->getType().bytes();
|
||||
|
@ -176,7 +179,7 @@ Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType
|
|||
auto bytes = getBytes(nativeTensor);
|
||||
size_t mallocSize = realSize(nativeTensor) * bytes;
|
||||
|
||||
std::pair<void*, int> buffer;
|
||||
MemChunk buffer;
|
||||
if (storageType == DYNAMIC_SEPERATE) {
|
||||
buffer = mBufferPool->alloc(mallocSize, true);
|
||||
allocator = mBufferPool.get();
|
||||
|
@ -191,7 +194,7 @@ Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType
|
|||
if(nullptr == buffer.first) {
|
||||
return nullptr;
|
||||
};
|
||||
auto host = (uint8_t*)buffer.first + buffer.second;
|
||||
auto host = buffer.ptr();
|
||||
((Tensor*)nativeTensor)->buffer().device = (uint64_t)host;
|
||||
auto des = TensorUtils::getDescribe(nativeTensor);
|
||||
des->extra.offset = buffer.second;
|
||||
|
@ -380,7 +383,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
|
|||
auto dstDevice = (dstTensor->deviceId() != 0 && dstTensor->deviceId() != 1);
|
||||
MNN_ASSERT(srcDevice || dstDevice);
|
||||
uint8_t* srcPtr = nullptr;
|
||||
std::pair<void*, int> tempSrcStorage;
|
||||
MemChunk tempSrcStorage;
|
||||
auto bytes = getBytes(srcTensor);
|
||||
auto type = srcTensor->getType();
|
||||
|
||||
|
@ -434,18 +437,18 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
|
|||
if (!srcDevice) {
|
||||
auto cpuSize = srcTensor->size();
|
||||
tempSrcStorage = mStaticBufferPool->alloc(cpuSize);
|
||||
srcPtr = (uint8_t*)tempSrcStorage.first + tempSrcStorage.second;
|
||||
srcPtr = tempSrcStorage.ptr();
|
||||
mCUDARuntime->memcpy(srcPtr, srcTensor->host<void>(), cpuSize, MNNMemcpyHostToDevice,
|
||||
true);
|
||||
} else {
|
||||
srcPtr = (uint8_t*)srcTensor->deviceId();
|
||||
}
|
||||
uint8_t* dstPtr = nullptr;
|
||||
std::pair<void*, int> tempDstStorage;
|
||||
MemChunk tempDstStorage;
|
||||
if (!dstDevice) {
|
||||
auto cpuSize = dstTensor->size();
|
||||
tempDstStorage = mStaticBufferPool->alloc(cpuSize);
|
||||
dstPtr = (uint8_t*)tempDstStorage.first + tempDstStorage.second;
|
||||
dstPtr = tempDstStorage.ptr();
|
||||
} else {
|
||||
dstPtr = (uint8_t*)dstTensor->deviceId();
|
||||
}
|
||||
|
@ -462,7 +465,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
|
|||
// MNN_PRINT("oncopybuffer dateType:%d->%d format:%d->%d\n", getDataType(srcTensor), getDataType(dstTensor), srcDimensionFormat, dstDimensionFormat);
|
||||
|
||||
std::unique_ptr<Tensor> wrapTensor;
|
||||
std::pair<void*, int> wrapSrcStorage;
|
||||
MemChunk wrapSrcStorage;
|
||||
if (getDataType(srcTensor) != getDataType(dstTensor)) {
|
||||
auto dimType = Tensor::CAFFE;
|
||||
switch (TensorUtils::getDescribe(srcTensor)->dimensionFormat) {
|
||||
|
@ -486,7 +489,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
|
|||
wrapTensor.reset(Tensor::createDevice(srcTensor->shape(), dstTensor->getType(), dimType));
|
||||
wrapSrcStorage = mStaticBufferPool->alloc(realSize(wrapTensor.get()) * getBytes(dstTensor));
|
||||
// MNN_PRINT("warp:%d %d %d %d\n", realSize(wrapTensor.get()), getBytes(dstTensor), dstTensor->getType(), srcTensor->getDimensionType());
|
||||
wrapTensor.get()->buffer().device = (uint64_t)((uint8_t*)wrapSrcStorage.first + wrapSrcStorage.second);
|
||||
wrapTensor.get()->buffer().device = (uint64_t)(wrapSrcStorage.ptr());
|
||||
|
||||
auto dstType = getDataType(dstTensor);
|
||||
if (dstType != DataType_DT_FLOAT) {
|
||||
|
|
|
@ -41,7 +41,7 @@ public:
|
|||
virtual float onGetMemoryInMB() override;
|
||||
|
||||
private:
|
||||
std::shared_ptr<BufferAllocator> mBufferPool;
|
||||
std::shared_ptr<EagerBufferAllocator> mBufferPool;
|
||||
std::shared_ptr<CUDARuntime> mCUDARuntime;
|
||||
bool mIsCreateError{false};
|
||||
BackendConfig::PrecisionMode mDefaultPrecision;
|
||||
|
|
|
@ -118,9 +118,9 @@ ErrorCode ArgMaxExecution::onResize(const std::vector<Tensor *> &inputs, const s
|
|||
if(mSplitKernel) {
|
||||
mSecondArgLen = (mDim + ARG_REDUCE_NUM - 1) / ARG_REDUCE_NUM;
|
||||
auto buffer_data = pool->alloc(mOutside * mInside * mSecondArgLen * bytes);
|
||||
mTempDataBuffer = (void*)((uint8_t*)buffer_data.first + buffer_data.second);
|
||||
mTempDataBuffer = (void*)(buffer_data.ptr());
|
||||
auto buffer_index = pool->alloc(mOutside * mInside * mSecondArgLen * sizeof(int32_t));
|
||||
mTempIndexBuffer = (void*)((uint8_t*)buffer_index.first + buffer_index.second);
|
||||
mTempIndexBuffer = (void*)(buffer_index.ptr());
|
||||
pool->free(buffer_data);
|
||||
pool->free(buffer_index);
|
||||
}
|
||||
|
|
|
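The alloc-then-free pairs above can look wrong at first glance. During onResize the chunk is freed immediately so the dynamic pool can plan reuse across operators, while the pointer recorded from it stays valid when the plan later executes. The toy below is only an assumed stand-in for that contract, not MNN's BufferAllocator: free marks the range reusable, the arena itself stays mapped.

#include <cstddef>
#include <cstdint>
#include <vector>

// Toy fixed-arena pool used purely to illustrate the alloc/record/free pattern.
struct ToyPool {
    explicit ToyPool(size_t capacity) : arena(capacity) {}
    uint8_t* alloc(size_t bytes) {
        if (used + bytes > arena.size()) return nullptr;   // toy out-of-memory
        uint8_t* p = arena.data() + used;
        used += bytes;
        return p;
    }
    void free(uint8_t*) { /* a real pool recycles the range for later allocs */ }
    std::vector<uint8_t> arena;
    size_t used = 0;
};

int main() {
    ToyPool pool(1 << 20);
    uint8_t* tempData  = pool.alloc(4096);   // scratch recorded at resize time
    uint8_t* tempIndex = pool.alloc(1024);
    pool.free(tempData);                     // released right away for reuse
    pool.free(tempIndex);
    // tempData and tempIndex still address valid storage while the plan runs
    return (tempData != nullptr && tempIndex != nullptr) ? 0 : 1;
}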
@ -45,7 +45,7 @@ public:
|
|||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
protected:
|
||||
std::pair<void*, int> mConstBuffer;
|
||||
MemChunk mConstBuffer;
|
||||
const Op *mOp;
|
||||
int mTotalCount;
|
||||
constBuffer parameters;
|
||||
|
|
|
@ -155,7 +155,7 @@ ErrorCode DeconvSingleInputExecution::onResize(const std::vector<Tensor*> &input
|
|||
|
||||
// Alloc temp cuda memory
|
||||
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
|
||||
std::pair<void*, size_t> buffer_input, buffer_im2col;
|
||||
MemChunk buffer_input, buffer_im2col;
|
||||
if(mFp16Fp32MixInfer) {
|
||||
buffer_input = pool->alloc(sizeof(__half) * mGemmInfo.elhPad[1] * mGemmInfo.elh[2]);
|
||||
mInputBuffer = (void*)((uint8_t*)buffer_input.first + buffer_input.second);
|
||||
|
|
|
@ -31,12 +31,23 @@ public:
|
|||
// Do nothing
|
||||
}
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
|
||||
mMaxFuseBufferSize = 0;
|
||||
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(outputs[0]);
|
||||
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
|
||||
if (1 == mLoop->commands()->size()) {
|
||||
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
|
||||
auto op = cmd->op();
|
||||
if (OpType_MatMul == op->type() && mLoop->parallel() && mLoop->loopNumber() > 1) {
|
||||
auto step = cmd->steps()->data();
|
||||
if (inputs.size() <= 3) {
|
||||
if (cmd->fuse() >= 0) {
|
||||
// Make Temp output buffer
|
||||
auto size = cmd->size()->data();
|
||||
mMaxFuseBufferSize = bytes * size[0] * size[2];
|
||||
auto buffer = pool->alloc(mMaxFuseBufferSize);
|
||||
mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);
|
||||
pool->free(buffer);
|
||||
}
|
||||
auto& unit = mExecutions[0];
|
||||
int as = 1, bs = 1, cs = 1;
|
||||
if (step[1] == 0) {
|
||||
|
@ -77,11 +88,28 @@ public:
|
|||
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
|
||||
auto op = cmd->op();
|
||||
if (OpType_UnaryOp == op->type() && nullptr == op->main()) {
|
||||
if (cmd->fuse() >= 0) {
|
||||
// Make Temp output buffer
|
||||
auto size = cmd->size()->data();
|
||||
mMaxFuseBufferSize = mLoop->loopNumber() * bytes * size[0] * size[1] * size[2];
|
||||
auto buffer = pool->alloc(mMaxFuseBufferSize);
|
||||
mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);
|
||||
pool->free(buffer);
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
}
|
||||
for (int i=0; i<mLoop->commands()->size(); ++i) {
|
||||
auto cmd = mLoop->commands()->GetAs<RegionCommand>(i);
|
||||
if (cmd->fuse() >= 0) {
|
||||
// Make Temp output buffer
|
||||
auto size = cmd->size()->data();
|
||||
if (cmd->op()->type() == OpType_MatMul) {
|
||||
mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[2]);
|
||||
} else {
|
||||
mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[1] * size[2]);
|
||||
}
|
||||
}
|
||||
auto op = cmd->op();
|
||||
auto& unit = mExecutions[i];
|
||||
// Find indice and copy to cpu
|
||||
|
@ -141,6 +169,11 @@ public:
|
|||
continue;
|
||||
}
|
||||
}
|
||||
if(mMaxFuseBufferSize > 0) {
|
||||
auto buffer = pool->alloc(mMaxFuseBufferSize);
|
||||
mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);
|
||||
pool->free(buffer);
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
|
@ -161,9 +194,7 @@ public:
|
|||
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
|
||||
auto op = cmd->op();
|
||||
|
||||
|
||||
|
||||
if (OpType_UnaryOp == op->type() && nullptr == op->main()) {
|
||||
if (OpType_UnaryOp == op->type() && nullptr == op->main() && cmd->fuse() < 0) {
|
||||
Tensor::InsideDescribe::Region reg;
|
||||
auto srcView = cmd->view()->GetAs<View>(1);
|
||||
auto dstView = cmd->view()->GetAs<View>(0);
|
||||
|
@ -187,14 +218,36 @@ public:
|
|||
if (index1 >= 0) {
|
||||
srcIndice = (int32_t*)originInputs[index1]->deviceId();
|
||||
}
|
||||
|
||||
auto src = (uint8_t*)(input->deviceId()) + srcView->offset() * bytes;
|
||||
auto dstOrigin = (output->deviceId()) + dstView->offset() * bytes;
|
||||
auto dst = dstOrigin;
|
||||
if(cmd->fuse() >= 0) {
|
||||
dst = (uint64_t)mFuseBuffer;
|
||||
}
|
||||
BlitWithIndice(
|
||||
(uint8_t*)(output->deviceId()) + dstView->offset() * bytes,
|
||||
(uint8_t*)(input->deviceId()) + srcView->offset() * bytes,
|
||||
(uint8_t*)dst,
|
||||
(uint8_t*)src,
|
||||
dstIndice, srcIndice, index0, index1,
|
||||
loopNumber, step0, step1, input->elementSize(),
|
||||
reg, bytes, runtime);
|
||||
|
||||
|
||||
if(cmd->fuse() >= 0) {
|
||||
auto opType = cmd->fuse();
|
||||
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
|
||||
auto srcStride0 = dstStride;
|
||||
auto srcStride1 = dstStride;
|
||||
int32_t tmpSize[3];
|
||||
::memcpy(tmpSize, cmd->size()->data(), 3 * sizeof(int32_t));
|
||||
tmpSize[0] *= loopNumber;
|
||||
auto type = halide_type_of<float>();
|
||||
if (static_cast<CUDABackend*>(backend())->useFp16()) {
|
||||
type.bits = 16;
|
||||
}
|
||||
// MNN_PRINT("Binary Loop in optype:%d\n", opType);
|
||||
BinaryBlit((uint8_t*)dstOrigin, (uint8_t*)dstOrigin, (const uint8_t*)dst,
|
||||
tmpSize, srcStride0, srcStride1, dstStride, type, runtime, opType);
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
}
|
||||
|
@ -220,12 +273,28 @@ public:
|
|||
offset = offset * cmd->steps()->data()[v] + view->offset();
|
||||
mStackPtr[tensorIndex] = tensor->deviceId() + offset * static_cast<CUDABackend*>(backend())->getBytes(tensor);
|
||||
}
|
||||
if (OpType_UnaryOp == op->type()) {
|
||||
auto src = (float*)mStackPtr[cmd->indexes()->data()[1]];
|
||||
auto dst = (float*)mStackPtr[cmd->indexes()->data()[0]];
|
||||
int unaryType = op->main_as_UnaryOp()->opType();
|
||||
auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data();
|
||||
|
||||
auto dstOrigin = mStackPtr[cmd->indexes()->data()[0]];
|
||||
auto dst = dstOrigin;
|
||||
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
|
||||
|
||||
int fuseOutputStride[3];
|
||||
if(cmd->fuse() >= 0) {
|
||||
dst = (uint64_t)mFuseBuffer;
|
||||
|
||||
dstStride = fuseOutputStride;
|
||||
auto cmdSize = cmd->size()->data();
|
||||
fuseOutputStride[0] = cmdSize[1] * cmdSize[2];
|
||||
fuseOutputStride[1] = cmdSize[2];
|
||||
fuseOutputStride[2] = 1;
|
||||
}
|
||||
|
||||
if (OpType_UnaryOp == op->type()) {
|
||||
|
||||
auto src = (float*)mStackPtr[cmd->indexes()->data()[1]];
|
||||
int unaryType = op->main_as_UnaryOp()->opType();
|
||||
|
||||
auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data();
|
||||
UnaryBlit((uint8_t*)dst, (const uint8_t*)src, cmd->size()->data(), srcStride, dstStride, bytes, runtime, unaryType);
|
||||
continue;
|
||||
}
|
||||
|
@ -234,13 +303,13 @@ public:
|
|||
if (3 == size) {
|
||||
unit.inputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[1]];
|
||||
unit.inputs[1]->buffer().device = mStackPtr[cmd->indexes()->data()[2]];
|
||||
unit.outputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[0]];
|
||||
unit.outputs[0]->buffer().device = dst;
|
||||
} else {
|
||||
MNN_ASSERT(4 == size);
|
||||
unit.inputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[1]];
|
||||
unit.inputs[1]->buffer().device = mStackPtr[cmd->indexes()->data()[2]];
|
||||
unit.inputs[2]->buffer().device = mStackPtr[cmd->indexes()->data()[3]];
|
||||
unit.outputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[0]];
|
||||
unit.outputs[0]->buffer().device = dst;
|
||||
}
|
||||
unit.exe->onExecute(unit.inputs, unit.outputs);
|
||||
continue;
|
||||
|
@ -252,16 +321,33 @@ public:
|
|||
}
|
||||
auto src0 = mStackPtr[cmd->indexes()->data()[1]];
|
||||
auto src1 = mStackPtr[cmd->indexes()->data()[2]];
|
||||
auto dst = mStackPtr[cmd->indexes()->data()[0]];
|
||||
auto opType = op->main_as_BinaryOp()->opType();
|
||||
auto srcStride0 = cmd->view()->GetAs<View>(1)->stride()->data();
|
||||
auto srcStride1 = cmd->view()->GetAs<View>(2)->stride()->data();
|
||||
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
|
||||
// MNN_PRINT("Binary Loop in optype:%d\n", opType);
|
||||
BinaryBlit((uint8_t*)dst, (const uint8_t*)src0, (const uint8_t*)src1,
|
||||
cmd->size()->data(), srcStride0, srcStride1, dstStride, type, runtime, opType);
|
||||
|
||||
}
|
||||
|
||||
|
||||
if(cmd->fuse() >= 0) {
|
||||
auto opType = cmd->fuse();
|
||||
auto dstOriginStride = cmd->view()->GetAs<View>(0)->stride()->data();
|
||||
auto type = halide_type_of<float>();
|
||||
if (static_cast<CUDABackend*>(backend())->useFp16()) {
|
||||
type.bits = 16;
|
||||
}
|
||||
// MNN_PRINT("Binary Loop in optype:%d\n", opType);
|
||||
int32_t cmdSize[3];
|
||||
::memcpy(cmdSize, cmd->size()->data(), 3*sizeof(int32_t));
|
||||
if(OpType_MatMul == op->type()) {
|
||||
cmdSize[1] = 1;
|
||||
dstStride = dstOriginStride;
|
||||
}
|
||||
BinaryBlit((uint8_t*)dstOrigin, (uint8_t*)dstOrigin, (const uint8_t*)dst,
|
||||
cmdSize, dstOriginStride, dstStride, dstOriginStride, type, runtime, opType);
|
||||
}
|
||||
}
|
||||
}
|
||||
return NO_ERROR;
|
||||
|
@ -274,6 +360,8 @@ private:
|
|||
std::vector<uint64_t> mStackPtr;
|
||||
std::map<Tensor*, Tensor*> mIndiceCopy;
|
||||
bool mSingleMatMul = false;
|
||||
int mMaxFuseBufferSize;
|
||||
void* mFuseBuffer;
|
||||
};
|
||||
|
||||
class LoopCreator : public CUDABackend::Creator {
|
||||
|
@ -283,6 +371,13 @@ public:
|
|||
if (op->main_type() != OpParameter_LoopParam) {
|
||||
return nullptr;
|
||||
}
|
||||
auto mLoop = op->main_as_LoopParam();
|
||||
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
|
||||
|
||||
if(cmd->fuse() >= 0) {
|
||||
// TODO: support afterwards
|
||||
return nullptr;
|
||||
}
|
||||
return new CUDALoop(backend, op->main_as_LoopParam());
|
||||
}
|
||||
};
|
||||
|
|
|
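The fuse handling added above follows one pattern: when cmd->fuse() >= 0 the main op writes into a temporary buffer, and a binary op (BinaryBlit in these hunks) then folds that buffer into the real destination. A minimal sketch of that control flow, with illustrative names rather than MNN's API:

#include <cstddef>

// Sketch: run the main op into scratch when a fused post-op is present, then
// combine scratch with the destination; the loop stands in for BinaryBlit.
template <typename MainOp, typename FuseOp>
void runWithOptionalFuse(float* dst, float* scratch, size_t n, bool hasFuse,
                         MainOp mainOp, FuseOp fuseOp) {
    float* out = hasFuse ? scratch : dst;
    mainOp(out, n);                          // unary / binary / matmul result
    if (hasFuse) {
        for (size_t i = 0; i < n; ++i) {
            dst[i] = fuseOp(dst[i], scratch[i]);
        }
    }
}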
@ -848,21 +848,21 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
|
|||
// MNN_PRINT("trAtrB:%d-%d, tmpAB:%d-%d inps:%d, bwlh:%d-%d-%d-%d\n", mTransposeA, mTransposeB, mNeedATempBuffer, mNeedBTempBuffer, inputs.size(), mBatch, mGemmInfo.elh[0], mGemmInfo.elh[1], mGemmInfo.elh[2]);
|
||||
|
||||
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
|
||||
std::pair<void*, size_t> bufferAData, bufferBData;
|
||||
MemChunk bufferAData, bufferBData;
|
||||
size_t convertBytes = 2;
|
||||
if(mFp32Infer) {
|
||||
convertBytes = 4;
|
||||
}
|
||||
if((mNeedConvertMatAB && mFp16Fp32MixInfer) || mNeedATempBuffer) {
|
||||
bufferAData = pool->alloc(convertBytes * mBatch * mAs * mGemmInfo.elh[0] * mGemmInfo.elhPad[1]);
|
||||
mTempMatA = (void*)((uint8_t*)bufferAData.first + bufferAData.second);
|
||||
mTempMatA = (void*)bufferAData.ptr();
|
||||
} else {
|
||||
mTempMatA = (void *)A->deviceId();
|
||||
}
|
||||
|
||||
if((mNeedConvertMatAB && mFp16Fp32MixInfer) || mNeedBTempBuffer) {
|
||||
bufferBData = pool->alloc(convertBytes * mBatch * mBs * mGemmInfo.elh[2] * mGemmInfo.elhPad[1]);
|
||||
mTempMatB = (void*)((uint8_t*)bufferBData.first + bufferBData.second);
|
||||
mTempMatB = (void*)bufferBData.ptr();
|
||||
} else {
|
||||
mTempMatB = (void *)B->deviceId();
|
||||
}
|
||||
|
|
|
@ -102,10 +102,10 @@ ErrorCode MultiInputConvDepthWiseExecution::onResize(const std::vector<Tensor *>
|
|||
// prepare mParams.mFilter and mParams.mBias
|
||||
auto pool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
|
||||
|
||||
std::pair<void*, int> bufferFilter = pool->alloc(mParams.numWeightPackTotal * sizeof(half));
|
||||
auto bufferFilter = pool->alloc(mParams.numWeightPackTotal * sizeof(half));
|
||||
mParams.mFilter = (void*)((uint8_t*)bufferFilter.first + bufferFilter.second);
|
||||
|
||||
std::pair<void*, int> bufferBias = pool->alloc(mParams.numBiasPackTotal * sizeof(half));
|
||||
auto bufferBias = pool->alloc(mParams.numBiasPackTotal * sizeof(half));
|
||||
mParams.mBias = (void*)((uint8_t*)bufferBias.first + bufferBias.second);
|
||||
|
||||
pool->free(bufferFilter);
|
||||
|
|
|
@ -82,19 +82,19 @@ ErrorCode MultiInputConvExecution::onResize(const std::vector<Tensor*> &inputs,
|
|||
elementBytes = 4;
|
||||
}
|
||||
|
||||
std::pair<void*, int> bufferFilter;
|
||||
MemChunk bufferFilter;
|
||||
if(mNeedWeightFill) {
|
||||
bufferFilter = pool->alloc(elementBytes * (size_t)mGemmInfo.elhPad[1] * (size_t)mGemmInfo.elhPad[2]);
|
||||
mFilterAddr = (void*)((uint8_t*)bufferFilter.first + bufferFilter.second);
|
||||
mFilterAddr = (void*)(bufferFilter.ptr());
|
||||
} else {
|
||||
mFilterAddr = (void*)inputs[1]->deviceId();
|
||||
}
|
||||
|
||||
// Copy Bias
|
||||
std::pair<void*, int> bufferBias;
|
||||
MemChunk bufferBias;
|
||||
if(mNeedBiasFill) {
|
||||
bufferBias = pool->alloc(elementBytes * (size_t)mGemmInfo.elhPad[2]);
|
||||
mBiasAddr = (void*)((uint8_t*)bufferBias.first + bufferBias.second);
|
||||
mBiasAddr = (void*)(bufferBias.ptr());
|
||||
|
||||
} else {
|
||||
mBiasAddr = (void*)inputs[2]->deviceId();
|
||||
|
@ -107,10 +107,10 @@ ErrorCode MultiInputConvExecution::onResize(const std::vector<Tensor*> &inputs,
|
|||
mIm2ColParamter.padX == 0 && mIm2ColParamter.padY == 0);
|
||||
mNeedIm2Col = !(mIsConv1x1S1D1P0 && (mFp16Infer || mFp32Infer));
|
||||
|
||||
std::pair<void*, int> bufferIm2Col;
|
||||
MemChunk bufferIm2Col;
|
||||
if(mNeedIm2Col) {
|
||||
bufferIm2Col = pool->alloc(elementBytes * (size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elhPad[1]);
|
||||
mIm2ColBuffer = (void*)((uint8_t*)bufferIm2Col.first + bufferIm2Col.second);
|
||||
mIm2ColBuffer = (void*)(bufferIm2Col.ptr());
|
||||
}
|
||||
|
||||
// free for Reuse
|
||||
|
|
|
@ -84,21 +84,21 @@ ErrorCode MultiInputDeconvExecution::onResize(const std::vector<Tensor*> &inputs
|
|||
|
||||
// Alloc temp cuda memory
|
||||
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
|
||||
std::pair<void*, size_t> buffer_input, buffer_im2col;
|
||||
MemChunk buffer_input, buffer_im2col;
|
||||
if(mFp16Fp32MixInfer) {
|
||||
buffer_input = pool->alloc(sizeof(__half) * mGemmInfo.elhPad[1] * mGemmInfo.elh[2]);
|
||||
mInputBuffer = (void*)((uint8_t*)buffer_input.first + buffer_input.second);
|
||||
mInputBuffer = (void*)buffer_input.ptr();
|
||||
} else {
|
||||
mInputBuffer = (void*)input->deviceId();
|
||||
}
|
||||
buffer_im2col = pool->alloc(bytes * mGemmInfo.elh[0] * mGemmInfo.elhPad[2]);
|
||||
mIm2ColBuffer = (void*)((uint8_t*)buffer_im2col.first + buffer_im2col.second);
|
||||
mIm2ColBuffer = (void*)buffer_im2col.ptr();
|
||||
|
||||
mNeedWeightFill = (mGemmInfo.elh[1] != mGemmInfo.elhPad[1]);
|
||||
std::pair<void*, int> buffer_filter;
|
||||
MemChunk buffer_filter;
|
||||
if(mNeedWeightFill) {
|
||||
buffer_filter = pool->alloc(bytes * (size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elhPad[1]);
|
||||
mFilterAddr = (void*)((uint8_t*)buffer_filter.first + buffer_filter.second);
|
||||
mFilterAddr = (void*)buffer_filter.ptr();
|
||||
} else {
|
||||
mFilterAddr = (void*)inputs[1]->deviceId();
|
||||
}
|
||||
|
|
|
@ -31,7 +31,7 @@ private:
|
|||
int mCount;
|
||||
int mChannel;
|
||||
int mArea;
|
||||
std::pair<void*, int> mPreluStorage;
|
||||
MemChunk mPreluStorage;
|
||||
bool mIsChannelShared = false;
|
||||
};
|
||||
|
||||
|
|
|
@ -203,12 +203,14 @@ UNARY_FUNC(GELU_STANDARD, (erf(x*0.7071067932881648f)+1.f)*x*0.5);
|
|||
void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime) {
|
||||
int count = size[0] * size[1] * size[2];
|
||||
|
||||
// MNN_PRINT("blit info size:%d-%d-%d, srcStride:%d-%d-%d, dstStride:%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
|
||||
// MNN_PRINT("blit info size:%d-%d-%d, srcStride:%d-%d-%d, dstStride:%d-%d-%d, ptr:%p %p\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2], input, output);
|
||||
bool isThirdSizeVector = (size[2] % 2 == 0 && srcStride[2] == 1 && dstStride[2] == 1);
|
||||
bool isSecondSizeVector = (size[1] % 2 == 0 && srcStride[1] == 1 && dstStride[1] == 1) && (size[2] == 1 && srcStride[2] == 1 && dstStride[2] == 1);
|
||||
bool isFirstSizeVector = (size[0] % 2 == 0 && srcStride[0] == 1 && dstStride[0] == 1) && (size[1] == 1 && srcStride[1] == 1 && dstStride[1] == 1) && (size[2] == 1 && srcStride[2] == 1 && dstStride[2] == 1);
|
||||
bool isStrideVector = (srcStride[0] % 2 == 0 || srcStride[0] == 1) && (srcStride[1] % 2 == 0 || srcStride[1] == 1) && (srcStride[2] % 2 == 0 || srcStride[2] == 1) && \
|
||||
(dstStride[0] % 2 == 0 || dstStride[0] == 1) && (dstStride[1] % 2 == 0 || dstStride[1] == 1) && (dstStride[2] % 2 == 0 || dstStride[2] == 1);
|
||||
bool isSizeVector = isThirdSizeVector || isSecondSizeVector || isFirstSizeVector;
|
||||
if(count > 16384 && isSizeVector) {
|
||||
if(count > 16384 && isSizeVector && isStrideVector) {
|
||||
int32_t newSize[3], newSrcStride[3], newDstStride[3];
|
||||
newSize[0] = size[0];
|
||||
newSize[1] = size[1];
|
||||
|
|
|
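The new isStrideVector condition above tightens when the fast blit path may run, and the predicate is easier to audit when read in one place. The function below is a condensed, hedged restatement rather than the kernel itself; the even-size and even-or-unit-stride requirements suggest the path loads two elements at a time.

#include <cstdint>

// Condensed restatement of the eligibility test for the pair-wise blit path.
static bool canUsePairwiseBlit(const int32_t size[3], const int32_t srcStride[3],
                               const int32_t dstStride[3], int count) {
    auto evenOrUnit = [](int s) { return s == 1 || s % 2 == 0; };
    const bool third  = size[2] % 2 == 0 && srcStride[2] == 1 && dstStride[2] == 1;
    const bool second = size[1] % 2 == 0 && srcStride[1] == 1 && dstStride[1] == 1 &&
                        size[2] == 1 && srcStride[2] == 1 && dstStride[2] == 1;
    const bool first  = size[0] % 2 == 0 && srcStride[0] == 1 && dstStride[0] == 1 &&
                        size[1] == 1 && srcStride[1] == 1 && dstStride[1] == 1 &&
                        size[2] == 1 && srcStride[2] == 1 && dstStride[2] == 1;
    bool strideOk = true;
    for (int i = 0; i < 3; ++i) {
        strideOk = strideOk && evenOrUnit(srcStride[i]) && evenOrUnit(dstStride[i]);
    }
    return count > 16384 && (third || second || first) && strideOk;
}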
@ -32,7 +32,7 @@ private:
|
|||
int mCount;
|
||||
int mChannel;
|
||||
int mArea;
|
||||
std::pair<void*, int> mScaleBiasStorage;
|
||||
MemChunk mScaleBiasStorage;
|
||||
};
|
||||
|
||||
} // namespace CUDA
|
||||
|
|
|
@ -31,7 +31,7 @@ private:
|
|||
Tensor mStorage;
|
||||
bool mNeedUnpackC4;
|
||||
ReduceParam mCpuParam;
|
||||
std::pair<void*, int> mParam;
|
||||
MemChunk mParam;
|
||||
};
|
||||
|
||||
} // namespace CUDA
|
||||
|
|
|
@ -235,23 +235,23 @@ ErrorCode TopKV2Execution::onResize(const std::vector<Tensor *> &inputs, const s
|
|||
auto pool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
|
||||
|
||||
if (inputTensor->getType().code == halide_type_int && inputTensor->getType().bits == 32) {
|
||||
std::pair<void*, int> bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
auto bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
mParams.mBufferIndices = (void*)((uint8_t*)bufferIndices.first + bufferIndices.second);
|
||||
std::pair<void*, int> bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
auto bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second);
|
||||
pool->free(bufferIndices);
|
||||
pool->free(bufferValues);
|
||||
} else if (static_cast<CUDABackend*>(backend())->useFp16()) {
|
||||
std::pair<void*, int> bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
auto bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
mParams.mBufferIndices = (void*)((uint8_t*)bufferIndices.first + bufferIndices.second);
|
||||
std::pair<void*, int> bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(half));
|
||||
auto bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(half));
|
||||
mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second);
|
||||
pool->free(bufferIndices);
|
||||
pool->free(bufferValues);
|
||||
} else {
|
||||
std::pair<void*, int> bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
auto bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
|
||||
mParams.mBufferIndices = (void*)((uint8_t*)bufferIndices.first + bufferIndices.second);
|
||||
std::pair<void*, int> bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(float));
|
||||
auto bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(float));
|
||||
mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second);
|
||||
pool->free(bufferIndices);
|
||||
pool->free(bufferValues);
|
||||
|
|
|
@ -41,13 +41,13 @@ protected:
|
|||
const Op* mOp = nullptr;
|
||||
|
||||
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
|
||||
std::pair<void*, int> mGpuIm2ColParam;
|
||||
MemChunk mGpuIm2ColParam;
|
||||
|
||||
void* mIm2ColBuffer;
|
||||
|
||||
bool mIsConv1x1S1D1P0 = false;
|
||||
bool mNeedIm2Col = true;
|
||||
std::pair<void*, int> mGpuKernelParam;
|
||||
MemChunk mGpuKernelParam;
|
||||
bool mIsBlock = false;
|
||||
int mBlockNum = 1;
|
||||
|
||||
|
|
|
@ -71,13 +71,13 @@ private:
|
|||
CutlassGemmInfo mGemmInfo;
|
||||
|
||||
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
|
||||
std::pair<void*, int> mGpuIm2ColParam;
|
||||
MemChunk mGpuIm2ColParam;
|
||||
|
||||
void* mIm2ColBuffer;
|
||||
|
||||
bool mIsConv1x1S1D1P0 = false;
|
||||
bool mNeedIm2Col = true;
|
||||
std::pair<void*, int> mGpuKernelParam;
|
||||
MemChunk mGpuKernelParam;
|
||||
bool mIsBlock = false;
|
||||
int mBlockNum = 1;
|
||||
|
||||
|
|
|
@ -38,7 +38,7 @@ private:
|
|||
int mChannel;
|
||||
int mCount;
|
||||
int mArea;
|
||||
std::pair<void*, int> mScaleStorage;
|
||||
MemChunk mScaleStorage;
|
||||
};
|
||||
|
||||
} // namespace CUDA
|
||||
|
|
|
@ -35,7 +35,7 @@ private:
|
|||
int mChannel;
|
||||
int mCount;
|
||||
int mArea;
|
||||
std::pair<void*, int> mScaleStorage;
|
||||
MemChunk mScaleStorage;
|
||||
};
|
||||
|
||||
} // namespace CUDA
|
||||
|
|
|
@ -64,7 +64,7 @@ public:
|
|||
private:
|
||||
MetalRuntime(void* context);
|
||||
void* mContext = nullptr;
|
||||
std::shared_ptr<BufferAllocator> mStatic;
|
||||
std::shared_ptr<EagerBufferAllocator> mStatic;
|
||||
MetalTuneLevel mTuneLevel = Wide;
|
||||
std::map<std::pair<std::string, std::vector<uint32_t>>, std::tuple<std::vector<uint32_t>, std::vector<uint32_t>, uint32_t>> mTunedThreadGroup;
|
||||
|
||||
|
@ -76,7 +76,7 @@ private:
|
|||
};
|
||||
|
||||
|
||||
class MetalRuntimeAllocator : public BufferAllocator::Allocator {
|
||||
class MetalRuntimeAllocator : public EagerBufferAllocator::Allocator {
|
||||
public:
|
||||
class MetalBufferAlloc {
|
||||
public:
|
||||
|
@ -95,8 +95,8 @@ public:
|
|||
// Do nothing
|
||||
}
|
||||
virtual ~ MetalRuntimeAllocator() = default;
|
||||
virtual std::pair<void*, size_t> onAlloc(size_t size, size_t align) override;
|
||||
virtual void onRelease(std::pair<void*, size_t> ptr) override;
|
||||
virtual MemChunk onAlloc(size_t size, size_t align) override;
|
||||
virtual void onRelease(MemChunk ptr) override;
|
||||
|
||||
private:
|
||||
id<MTLDevice> mDevice;
|
||||
|
@ -127,7 +127,7 @@ public:
|
|||
id<MTLBuffer> getHostBuffer(size_t size) const;
|
||||
id<MTLBuffer> getConstBuffer(size_t size) const;
|
||||
public:
|
||||
MetalBackend(std::shared_ptr<BufferAllocator> staticMem, const MetalRuntime* runtime);
|
||||
MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime);
|
||||
virtual ~MetalBackend();
|
||||
const MetalRuntime* runtime() const {
|
||||
return mRuntime;
|
||||
|
@ -169,10 +169,10 @@ public:
|
|||
bool isCommandEncoderSet();
|
||||
void setOpEncoder() const;
|
||||
|
||||
BufferAllocator *getBufferPool() const {
|
||||
EagerBufferAllocator *getBufferPool() const {
|
||||
return mBufferPool.get();
|
||||
}
|
||||
BufferAllocator *getStaticBufferPool() const {
|
||||
EagerBufferAllocator *getStaticBufferPool() const {
|
||||
return mStaticBufferPool.get();
|
||||
}
|
||||
|
||||
|
@ -190,8 +190,8 @@ private:
|
|||
|
||||
std::vector<std::function<void(void)>> mOpEncoders;
|
||||
mutable id<MTLComputeCommandEncoder> mComputeEncoder = nil;
|
||||
std::shared_ptr<BufferAllocator> mBufferPool;
|
||||
std::shared_ptr<BufferAllocator> mStaticBufferPool;
|
||||
std::shared_ptr<EagerBufferAllocator> mBufferPool;
|
||||
std::shared_ptr<EagerBufferAllocator> mStaticBufferPool;
|
||||
|
||||
private:
|
||||
mutable id<MTLBuffer> mHostBuffer = nullptr;
|
||||
|
|
|
@ -50,9 +50,9 @@ void MetalBackend::addCreator(OpType t, Creator *c) {
|
|||
map->insert(std::make_pair(t, c));
|
||||
}
|
||||
|
||||
MetalBackend::MetalBackend(std::shared_ptr<BufferAllocator> staticMem, const MetalRuntime* runtime) : Backend(MNN_FORWARD_METAL) {
|
||||
MetalBackend::MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime) : Backend(MNN_FORWARD_METAL) {
|
||||
mRuntime = runtime;
|
||||
mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(staticMem.get()), 1024));
|
||||
mBufferPool.reset(new EagerBufferAllocator(EagerBufferAllocator::Allocator::createRecurse(staticMem.get()), 1024));
|
||||
mStaticBufferPool = staticMem;
|
||||
mShapeH2D = getConstBuffer(4 * sizeof(int));
|
||||
mShapeD2H = getConstBuffer(4 * sizeof(int));
|
||||
|
@ -67,16 +67,19 @@ void *MetalBackend::context() const {
|
|||
|
||||
class MetalMemRelease : public Backend::MemObj {
|
||||
public:
|
||||
MetalMemRelease(std::pair<void*, int> buffer, BufferAllocator* allocator) {
|
||||
MetalMemRelease(MemChunk buffer, EagerBufferAllocator* allocator) {
|
||||
mBuffer = buffer;
|
||||
mAllocator = allocator;
|
||||
}
|
||||
virtual ~ MetalMemRelease() {
|
||||
mAllocator->free(mBuffer);
|
||||
}
|
||||
MemChunk chunk() override {
|
||||
return mBuffer;
|
||||
}
|
||||
private:
|
||||
std::pair<void*, int> mBuffer;
|
||||
BufferAllocator* mAllocator;
|
||||
MemChunk mBuffer;
|
||||
EagerBufferAllocator* mAllocator;
|
||||
};
|
||||
Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType storageType) {
|
||||
auto tensor = const_cast<Tensor *>(_tensor);
|
||||
|
@ -115,8 +118,8 @@ Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType stor
|
|||
}
|
||||
|
||||
// reuse if possible
|
||||
std::pair<void*, int> buffer;
|
||||
BufferAllocator* allocator = nullptr;
|
||||
MemChunk buffer;
|
||||
EagerBufferAllocator* allocator = nullptr;
|
||||
switch (storageType) {
|
||||
case Backend::STATIC: {
|
||||
buffer = mStaticBufferPool->alloc(size, false);
|
||||
|
@ -656,8 +659,8 @@ MetalRuntime* MetalRuntime::create(const Backend::Info& info, id<MTLDevice> devi
|
|||
MetalRuntime::MetalRuntime(void* context) {
|
||||
mContext = context;
|
||||
auto ctx = (__bridge MNNMetalContext *)mContext;
|
||||
std::shared_ptr<BufferAllocator::Allocator> allocator(new MetalRuntimeAllocator([ctx device]));
|
||||
mStatic.reset(new BufferAllocator(allocator));
|
||||
std::shared_ptr<EagerBufferAllocator::Allocator> allocator(new MetalRuntimeAllocator([ctx device]));
|
||||
mStatic.reset(new EagerBufferAllocator(allocator));
|
||||
mTunedInfo = new TunedInfo;
|
||||
}
|
||||
|
||||
|
@ -859,12 +862,12 @@ bool MetalRuntime::onSetCache(const void* buffer, size_t size) {//set Cache
|
|||
return setCache(std::make_pair(buffer, size));
|
||||
}
|
||||
|
||||
std::pair<void*, size_t> MetalRuntimeAllocator::onAlloc(size_t size, size_t align) {
|
||||
MemChunk MetalRuntimeAllocator::onAlloc(size_t size, size_t align) {
|
||||
auto buffer = [mDevice newBufferWithLength:size options:MTLCPUCacheModeDefaultCache];
|
||||
auto mMetalBufferAlloc = new MetalBufferAlloc(buffer);
|
||||
return std::make_pair((void *)mMetalBufferAlloc, 0);
|
||||
return MemChunk((void *)mMetalBufferAlloc, 0);
|
||||
}
|
||||
void MetalRuntimeAllocator::onRelease(std::pair<void*, size_t> ptr) {
|
||||
void MetalRuntimeAllocator::onRelease(MemChunk ptr) {
|
||||
delete (MetalBufferAlloc *)ptr.first;
|
||||
}
|
||||
|
||||
|
|
|
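One detail worth noting in the Metal allocator above: the chunk's base pointer is not raw memory but a heap-allocated MetalBufferAlloc wrapper around the MTLBuffer, and onRelease deletes that wrapper. The plain C++ sketch below shows only the ownership pattern; it uses no Metal API and the names are illustrative.

#include <cstddef>

// Illustrative only: the chunk carries an owning wrapper object, not bytes.
struct NativeBufferHandle {
    // in the real backend this owns the GPU buffer object
};

struct ChunkSketch {
    void*  first  = nullptr;   // here: pointer to the wrapper, not to memory
    size_t second = 0;
};

ChunkSketch allocWrapped(size_t /*size*/) {
    return ChunkSketch{ new NativeBufferHandle(), 0 };
}

void releaseWrapped(ChunkSketch chunk) {
    delete static_cast<NativeBufferHandle*>(chunk.first);
}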
@ -9,6 +9,7 @@
|
|||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
#include "MNN_generated.h"
|
||||
|
||||
#include "core/BufferAllocator.hpp"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "shape/SizeComputer.hpp"
|
||||
#include <map>
|
||||
|
@ -907,16 +908,6 @@ void OpenCLBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTenso
|
|||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("Start onCopyBuffer !\n");
|
||||
#endif
|
||||
//int8
|
||||
if(srcTensor->getType().code == halide_type_int && srcTensor->getType().bits == 8){
|
||||
if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) {
|
||||
copyToDeviceInt8(srcTensor, dstTensor);
|
||||
}else if(srcTensor->deviceId() != 0 && dstTensor->deviceId() == 0){
|
||||
copyFromDeviceInt8(srcTensor, dstTensor);
|
||||
}else{
|
||||
MNN_PRINT("onCopyBuffer int8 error !!! \n");
|
||||
}
|
||||
}else{
|
||||
if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) {
|
||||
copyToDevice(srcTensor, dstTensor);
|
||||
}else if(srcTensor->deviceId() != 0 && dstTensor->deviceId() == 0){
|
||||
|
@ -926,7 +917,6 @@ void OpenCLBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTenso
|
|||
}else{
|
||||
MNN_PRINT("onCopyBuffer float error !!! \n");
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("end onCopyBuffer !\n");
|
||||
|
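The copy hunk above keeps the usual MNN convention for choosing a copy direction: a tensor whose deviceId() is 0 lives in host memory, anything else is a device allocation, so the (src, dst) pair decides between upload and download. A small sketch of that dispatch follows; the enum and function names are illustrative, not part of the backend API.

#include <cstdio>
#include <cstdint>

enum class CopyKind { HostToDevice, DeviceToHost, Unsupported };

// deviceId() == 0 is treated as "host tensor", non-zero as "device tensor".
CopyKind pickCopy(uint64_t srcDeviceId, uint64_t dstDeviceId) {
    if (srcDeviceId == 0 && dstDeviceId != 0) return CopyKind::HostToDevice;
    if (srcDeviceId != 0 && dstDeviceId == 0) return CopyKind::DeviceToHost;
    return CopyKind::Unsupported;  // host->host / device->device not handled here
}

int main() {
    std::printf("%d\n", (int)pickCopy(0, 42));  // HostToDevice
    std::printf("%d\n", (int)pickCopy(42, 0));  // DeviceToHost
    std::printf("%d\n", (int)pickCopy(0, 0));   // Unsupported
    return 0;
}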
|
|
@ -0,0 +1,150 @@
|
|||
//
|
||||
// ArgMaxBufExecution.cpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#include "backend/opencl/execution/buffer/ArgMaxBufExecution.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
ArgMaxBufExecution::ArgMaxBufExecution(const std::string &compute, Backend* backend, const int axis) : Execution(backend) {
|
||||
mBuildOptions.emplace(compute);
|
||||
mAxis = axis;
|
||||
// Do nothing
|
||||
}
|
||||
ErrorCode ArgMaxBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
auto runtime = openCLBackend->getOpenCLRuntime();
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
if(mAxis < 0){
|
||||
mAxis = input->dimensions() + mAxis;
|
||||
}
|
||||
int inside = 1;
|
||||
int outside = 1;
|
||||
for(int i = 0; i < mAxis; ++i){
|
||||
outside *= input->length(i);
|
||||
}
|
||||
for(int i = mAxis + 1; i < input->dimensions(); ++i){
|
||||
inside *= input->length(i);
|
||||
}
|
||||
int dim = input->length(mAxis);
|
||||
|
||||
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||
std::vector<int> outputShape = tensorShapeFormat(output);
|
||||
|
||||
int batch = inputShape.at(0);
|
||||
int inputHeight = inputShape.at(1);
|
||||
int inputWidth = inputShape.at(2);
|
||||
int inputChannels = inputShape.at(3);
|
||||
int inputChannelBlocks = (inputChannels + 3) / 4;
|
||||
int outputBatch = outputShape.at(0);
|
||||
int outputHeight = outputShape.at(1);
|
||||
int outputWidth = outputShape.at(2);
|
||||
int outputChannels = outputShape.at(3);
|
||||
int outputChannelBlocks = (outputChannels + 3) / 4;
|
||||
mGlobalWorkSize = {
|
||||
static_cast<uint32_t>(outputWidth),
|
||||
static_cast<uint32_t>(outputHeight),
|
||||
static_cast<uint32_t>(outputBatch * outputChannelBlocks)
|
||||
};
|
||||
|
||||
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
|
||||
mKernel = runtime->buildKernel("argmax_buf", "argmax_width_buf", mBuildOptions);
|
||||
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
|
||||
mKernel = runtime->buildKernel("argmax_buf", "argmax_height_buf", mBuildOptions);
|
||||
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
|
||||
if(output->buffer().dimensions == 1){
|
||||
mKernel = runtime->buildKernel("argmax_buf", "argmax_channel_dim1_buf", mBuildOptions);
|
||||
}else{
|
||||
mKernel = runtime->buildKernel("argmax_buf", "argmax_channel_buf", mBuildOptions);
|
||||
}
|
||||
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
|
||||
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
|
||||
mKernel = runtime->buildKernel("argmax_buf", "argmax_batch_buf", mBuildOptions);
|
||||
}
|
||||
mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
|
||||
|
||||
uint32_t idx = 0;
|
||||
cl_int ret = CL_SUCCESS;
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(input));
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(output));
|
||||
ret |= mKernel.setArg(idx++, inputWidth);
|
||||
ret |= mKernel.setArg(idx++, inputHeight);
|
||||
ret |= mKernel.setArg(idx++, inputChannels);
|
||||
ret |= mKernel.setArg(idx++, batch);
|
||||
ret |= mKernel.setArg(idx++, inputChannelBlocks);
|
||||
ret |= mKernel.setArg(idx++, outputWidth);
|
||||
ret |= mKernel.setArg(idx++, outputHeight);
|
||||
ret |= mKernel.setArg(idx++, outputChannels);
|
||||
ret |= mKernel.setArg(idx++, outputChannelBlocks);
|
||||
MNN_CHECK_CL_SUCCESS(ret, "setArg ArgMaxBufExecution");
|
||||
|
||||
std::string kernelName = "gargmax_buf";
|
||||
mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode ArgMaxBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("start ArgMaxBufExecution onExecute...");
|
||||
#endif
|
||||
auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
|
||||
#ifdef ENABLE_OPENCL_TIME_PROFILER
|
||||
cl::Event event;
|
||||
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime(), &event);
|
||||
|
||||
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
|
||||
MNN_PRINT("kernel cost:%d us ArgMax\n",costTime);
|
||||
#else
|
||||
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime());
|
||||
#endif
|
||||
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("end ArgMaxBufExecution onExecute...");
|
||||
#endif
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class ArgMaxBufCreator : public OpenCLBackend::Creator {
|
||||
public:
|
||||
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||
const MNN::Op* op, Backend* backend) const override {
|
||||
for (int i = 0; i < inputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(inputs[i], false);
|
||||
}
|
||||
for (int i = 0; i < outputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(outputs[i], false);
|
||||
}
|
||||
auto inputDimensionFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
|
||||
if(inputDimensionFormat == MNN_DATA_FORMAT_NC4HW4){
|
||||
return nullptr;
|
||||
}
|
||||
int axis = op->main_as_ArgMax()->axis();
|
||||
if (op->type() == OpType_ArgMax) {
|
||||
return new ArgMaxBufExecution("-DARGMAX", backend, axis);
|
||||
}else{
|
||||
return new ArgMaxBufExecution("", backend, axis);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
OpenCLCreatorRegister<ArgMaxBufCreator> __ArgMaxBuf__(OpType_ArgMax, BUFFER);
|
||||
OpenCLCreatorRegister<ArgMaxBufCreator> __ArgMinBuf__(OpType_ArgMin, BUFFER);
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* MNN_OPENCL_BUFFER_CLOSED */
|
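The kernel selection in ArgMaxBufExecution::onResize hinges on collapsing the tensor around the reduction axis into an (outside, dim, inside) triple and then matching that triple against the NHWC image shape to pick the width/height/channel/batch kernel. Here is a self-contained sketch of just the decomposition step (shapes and axis handling only, no OpenCL).

#include <cstdio>
#include <vector>

struct AxisSplit { long outside, dim, inside; };

// Collapse dims before the axis into "outside", after it into "inside".
AxisSplit splitByAxis(const std::vector<int>& shape, int axis) {
    if (axis < 0) axis += (int)shape.size();   // same negative-axis fixup as onResize
    AxisSplit s{1, shape[axis], 1};
    for (int i = 0; i < axis; ++i)                     s.outside *= shape[i];
    for (int i = axis + 1; i < (int)shape.size(); ++i) s.inside  *= shape[i];
    return s;
}

int main() {
    // NCHW example: argmax over the channel axis of a 2x8x4x6 tensor.
    auto s = splitByAxis({2, 8, 4, 6}, 1);
    std::printf("outside=%ld dim=%ld inside=%ld\n", s.outside, s.dim, s.inside);  // 2 8 24
    return 0;
}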
|
@ -0,0 +1,43 @@
|
|||
//
|
||||
// ArgMaxBufExecution.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#ifndef ArgMaxBufExecution_hpp
|
||||
#define ArgMaxBufExecution_hpp
|
||||
|
||||
#include "core/Execution.hpp"
|
||||
|
||||
#include <vector>
|
||||
#include "MNN_generated.h"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
class ArgMaxBufExecution : public Execution {
|
||||
public:
|
||||
ArgMaxBufExecution(const std::string &compute, Backend *backend, const int axis);
|
||||
virtual ~ArgMaxBufExecution() = default;
|
||||
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
cl::Kernel mKernel;
|
||||
uint32_t mMaxWorkGroupSize;
|
||||
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
|
||||
std::vector<uint32_t> mLocalSize = {1, 1, 1};
|
||||
std::set<std::string> mBuildOptions;
|
||||
int mAxis;
|
||||
};
|
||||
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* ArgMaxBufExecution_hpp */
|
||||
#endif/* MNN_OPENCL_BUFFER_CLOSED */
|
|
@ -0,0 +1,161 @@
|
|||
//
|
||||
// CastBufExecution.cpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#include "backend/opencl/execution/buffer/CastBufExecution.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
CastBufExecution::CastBufExecution(const std::string& compute, Backend* backend) : Execution(backend) {
|
||||
mBuildOptions.emplace(compute);
|
||||
}
|
||||
ErrorCode CastBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
Tensor* input = inputs[0];
|
||||
Tensor* output = outputs[0];
|
||||
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
auto runtime = openCLBackend->getOpenCLRuntime();
|
||||
#ifdef MNN_SUPPORT_INTEL_SUBGROUP
|
||||
if (runtime->isSupportedIntelSubgroup()) {
|
||||
return SubgrouponResize(inputs, outputs);
|
||||
}
|
||||
#endif /* MNN_SUPPORT_INTEL_SUBGROUP */
|
||||
mKernel = runtime->buildKernel("cast_buf", "cast_buf", mBuildOptions);
|
||||
mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
|
||||
|
||||
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||
std::vector<int> outputShape = tensorShapeFormat(output);
|
||||
|
||||
int batch = outputShape.at(0);
|
||||
int outputHeight = outputShape.at(1);
|
||||
int outputWidth = outputShape.at(2);
|
||||
int channels = outputShape.at(3);
|
||||
|
||||
int channelBlocks = (channels + 3) / 4;
|
||||
|
||||
mGlobalWorkSize = {
|
||||
static_cast<uint32_t>(outputWidth),
|
||||
static_cast<uint32_t>(outputHeight),
|
||||
static_cast<uint32_t>(batch * channelBlocks),
|
||||
};
|
||||
|
||||
uint32_t idx = 0;
|
||||
cl_int ret = CL_SUCCESS;
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(input));
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(output));
|
||||
ret |= mKernel.setArg(idx++, outputWidth);
|
||||
ret |= mKernel.setArg(idx++, outputHeight);
|
||||
ret |= mKernel.setArg(idx++, channelBlocks);
|
||||
MNN_CHECK_CL_SUCCESS(ret, "setArg CastBufExecution");
|
||||
|
||||
std::string kernelName = "cast_buf";
|
||||
mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode CastBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("start CastBufExecution onExecute...");
|
||||
#endif
|
||||
auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
|
||||
#ifdef ENABLE_OPENCL_TIME_PROFILER
|
||||
cl::Event event;
|
||||
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime(), &event);
|
||||
|
||||
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
|
||||
MNN_PRINT("kernel cost:%d us Cast\n",costTime);
|
||||
#else
|
||||
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime());
|
||||
#endif
|
||||
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("end CastBufExecution onExecute...");
|
||||
#endif
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
static DataType _mapDataType(DataType src) {
|
||||
if (DataType_DT_BOOL == src) {
|
||||
return DataType_DT_INT32;
|
||||
}
|
||||
if (DataType_DT_INT64 == src) {
|
||||
return DataType_DT_INT32;
|
||||
}
|
||||
if (DataType_DT_DOUBLE == src) {
|
||||
return DataType_DT_FLOAT;
|
||||
}
|
||||
return src;
|
||||
}
|
||||
|
||||
class CastBufCreator : public OpenCLBackend::Creator {
|
||||
public:
|
||||
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||
const MNN::Op* op, Backend* backend) const override {
|
||||
for (int i = 0; i < inputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(inputs[i], false);
|
||||
}
|
||||
for (int i = 0; i < outputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(outputs[i], false);
|
||||
}
|
||||
auto cast = op->main_as_CastParam();
|
||||
// cast param srcT is invalid
|
||||
// auto srcT = _mapDataType(cast->srcT());
|
||||
auto dstT = _mapDataType(cast->dstT());
|
||||
|
||||
const auto &inputDataType = inputs[0]->getType();
|
||||
if (inputDataType.bytes() == 4 && cast->dstT() == MNN::DataType_DT_BOOL) {
|
||||
return new CastBufExecution("-DTO_BOOL", backend);
|
||||
}
|
||||
if (inputs[0]->buffer().type == outputs[0]->buffer().type) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_INT32 && halide_type_of<float>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int32_t>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<uint8_t>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int8_t>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_INT8 && halide_type_of<float>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_UINT8 && halide_type_of<float>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_UINT8 && halide_type_of<int32_t>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_INT32 && halide_type_of<uint8_t>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
if (dstT == MNN::DataType_DT_INT32 && halide_type_of<int8_t>() == inputDataType) {
|
||||
return new CastBufExecution("", backend);
|
||||
}
|
||||
MNN_PRINT("Don't support cast form %d, %d to %d\n", inputDataType.code, inputDataType.bits, cast->dstT());
|
||||
return nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
OpenCLCreatorRegister<CastBufCreator> __CastBuf__(OpType_Cast, BUFFER);
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* MNN_OPENCL_BUFFER_CLOSED */
|
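The creator above first normalizes the destination type (_mapDataType folds BOOL and INT64 into INT32 and DOUBLE into FLOAT) and only then checks whether the (source, destination) pair is one it can lower to the cast kernel. A small sketch of that normalization step, using plain enums in place of the flatbuffer-generated DataType:

#include <cstdio>

enum class DType { Float, Double, Int8, Uint8, Int32, Int64, Bool };

// Mirrors _mapDataType: collapse types the kernel does not distinguish.
DType mapDataType(DType src) {
    if (src == DType::Bool)   return DType::Int32;
    if (src == DType::Int64)  return DType::Int32;
    if (src == DType::Double) return DType::Float;
    return src;
}

int main() {
    std::printf("%d %d %d %d\n",
                (int)mapDataType(DType::Bool),    // -> Int32
                (int)mapDataType(DType::Int64),   // -> Int32
                (int)mapDataType(DType::Double),  // -> Float
                (int)mapDataType(DType::Uint8));  // unchanged
    return 0;
}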
|
@ -0,0 +1,42 @@
|
|||
//
|
||||
// CastBufExecution.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#ifndef CastBufExecution_hpp
|
||||
#define CastBufExecution_hpp
|
||||
|
||||
#include "core/Execution.hpp"
|
||||
|
||||
#include <vector>
|
||||
#include "MNN_generated.h"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
class CastBufExecution : public Execution {
|
||||
public:
|
||||
CastBufExecution(const std::string &compute, Backend *backend);
|
||||
virtual ~CastBufExecution() = default;
|
||||
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
cl::Kernel mKernel;
|
||||
uint32_t mMaxWorkGroupSize;
|
||||
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
|
||||
std::vector<uint32_t> mLocalSize = {1, 1, 1};
|
||||
std::set<std::string> mBuildOptions;
|
||||
};
|
||||
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* CastBufExecution_hpp */
|
||||
#endif/* MNN_OPENCL_BUFFER_CLOSED */
|
|
@ -0,0 +1,110 @@
|
|||
//
|
||||
// RangeBufExecution.cpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#include "backend/opencl/execution/buffer/RangeBufExecution.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
RangeBufExecution::RangeBufExecution(const std::string &compute, Backend* backend) : Execution(backend) {
|
||||
mBuildOptions.emplace(compute);
|
||||
// Do nothing
|
||||
}
|
||||
ErrorCode RangeBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
auto runtime = openCLBackend->getOpenCLRuntime();
|
||||
mKernel = runtime->buildKernel("range_buf", "range_buf", mBuildOptions);
|
||||
mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
|
||||
|
||||
std::vector<int> outputShape = tensorShapeFormat(outputs[0]);
|
||||
|
||||
int batch = outputShape.at(0);
|
||||
int outputHeight = outputShape.at(1);
|
||||
int outputWidth = outputShape.at(2);
|
||||
int channels = outputShape.at(3);
|
||||
int channelBlocks = (channels + 3) / 4;
|
||||
|
||||
mGlobalWorkSize = {
|
||||
static_cast<uint32_t>(outputWidth),
|
||||
static_cast<uint32_t>(outputHeight),
|
||||
static_cast<uint32_t>(batch * channelBlocks)
|
||||
};
|
||||
|
||||
uint32_t idx = 0;
|
||||
cl_int ret = CL_SUCCESS;
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[0]));
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[2]));
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(outputs[0]));
|
||||
ret |= mKernel.setArg(idx++, outputWidth);
|
||||
ret |= mKernel.setArg(idx++, outputHeight);
|
||||
ret |= mKernel.setArg(idx++, channels);
|
||||
ret |= mKernel.setArg(idx++, channelBlocks);
|
||||
MNN_CHECK_CL_SUCCESS(ret, "setArg RangeBufExecution");
|
||||
|
||||
std::string kernelName = "range_buf";
|
||||
mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode RangeBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("start RangeBufExecution onExecute...");
|
||||
#endif
|
||||
auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
|
||||
#ifdef ENABLE_OPENCL_TIME_PROFILER
|
||||
cl::Event event;
|
||||
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime(), &event);
|
||||
|
||||
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
|
||||
MNN_PRINT("kernel cost:%d us Range\n",costTime);
|
||||
#else
|
||||
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime());
|
||||
#endif
|
||||
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("end RangeBufExecution onExecute...");
|
||||
#endif
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class RangeBufCreator : public OpenCLBackend::Creator {
|
||||
public:
|
||||
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||
const MNN::Op* op, Backend* backend) const override {
|
||||
for (int i = 0; i < inputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(inputs[i], false);
|
||||
}
|
||||
for (int i = 0; i < outputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(outputs[i], false);
|
||||
}
|
||||
auto code = inputs[0]->getType().code;
|
||||
switch (code) {
|
||||
case halide_type_int:
|
||||
return new RangeBufExecution("-DUSE_INT", backend);
|
||||
case halide_type_float:
|
||||
return new RangeBufExecution("-DUSE_FLOAT", backend);
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
OpenCLCreatorRegister<RangeBufCreator> __RangeBuf__(OpType_Range, BUFFER);
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* MNN_OPENCL_BUFFER_CLOSED */
|
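As the kernel arguments above suggest, the Range execution only reads inputs[0] (start) and inputs[2] (delta); the limit in inputs[1] is already folded into the output shape by shape inference, so each output element is simply start + index * delta. A tiny host-side reference for checking the buffer kernel:

#include <cstdio>
#include <vector>

// CPU reference: output[i] = start + i * step, for an output of known length.
std::vector<float> rangeRef(float start, float step, int count) {
    std::vector<float> out(count);
    for (int i = 0; i < count; ++i) {
        out[i] = start + (float)i * step;
    }
    return out;
}

int main() {
    auto v = rangeRef(1.5f, 0.5f, 5);
    for (float x : v) std::printf("%g ", x);   // 1.5 2 2.5 3 3.5
    std::printf("\n");
    return 0;
}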
|
@ -0,0 +1,42 @@
|
|||
//
|
||||
// RangeBufExecution.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#ifndef RangeBufExecution_hpp
|
||||
#define RangeBufExecution_hpp
|
||||
|
||||
#include "core/Execution.hpp"
|
||||
|
||||
#include <vector>
|
||||
#include "MNN_generated.h"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
class RangeBufExecution : public Execution {
|
||||
public:
|
||||
RangeBufExecution(const std::string &compute, Backend *backend);
|
||||
virtual ~RangeBufExecution() = default;
|
||||
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
cl::Kernel mKernel;
|
||||
uint32_t mMaxWorkGroupSize;
|
||||
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
|
||||
std::vector<uint32_t> mLocalSize = {1, 1, 1};
|
||||
std::set<std::string> mBuildOptions;
|
||||
};
|
||||
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* RangeBufExecution_hpp */
|
||||
#endif/* MNN_OPENCL_BUFFER_CLOSED */
|
|
@ -20,12 +20,7 @@ ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend
|
|||
MNN_PRINT("start ReductionBufExecution init !\n");
|
||||
#endif
|
||||
mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
|
||||
auto reduct = op->main_as_ReductionParam();
|
||||
if (nullptr != reduct->dim()) {
|
||||
for (int i = 0; i < reduct->dim()->size(); ++i) {
|
||||
mAxis.push_back(reduct->dim()->data()[i]);
|
||||
}
|
||||
}
|
||||
mAxis = op->main_as_ReductionParam()->dim()->data()[0];
|
||||
switch (op->main_as_ReductionParam()->operation()) {
|
||||
case ReductionType_MEAN:
|
||||
mReductType = 0;
|
||||
|
@ -51,44 +46,129 @@ ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend
|
|||
#endif
|
||||
}
|
||||
|
||||
int ReductionBufExecution::getLocalSize(int size, int maxGroupSize){
|
||||
int local_size = 1;
|
||||
while(local_size * 2 <= maxGroupSize && local_size * 2 <= size){
|
||||
local_size *= 2;
|
||||
}
|
||||
return local_size;
|
||||
}
|
||||
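getLocalSize above simply picks the largest power of two that fits both the problem size and the device's work-group limit, which keeps the tree reduction in the kernels balanced. For example getLocalSize(100, 256) is 64 and getLocalSize(1000, 256) is 256; here is a standalone copy for experimentation.

#include <cstdio>

// Largest power of two that is <= size and <= maxGroupSize (never below 1).
int getLocalSize(int size, int maxGroupSize) {
    int local_size = 1;
    while (local_size * 2 <= maxGroupSize && local_size * 2 <= size) {
        local_size *= 2;
    }
    return local_size;
}

int main() {
    std::printf("%d %d %d\n",
                getLocalSize(100, 256),    // 64
                getLocalSize(1000, 256),   // 256
                getLocalSize(1, 256));     // 1
    return 0;
}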
|
||||
ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
|
||||
MNN_ASSERT(mAxis.size() == 1);
|
||||
MNN_ASSERT(mAxis[0] == 1);
|
||||
|
||||
auto runtime = mOpenCLBackend->getOpenCLRuntime();
|
||||
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
auto runtime = openCLBackend->getOpenCLRuntime();
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||
//N=outside H=axis W=inside C=1
|
||||
if(mAxis < 0){
|
||||
mAxis = input->dimensions() + mAxis;
|
||||
}
|
||||
int inside = 1;
|
||||
int outside = 1;
|
||||
for(int i = 0; i < mAxis; ++i){
|
||||
outside *= input->length(i);
|
||||
}
|
||||
for(int i = mAxis + 1; i < input->dimensions(); ++i){
|
||||
inside *= input->length(i);
|
||||
}
|
||||
int dim = input->length(mAxis);
|
||||
int local_size = 0;
|
||||
auto MaxWorkItems = runtime->getMaxWorkItemSizes();
|
||||
|
||||
mGlobalWorkSize = {static_cast<uint32_t>(inputShape[0]), static_cast<uint32_t>(inputShape[2])};
|
||||
mLocalWorkSize = {1, 1, 1};
|
||||
if(dim >= 16){
|
||||
mUseLocal = true;
|
||||
}
|
||||
|
||||
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||
std::vector<int> outputShape = tensorShapeFormat(output);
|
||||
|
||||
int batch = inputShape.at(0);
|
||||
int inputHeight = inputShape.at(1);
|
||||
int inputWidth = inputShape.at(2);
|
||||
int inputChannels = inputShape.at(3);
|
||||
int inputChannelBlocks = (inputChannels + 3) / 4;
|
||||
int outputBatch = outputShape.at(0);
|
||||
int outputHeight = outputShape.at(1);
|
||||
int outputWidth = outputShape.at(2);
|
||||
int outputChannels = outputShape.at(3);
|
||||
int outputChannelBlocks = (outputChannels + 3) / 4;
|
||||
|
||||
std::set<std::string> buildOption;
|
||||
switch (mReductType) {
|
||||
case 0:
|
||||
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
|
||||
buildOption.emplace("-DGET_AVG");
|
||||
buildOption.emplace("-DVALUE=0");
|
||||
break;
|
||||
case 1:
|
||||
buildOption.emplace("-DOPERATE(a,b)=max(a,b)");
|
||||
buildOption.emplace("-DVALUE=-FLT_MAX");
|
||||
break;
|
||||
case 2:
|
||||
buildOption.emplace("-DOPERATE(a,b)=min(a,b)");
|
||||
buildOption.emplace("-DVALUE=FLT_MAX");
|
||||
break;
|
||||
case 3:
|
||||
buildOption.emplace("-DOPERATE(a,b)=(a*b)");
|
||||
buildOption.emplace("-DVALUE=1");
|
||||
break;
|
||||
case 4:
|
||||
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
|
||||
buildOption.emplace("-DVALUE=0");
|
||||
break;
|
||||
default:
|
||||
MNN_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_buf", buildOption);
|
||||
|
||||
mGlobalWorkSize = {
|
||||
static_cast<uint32_t>(outputWidth),
|
||||
static_cast<uint32_t>(outputHeight),
|
||||
static_cast<uint32_t>(outputBatch * outputChannelBlocks)
|
||||
};
|
||||
|
||||
if(mUseLocal){
|
||||
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
|
||||
local_size = getLocalSize(inputWidth, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_width_buf", buildOption);
|
||||
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
|
||||
local_size = getLocalSize(inputHeight, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_height_buf", buildOption);
|
||||
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
|
||||
local_size = getLocalSize(inputChannelBlocks - 1, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
if(output->buffer().dimensions == 1){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_dim1_buf", buildOption);
|
||||
}else{
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_buf", buildOption);
|
||||
}
|
||||
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
|
||||
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
|
||||
local_size = getLocalSize(batch, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_batch_buf", buildOption);
|
||||
}
|
||||
mGlobalWorkSize[0] *= local_size;
|
||||
}else{
|
||||
buildOption.emplace("-DLOCAL_SIZE=0");
|
||||
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_width_buf", buildOption);
|
||||
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_height_buf", buildOption);
|
||||
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
|
||||
if(output->buffer().dimensions == 1){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_dim1_buf", buildOption);
|
||||
}else{
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_buf", buildOption);
|
||||
}
|
||||
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
|
||||
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_batch_buf", buildOption);
|
||||
}
|
||||
}
|
||||
//printf("reduce axis:%d , %d %d %d %d, useLocal:%d\n", mAxis[0], inputShape[0], inputShape[1], inputShape[2], inputShape[3], mUseLocal);
|
||||
|
||||
mUnits.resize(1);
|
||||
|
@ -96,14 +176,27 @@ ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, c
|
|||
cl_int ret = CL_SUCCESS;
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||
ret |= mReduct1DKernel.setArg(idx++, openCLBuffer(input));
|
||||
ret |= mReduct1DKernel.setArg(idx++, openCLBuffer(output));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[0]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[1]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[2]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[3]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputWidth);
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputHeight);
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputChannels);
|
||||
ret |= mReduct1DKernel.setArg(idx++, batch);
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputChannelBlocks);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputWidth);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputHeight);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputChannels);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputChannelBlocks);
|
||||
MNN_CHECK_CL_SUCCESS(ret, "setArg ReductionBufExecution");
|
||||
|
||||
if(mUseLocal){
|
||||
mLocalWorkSize = {static_cast<uint32_t>(local_size), 1, 1};
|
||||
}else{
|
||||
auto MaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mReduct1DKernel));
|
||||
std::string kernelName = "reduct_buf";
|
||||
mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, MaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mReduct1DKernel).first;
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
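Each ReductionType earlier in onResize is lowered to an (operator, identity) pair handed to the kernel as -DOPERATE(a,b)=... and -DVALUE=... (plus -DGET_AVG for MEAN). The identity value matters because it seeds the accumulator, so the reduction loop needs no first-iteration special case. A hedged host-side sketch of the same fold, with the operator passed explicitly:

#include <cstdio>
#include <cfloat>
#include <functional>
#include <vector>

// One (operator, identity) pair per reduction type, matching the build options:
//   SUM/MEAN -> (a+b), 0    MAX -> max, -FLT_MAX    MIN -> min, FLT_MAX    PROD -> (a*b), 1
float fold(const std::vector<float>& data, std::function<float(float, float)> op,
           float identity, bool mean) {
    float acc = identity;                 // identity seeds the accumulator
    for (float v : data) acc = op(acc, v);
    if (mean && !data.empty()) acc /= (float)data.size();
    return acc;
}

int main() {
    std::vector<float> d = {3.f, -1.f, 4.f, 1.f};
    std::printf("sum=%g\n",  fold(d, [](float a, float b){ return a + b; }, 0.f, false));
    std::printf("mean=%g\n", fold(d, [](float a, float b){ return a + b; }, 0.f, true));
    std::printf("max=%g\n",  fold(d, [](float a, float b){ return a > b ? a : b; }, -FLT_MAX, false));
    std::printf("prod=%g\n", fold(d, [](float a, float b){ return a * b; }, 1.f, false));
    return 0;
}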
|
@ -114,12 +207,12 @@ ErrorCode ReductionBufExecution::onExecute(const std::vector<Tensor *> &inputs,
|
|||
|
||||
#ifdef ENABLE_OPENCL_TIME_PROFILER
|
||||
cl::Event event;
|
||||
runKernel2D(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
|
||||
run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
|
||||
mOpenCLBackend->getOpenCLRuntime(), &event);
|
||||
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
|
||||
MNN_PRINT("kernel cost:%d us Reduct1D\n",costTime);
|
||||
#else
|
||||
runKernel2D(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
|
||||
run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
|
||||
mOpenCLBackend->getOpenCLRuntime());
|
||||
#endif
|
||||
|
||||
|
@ -140,7 +233,7 @@ public:
|
|||
for (int i = 0; i < outputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(outputs[i], false);
|
||||
}
|
||||
if (inputs[0]->getDimensionType() == Tensor::TENSORFLOW) {
|
||||
|
||||
auto openCLBackend = static_cast<OpenCLBackend *>(backend);
|
||||
auto reduct = op->main_as_ReductionParam();
|
||||
if (nullptr == reduct->dim()) {
|
||||
|
@ -166,8 +259,6 @@ public:
|
|||
}
|
||||
return new ReductionBufExecution(op, backend);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
};
|
||||
|
||||
OpenCLCreatorRegister<ReductionBufCreator> __reductionBuf_op(OpType_Reduction, BUFFER);
|
||||
|
|
|
@ -30,12 +30,13 @@ public:
|
|||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
private:
|
||||
int getLocalSize(int size, int maxGroupSize);
|
||||
cl::Kernel mReduct1DKernel;
|
||||
std::string mKernelName;
|
||||
OpenCLBackend *mOpenCLBackend;
|
||||
MNN::DataType mdataType;
|
||||
int mReductType;
|
||||
std::vector<int> mAxis;
|
||||
int mAxis;
|
||||
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
|
||||
std::vector<uint32_t> mLocalWorkSize{1, 1, 1};
|
||||
bool mUseLocal = false;
|
||||
|
|
|
@ -0,0 +1,103 @@
|
|||
//
|
||||
// SelectBufExecution.cpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#include "backend/opencl/execution/buffer/SelectBufExecution.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
SelectBufExecution::SelectBufExecution(Backend* backend) : Execution(backend) {
|
||||
// Do nothing
|
||||
}
|
||||
ErrorCode SelectBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
auto inSize1 = inputs[1]->elementSize();
|
||||
auto inSize2 = inputs[2]->elementSize();
|
||||
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
auto runtime = openCLBackend->getOpenCLRuntime();
|
||||
if(inSize1 == 1)
|
||||
mBuildOptions.emplace("-DINSIZE1_EUQAL_1");
|
||||
if(inSize2 == 1)
|
||||
mBuildOptions.emplace("-DINSIZE2_EUQAL_1");
|
||||
mKernel = runtime->buildKernel("select_buf", "select_buf", mBuildOptions);
|
||||
mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
|
||||
|
||||
std::vector<int> outputShape = tensorShapeFormat(outputs[0]);
|
||||
|
||||
int batch = outputShape.at(0);
|
||||
int outputHeight = outputShape.at(1);
|
||||
int outputWidth = outputShape.at(2);
|
||||
int channels = outputShape.at(3);
|
||||
int channelBlocks = (channels + 3) / 4;
|
||||
int outSize = batch * channelBlocks * outputWidth * outputHeight * 4;
|
||||
|
||||
mGlobalWorkSize = {
|
||||
static_cast<uint32_t>(outSize),
|
||||
1
|
||||
};
|
||||
|
||||
uint32_t idx = 0;
|
||||
cl_int ret = CL_SUCCESS;
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[0]));
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[1]));
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[2]));
|
||||
ret |= mKernel.setArg(idx++, openCLBuffer(outputs[0]));
|
||||
MNN_CHECK_CL_SUCCESS(ret, "setArg SelectBufExecution");
|
||||
|
||||
std::string kernelName = "select_buf";
|
||||
mLocalSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode SelectBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("start SelectBufExecution onExecute...");
|
||||
#endif
|
||||
auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||
|
||||
#ifdef ENABLE_OPENCL_TIME_PROFILER
|
||||
cl::Event event;
|
||||
runKernel2D(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime(), &event);
|
||||
|
||||
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
|
||||
MNN_PRINT("kernel cost:%d us Select\n",costTime);
|
||||
#else
|
||||
runKernel2D(mKernel, mGlobalWorkSize, mLocalSize,
|
||||
mOpenCLBackend->getOpenCLRuntime());
|
||||
#endif
|
||||
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("end SelectBufExecution onExecute...");
|
||||
#endif
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
class SelectBufCreator : public OpenCLBackend::Creator {
|
||||
public:
|
||||
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||
const MNN::Op* op, Backend* backend) const override {
|
||||
for (int i = 0; i < inputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(inputs[i], false);
|
||||
}
|
||||
for (int i = 0; i < outputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(outputs[i], false);
|
||||
}
|
||||
return new SelectBufExecution(backend);
|
||||
}
|
||||
};
|
||||
|
||||
OpenCLCreatorRegister<SelectBufCreator> __SelectBuf__(OpType_Select, BUFFER);
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* MNN_OPENCL_BUFFER_CLOSED */
|
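The INSIZE1_EUQAL_1 / INSIZE2_EUQAL_1 build options above (the spelling follows the kernel macros) exist because Select allows either value branch to be a single scalar that is broadcast against the mask. A CPU reference of that behaviour:

#include <cstdio>
#include <vector>

// output[i] = mask[i] ? a[...] : b[...], where a or b may be a 1-element tensor.
std::vector<float> selectRef(const std::vector<int>& mask,
                             const std::vector<float>& a,
                             const std::vector<float>& b) {
    std::vector<float> out(mask.size());
    for (size_t i = 0; i < mask.size(); ++i) {
        float av = (a.size() == 1) ? a[0] : a[i];   // broadcast scalar branch
        float bv = (b.size() == 1) ? b[0] : b[i];
        out[i] = mask[i] ? av : bv;
    }
    return out;
}

int main() {
    auto out = selectRef({1, 0, 1, 0}, {9.f}, {1.f, 2.f, 3.f, 4.f});
    for (float v : out) std::printf("%g ", v);   // 9 2 9 4
    std::printf("\n");
    return 0;
}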
|
@ -0,0 +1,42 @@
|
|||
//
|
||||
// SelectBufExecution.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2023/08/11.
|
||||
// Copyright © 2018, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||
#ifndef SelectBufExecution_hpp
|
||||
#define SelectBufExecution_hpp
|
||||
|
||||
#include "core/Execution.hpp"
|
||||
|
||||
#include <vector>
|
||||
#include "MNN_generated.h"
|
||||
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
|
||||
|
||||
namespace MNN {
|
||||
namespace OpenCL {
|
||||
|
||||
class SelectBufExecution : public Execution {
|
||||
public:
|
||||
SelectBufExecution(Backend *backend);
|
||||
virtual ~SelectBufExecution() = default;
|
||||
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
cl::Kernel mKernel;
|
||||
uint32_t mMaxWorkGroupSize;
|
||||
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
|
||||
std::vector<uint32_t> mLocalSize = {1, 1, 1};
|
||||
std::set<std::string> mBuildOptions;
|
||||
};
|
||||
|
||||
} // namespace OpenCL
|
||||
} // namespace MNN
|
||||
#endif /* SelectBufExecution_hpp */
|
||||
#endif/* MNN_OPENCL_BUFFER_CLOSED */
|
|
@ -19,7 +19,6 @@ SoftmaxBufExecution::SoftmaxBufExecution(const std::vector<Tensor *> &inputs, in
|
|||
: Execution(backend) {
|
||||
mAxis = axis;
|
||||
mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
|
||||
buildSoftmaxKernel();
|
||||
}
|
||||
|
||||
bool SoftmaxBufExecution::buildSoftmaxKernel() {
|
||||
|
@ -44,9 +43,26 @@ ErrorCode SoftmaxBufExecution::onResize(const std::vector<Tensor *> &inputs, con
|
|||
Tensor *input = inputs[0];
|
||||
Tensor *output = outputs[0];
|
||||
|
||||
const auto dims = input->buffer().dimensions;
|
||||
int inside = 1;
|
||||
int outside = 1;
|
||||
int channel = 1;
|
||||
for (int i = 0; i < mAxis; ++i) {
|
||||
outside *= input->length(i);
|
||||
}
|
||||
channel = input->length(mAxis);
|
||||
for (int i = mAxis + 1; i < dims; ++i) {
|
||||
inside *= input->length(i);
|
||||
}
|
||||
|
||||
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||
std::vector<int> outputShape = tensorShapeFormat(output);
|
||||
|
||||
const int inputBatch = inputShape.at(0);
|
||||
const int inputHeight = inputShape.at(1);
|
||||
const int inputWidth = inputShape.at(2);
|
||||
const int inputChannels = inputShape.at(3);
|
||||
|
||||
const int outputBatch = outputShape.at(0);
|
||||
const int outputHeight = outputShape.at(1);
|
||||
const int outputWidth = outputShape.at(2);
|
||||
|
@ -54,9 +70,18 @@ ErrorCode SoftmaxBufExecution::onResize(const std::vector<Tensor *> &inputs, con
|
|||
|
||||
const int channelBlocks = UP_DIV(outputChannels, 4);
|
||||
const int remainChannels = channelBlocks * 4 - outputChannels;
|
||||
if(inputBatch == outside && channel == inputChannels && inside == inputWidth * inputHeight){
|
||||
mAxis = 1;
|
||||
}else if(inputBatch * inputChannels == outside && channel == inputHeight && inside == inputWidth){
|
||||
mAxis = 2;
|
||||
}else if(inputBatch * inputChannels * inputHeight == outside && channel == inputWidth && inside == 1){
|
||||
mAxis = 3;
|
||||
}
|
||||
buildSoftmaxKernel();
|
||||
|
||||
if (mAxis == 1) {
|
||||
mGlobalWorkSize = {static_cast<uint32_t>(channelBlocks), static_cast<uint32_t>(outputWidth),
|
||||
static_cast<uint32_t>(outputHeight * outputBatch)};
|
||||
mGlobalWorkSize = {static_cast<uint32_t>(outputWidth),
|
||||
static_cast<uint32_t>(outputHeight * outputBatch), 1};
|
||||
int shape[] = {outputBatch, channelBlocks, outputHeight, outputWidth};
|
||||
|
||||
uint32_t idx = 0;
|
||||
|
@ -132,10 +157,6 @@ class SoftmaxBufCreator : public OpenCLBackend::Creator {
|
|||
public:
|
||||
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
|
||||
const MNN::Op *op, Backend *backend) const override {
|
||||
if(inputs[0]->dimensions() == 3 || outputs[0]->dimensions() == 3){
|
||||
MNN_PRINT("softmax not support dimensions == 3 \n");
|
||||
return nullptr;
|
||||
}
|
||||
for (int i = 0; i < inputs.size(); ++i) {
|
||||
TensorUtils::setTensorSupportPack(inputs[i], false);
|
||||
}
|
||||
|
|
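The new onResize logic above first reduces the requested softmax axis to an (outside, channel, inside) triple and then rewrites mAxis to 1, 2 or 3 depending on which image dimension that triple lines up with, so one of the prebuilt kernels can be reused. For reference, a small plain-C++ softmax over such a triple, numerically stabilized by subtracting the running max:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Softmax along the "channel" extent of a tensor viewed as outside x channel x inside.
void softmaxRef(std::vector<float>& data, int outside, int channel, int inside) {
    for (int o = 0; o < outside; ++o) {
        for (int i = 0; i < inside; ++i) {
            float* base = data.data() + o * channel * inside + i;
            float maxV = base[0];
            for (int c = 1; c < channel; ++c) maxV = std::max(maxV, base[c * inside]);
            float sum = 0.f;
            for (int c = 0; c < channel; ++c) {
                base[c * inside] = std::exp(base[c * inside] - maxV);
                sum += base[c * inside];
            }
            for (int c = 0; c < channel; ++c) base[c * inside] /= sum;
        }
    }
}

int main() {
    std::vector<float> d = {1.f, 2.f, 3.f, 4.f};   // outside=1, channel=4, inside=1
    softmaxRef(d, 1, 4, 1);
    for (float v : d) std::printf("%.4f ", v);     // 0.0321 0.0871 0.2369 0.6439
    std::printf("\n");
    return 0;
}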
|
@ -0,0 +1,254 @@
|
|||
#ifdef MNN_SUPPORT_FP16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
#define GLOBAL_SIZE_3_DIMS \
|
||||
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
|
||||
|
||||
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
|
||||
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
|
||||
return; \
|
||||
}
|
||||
|
||||
__kernel void argmax_width_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + 0)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + 0)*4;
|
||||
int4 index = 0;
|
||||
FLOAT4 maxValue = vload4(0, input + offset);
|
||||
for(int i = 1; i < inputWidth; ++i){
|
||||
FLOAT4 value = vload4(i, input + offset);
|
||||
#ifdef ARGMAX
|
||||
index = maxValue < value ? (int4)i : index;
|
||||
maxValue = fmax(maxValue, value);
|
||||
#else
|
||||
index = maxValue > value ? (int4)i : index;
|
||||
maxValue = fmin(maxValue, value);
|
||||
#endif
|
||||
}
|
||||
vstore4(CONVERT_FLOAT4(index), 0, output + outputOffset);
|
||||
}
|
||||
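argmax_width_buf above keeps four channel lanes in flight at once: each FLOAT4 compare produces a per-lane mask, and the index = maxValue < value ? (int4)i : index select updates only the lanes where a new maximum was seen, so ties keep the first index. The same per-lane logic in scalar C++ (four independent running argmaxes):

#include <cstdio>

// Four independent argmax scans, mimicking the FLOAT4 / int4 lanes of the kernel.
void argmax4(const float (*rows)[4], int n, int outIndex[4]) {
    float best[4];
    for (int lane = 0; lane < 4; ++lane) { best[lane] = rows[0][lane]; outIndex[lane] = 0; }
    for (int i = 1; i < n; ++i) {
        for (int lane = 0; lane < 4; ++lane) {
            if (best[lane] < rows[i][lane]) {   // same strict predicate as the vector select
                best[lane] = rows[i][lane];
                outIndex[lane] = i;
            }
        }
    }
}

int main() {
    const float rows[3][4] = {{1, 5, 2, 0}, {4, 1, 2, 9}, {3, 6, 2, 1}};
    int idx[4];
    argmax4(rows, 3, idx);
    std::printf("%d %d %d %d\n", idx[0], idx[1], idx[2], idx[3]);   // 1 2 0 1
    return 0;
}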
|
||||
|
||||
__kernel void argmax_height_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * oututWidth + width_idx)*4;
|
||||
int4 index = 0;
|
||||
FLOAT4 maxValue = vload4(0, input + offset);
|
||||
for(int i = 1; i < inputHeight; ++i){
|
||||
FLOAT4 value = vload4(i * inputWidth, input + offset);
|
||||
#ifdef ARGMAX
|
||||
index = maxValue < value ? (int4)i : index;
|
||||
maxValue = fmax(maxValue, value);
|
||||
#else
|
||||
index = maxValue > value ? (int4)i : index;
|
||||
maxValue = fmin(maxValue, value);
|
||||
#endif
|
||||
}
|
||||
vstore4(CONVERT_FLOAT4(index), 0, output + outputOffset);
|
||||
}
|
||||
|
||||
__kernel void argmax_channel_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * oututWidth + width_idx)*4;
|
||||
int index = 0;
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
#ifdef ARGMAX
|
||||
FLOAT maxValue = (FLOAT)-FLT_MAX;
|
||||
#else
|
||||
FLOAT maxValue = (FLOAT)FLT_MAX;
|
||||
#endif
|
||||
FLOAT4 value;
|
||||
FLOAT *valuePtr = (FLOAT*)&value;
|
||||
for(int i = 0; i < inputChannelBlock - 1; ++i){
|
||||
value = vload4(i * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < 4; ++j){
|
||||
#ifdef ARGMAX
|
||||
if(maxValue < valuePtr[j]){
|
||||
index = i * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#else
|
||||
if(maxValue > valuePtr[j]){
|
||||
index = i * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
value = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < remain; ++j){
|
||||
#ifdef ARGMAX
|
||||
if(maxValue < valuePtr[j]){
|
||||
index = (inputChannelBlock - 1) * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#else
|
||||
if(maxValue > valuePtr[j]){
|
||||
index = (inputChannelBlock - 1) * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
output[outputOffset] = (FLOAT)index;
|
||||
}
|
||||
|
||||
__kernel void argmax_channel_dim1_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((batch_idx * outputHeight + height_idx) * oututWidth + width_idx);
|
||||
int index = 0;
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
#ifdef ARGMAX
|
||||
FLOAT maxValue = (FLOAT)-FLT_MAX;
|
||||
#else
|
||||
FLOAT maxValue = (FLOAT)FLT_MAX;
|
||||
#endif
|
||||
FLOAT4 value;
|
||||
FLOAT *valuePtr = (FLOAT*)&value;
|
||||
for(int i = 0; i < inputChannelBlock - 1; ++i){
|
||||
value = vload4(i * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < 4; ++j){
|
||||
#ifdef ARGMAX
|
||||
if(maxValue < valuePtr[j]){
|
||||
index = i * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#else
|
||||
if(maxValue > valuePtr[j]){
|
||||
index = i * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
value = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < remain; ++j){
|
||||
#ifdef ARGMAX
|
||||
if(maxValue < valuePtr[j]){
|
||||
index = (inputChannelBlock - 1) * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#else
|
||||
if(maxValue > valuePtr[j]){
|
||||
index = (inputChannelBlock - 1) * 4 + j;
|
||||
maxValue = valuePtr[j];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
output[outputOffset] = (FLOAT)index;
|
||||
}
|
||||
|
||||
|
||||
__kernel void argmax_batch_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, channel_idx);
|
||||
|
||||
const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + width_idx)*4;
|
||||
int4 index = 0;
|
||||
int batchOffset = inputChannelBlock * inputHeight * inputWidth;
|
||||
FLOAT4 maxValue = vload4(0, input + offset);
|
||||
for(int i = 1; i < inputBatch; ++i){
|
||||
FLOAT4 value = vload4(i * batchOffset, input + offset);
|
||||
#ifdef ARGMAX
|
||||
index = maxValue < value ? (int4)i : index;
|
||||
maxValue = fmax(maxValue, value);
|
||||
#else
|
||||
index = maxValue > value ? (int4)i : index;
|
||||
maxValue = fmin(maxValue, value);
|
||||
#endif
|
||||
}
|
||||
vstore4(CONVERT_FLOAT4(index), 0, output + outputOffset);
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
#ifdef MNN_SUPPORT_FP16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
#define GLOBAL_SIZE_3_DIMS \
|
||||
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
|
||||
|
||||
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
|
||||
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
|
||||
return; \
|
||||
}
|
||||
|
||||
__kernel void cast_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int width,
|
||||
__private const int height,
|
||||
__private const int channelBlock
|
||||
) {
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / channelBlock;
|
||||
const int channel_idx = batch_channel_idx % channelBlock;
|
||||
|
||||
const int inp_offset = ((((batch_idx * channelBlock) + channel_idx) * height + height_idx) * width + width_idx)*4;
|
||||
#ifdef TO_BOOL
|
||||
int4 value = convert_int4(vload4(0, input + inp_offset));
|
||||
value = value == (int4)0 ? (int4)0 : (int4)1;
|
||||
vstore4(CONVERT_FLOAT4(value), 0, output + inp_offset);
|
||||
#else
|
||||
FLOAT4 value = vload4(0, input + inp_offset);
|
||||
vstore4(value, 0, output + inp_offset);
|
||||
#endif
|
||||
}
|
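For the TO_BOOL path above: the cast is not a bit reinterpretation; the kernel converts to int first, maps any non-zero value to 1, and writes the result back in the output's float format. A one-line reference of the same rule:

#include <cstdio>

// TO_BOOL semantics: truncate to int, then nonzero -> 1, zero -> 0 (stored back as float).
inline float castToBool(float v) { return (int)v == 0 ? 0.f : 1.f; }

int main() {
    std::printf("%g %g %g\n", castToBool(0.f), castToBool(3.5f), castToBool(-2.f));  // 0 1 1
    return 0;
}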
|
@ -0,0 +1,40 @@
|
|||
#ifdef MNN_SUPPORT_FP16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
#define GLOBAL_SIZE_3_DIMS \
|
||||
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
|
||||
|
||||
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
|
||||
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
|
||||
return; \
|
||||
}
|
||||
|
||||
__kernel void range_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input0,
|
||||
__global const FLOAT* input2,
|
||||
__global FLOAT* output,
|
||||
__private const int width,
|
||||
__private const int height,
|
||||
__private const int channel,
|
||||
__private const int channelBlock
|
||||
) {
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / channelBlock;
|
||||
const int channel_idx = batch_channel_idx % channelBlock;
|
||||
|
||||
const int offset = ((((batch_idx * channelBlock) + channel_idx) * height + height_idx) * width + width_idx)*4;
|
||||
const int channel4 = channel_idx << 2;
|
||||
int index = (((batch_idx * channel) + channel4) * height + height_idx) * width + width_idx;
|
||||
int size = height * width;
|
||||
int4 index4 = (int4)(index, index + size, index + size * 2, index + size * 3);
|
||||
FLOAT start = input0[0];
|
||||
FLOAT step = input2[0];
|
||||
FLOAT4 value = (FLOAT4)start + CONVERT_FLOAT4(index4) * (FLOAT4)step;
|
||||
vstore4(value, 0, output + offset);
|
||||
}
|
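The index math in range_buf above is the part worth double-checking: one work-item writes four consecutive channels, and in the flattened NCHW ordering those channels sit height*width elements apart, hence index4 = index + (0,1,2,3) * size. A host-side check of that stride reasoning:

#include <cstdio>

// Flattened NCHW offset of element (b, c, h, w).
int flatIndex(int b, int c, int h, int w, int C, int H, int W) {
    return ((b * C + c) * H + h) * W + w;
}

int main() {
    const int C = 8, H = 3, W = 5;
    int base = flatIndex(0, 4, 1, 2, C, H, W);   // first channel of the 4-lane group
    int size = H * W;
    // The next three lanes are exactly `size` elements apart, as in index4.
    std::printf("%d %d %d %d\n", base, base + size, base + 2 * size, base + 3 * size);
    std::printf("check: %d\n", flatIndex(0, 5, 1, 2, C, H, W));   // equals base + size
    return 0;
}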
|
@ -11,308 +11,285 @@
|
|||
#define GLOBAL_SIZE_2_DIMS \
|
||||
__private const int global_size_dim0, __private const int global_size_dim1,
|
||||
|
||||
#define GLOBAL_SIZE_3_DIMS \
|
||||
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
|
||||
|
||||
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
|
||||
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
|
||||
return; \
|
||||
}
|
||||
|
||||
|
||||
__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||
|
||||
|
||||
__kernel void reduct_general_mean(GLOBAL_SIZE_2_DIMS
|
||||
__kernel void reduct_width(GLOBAL_SIZE_3_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int batch_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
FLOAT4 sum = 0;
|
||||
for (int h = 0; h < height; h++) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
sum = sum + in;
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
const int bh = batch_idx*inputHeight+height_idx;
|
||||
const int wc = channel_idx*inputWidth;
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
|
||||
#if LOCAL_SIZE > 0
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT4 local sum[LOCAL_SIZE];
|
||||
for(int i = lid; i < inputWidth; i+=LOCAL_SIZE){
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc+i, bh));
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
FLOAT* sum_ptr = (FLOAT*)∑
|
||||
for(int i = 1; i < channel; ++i){
|
||||
sum.x += sum_ptr[i];
|
||||
sum[lid] = out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x/(height*channel), 0.0, 0.0, 0.0));
|
||||
out = sum[0];
|
||||
#else
|
||||
for(int i = 0; i < inputWidth; ++i){
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc+i, bh));
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
__kernel void reduct_general_sum(GLOBAL_SIZE_2_DIMS
|
||||
#endif
|
||||
|
||||
#ifdef GET_AVG
|
||||
out = out / inputWidth;
|
||||
#endif
|
||||
WI_F(output, (int2)(channel_idx, bh), out);
|
||||
}
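The reduct_width kernel above reduces along W for every (batch, height, channel-block) position: with LOCAL_SIZE > 0 each work-item accumulates a strided slice of the row and the partial results are combined by a halving tree in local memory, otherwise a plain loop over inputWidth is used. A scalar C++ sketch of the value it produces, with OPERATE and the GET_AVG mean case modeled as ordinary parameters (Float4 struct and names are illustrative assumptions):

#include <vector>
#include <functional>

struct Float4 { float x, y, z, w; };

// Reference for reduct_width: reduce a C4-packed image along W (sketch only).
// data is indexed as [((b*channelBlock + cb)*height + h)*width + w], one Float4 per entry.
std::vector<Float4> reduceWidth(const std::vector<Float4>& data,
                                int batch, int channelBlock, int height, int width,
                                Float4 init,
                                const std::function<Float4(Float4, Float4)>& op,
                                bool average) {
    std::vector<Float4> out(batch * channelBlock * height, init);
    for (int b = 0; b < batch; ++b)
        for (int cb = 0; cb < channelBlock; ++cb)
            for (int h = 0; h < height; ++h) {
                Float4 acc = init;
                for (int w = 0; w < width; ++w)
                    acc = op(acc, data[((b * channelBlock + cb) * height + h) * width + w]);
                if (average) {  // GET_AVG path divides by the reduced length
                    acc.x /= width; acc.y /= width; acc.z /= width; acc.w /= width;
                }
                out[(b * channelBlock + cb) * height + h] = acc;
            }
    return out;
}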
|
||||
|
||||
|
||||
__kernel void reduct_height(GLOBAL_SIZE_3_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int batch_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
#if LOCAL_SIZE > 0
|
||||
const int width_local_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
FLOAT4 sum = 0;
|
||||
for (int h = 0; h < height; h++) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
sum = sum + in;
|
||||
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int width_idx = get_group_id(0);
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
|
||||
const int bh = batch_idx*inputHeight;
|
||||
const int wc = channel_idx*inputWidth+width_idx;
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT4 local sum[LOCAL_SIZE];
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = lid; i < inputHeight; i+=LOCAL_SIZE){
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, bh+i));
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
FLOAT* sum_ptr = (FLOAT*)∑
|
||||
for(int i = 1; i < channel; ++i){
|
||||
sum.x += sum_ptr[i];
|
||||
sum[lid] = out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
|
||||
out = sum[0];
|
||||
#else
|
||||
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
|
||||
const int bh = batch_idx*inputHeight;
|
||||
const int wc = channel_idx*inputWidth+width_idx;
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = 0; i < inputHeight; ++i){
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, bh+i));
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GET_AVG
|
||||
out = out / inputHeight;
|
||||
#endif
|
||||
WI_F(output, (int2)(wc, batch_idx), out);
|
||||
}
|
||||
|
||||
__kernel void reduct_general_max(GLOBAL_SIZE_2_DIMS
|
||||
__kernel void reduct_channel(GLOBAL_SIZE_3_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int batch_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
#if LOCAL_SIZE > 0
|
||||
const int width_local_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
FLOAT4 sum = (FLOAT4)-MAXFLOAT;
|
||||
for (int h = 0; h < height; h++) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
sum = max(sum, in);
|
||||
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_idx);
|
||||
const int width_idx = get_group_id(0);
|
||||
|
||||
const int bh = batch_idx*inputHeight+height_idx;
|
||||
const int wc = width_idx;
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT local sum[LOCAL_SIZE];
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
FLOAT4 in;
|
||||
FLOAT *inPtr = (FLOAT*)∈
|
||||
for(int i = lid; i < inputChannelBlock - 1; i += LOCAL_SIZE){
|
||||
in = RI_F(input, SAMPLER, (int2)(i*inputWidth+wc, bh));
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
FLOAT* sum_ptr = (FLOAT*)∑
|
||||
for(int i = 1; i < channel; ++i){
|
||||
sum.x = max(sum.x, sum_ptr[i]);
|
||||
out.x = OPERATE(out.x, out.y);
|
||||
out.x = OPERATE(out.x, out.z);
|
||||
out.x = OPERATE(out.x, out.w);
|
||||
sum[lid] = out.x;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
|
||||
out.x = sum[0];
|
||||
in = RI_F(input, SAMPLER, (int2)((inputChannelBlock - 1)*inputWidth+wc, bh));
|
||||
for(int j = 0; j < remain; ++j){
|
||||
out.x = OPERATE(out.x, inPtr[j]);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out.x = out.x / inputChannel;
|
||||
#endif
|
||||
WI_F(output, (int2)(wc, bh), (FLOAT4)(out.x, 0, 0, 0));
|
||||
|
||||
#else
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
|
||||
|
||||
const int bh = batch_idx*inputHeight+height_idx;
|
||||
const int wc = width_idx;
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
|
||||
FLOAT out = (FLOAT)VALUE;
|
||||
FLOAT4 in;
|
||||
FLOAT *inPtr = (FLOAT*)∈
|
||||
|
||||
for(int i = 0; i < inputChannelBlock - 1; ++i){
|
||||
in = RI_F(input, SAMPLER, (int2)(i*inputWidth+wc, bh));
|
||||
for(int j = 0; j < 4; ++j){
|
||||
out = OPERATE(out, inPtr[j]);
|
||||
}
|
||||
}
|
||||
in = RI_F(input, SAMPLER, (int2)((inputChannelBlock - 1)*inputWidth+wc, bh));
|
||||
for(int j = 0; j < remain; ++j){
|
||||
out = OPERATE(out, inPtr[j]);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out = out / inputChannel;
|
||||
#endif
|
||||
WI_F(output, (int2)(wc, bh), (FLOAT4)(out, 0, 0, 0));
|
||||
#endif
|
||||
}
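reduct_channel above reduces over the real channel count rather than the padded C4 blocks: full blocks are combined lane-wise, the four lanes are folded into .x, and only remain = inputChannel - (inputChannelBlock - 1) * 4 lanes of the last block are consumed. A scalar C++ sketch of that tail handling (plain-array layout and names are illustrative assumptions):

#include <vector>
#include <functional>

// Reference for reduct_channel's padded-tail handling (sketch, not the MNN API).
// channels[c] holds the value of logical channel c at one (b, h, w) position.
float reduceChannels(const std::vector<float>& channels, float init,
                     const std::function<float(float, float)>& op, bool average) {
    const int inputChannel = (int)channels.size();
    const int channelBlock = (inputChannel + 3) / 4;
    const int remain = inputChannel - (channelBlock - 1) * 4;  // 1..4 valid lanes in the last block
    float acc = init;
    for (int cb = 0; cb < channelBlock - 1; ++cb)              // full blocks: all 4 lanes
        for (int lane = 0; lane < 4; ++lane)
            acc = op(acc, channels[cb * 4 + lane]);
    for (int lane = 0; lane < remain; ++lane)                  // last block: only valid lanes
        acc = op(acc, channels[(channelBlock - 1) * 4 + lane]);
    if (average) acc /= inputChannel;                          // GET_AVG divides by real channel count
    return acc;
}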
|
||||
|
||||
__kernel void reduct_general_min(GLOBAL_SIZE_2_DIMS
|
||||
__kernel void reduct_batch(GLOBAL_SIZE_3_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int batch_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
#if LOCAL_SIZE > 0
|
||||
const int width_local_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int channel_idx = get_global_id(2);
|
||||
|
||||
FLOAT4 sum = (FLOAT4)MAXFLOAT;
|
||||
for (int h = 0; h < height; h++) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
sum = min(sum, in);
|
||||
}
|
||||
FLOAT* sum_ptr = (FLOAT*)∑
|
||||
for(int i = 1; i < channel; ++i){
|
||||
sum.x = min(sum.x, sum_ptr[i]);
|
||||
}
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
|
||||
}
|
||||
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, channel_idx);
|
||||
const int width_idx = get_group_id(0);
|
||||
|
||||
__kernel void reduct_general_mul(GLOBAL_SIZE_2_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
) {
|
||||
const int batch_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
|
||||
FLOAT4 sum = (FLOAT4)1.0;
|
||||
for (int h = 0; h < height; h++) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
sum = sum * in;
|
||||
const int bh = height_idx;
|
||||
const int wc = channel_idx*inputWidth+width_idx;
|
||||
int batchOffset = inputChannelBlock * inputHeight * inputWidth;
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT4 local sum[LOCAL_SIZE];
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = lid; i < inputBatch; i+=LOCAL_SIZE){
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, i*inputHeight+bh));
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
FLOAT* sum_ptr = (FLOAT*)∑
|
||||
for(int i = 1; i < channel; ++i){
|
||||
sum.x *= sum_ptr[i];
|
||||
}
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum.x, 0.0, 0.0, 0.0));
|
||||
}
|
||||
|
||||
__kernel void reduct_general_mean_local(GLOBAL_SIZE_2_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
) {
|
||||
const int batch_idx = get_global_id(1);
|
||||
const int width_idx = get_global_id(2);
|
||||
|
||||
const int idx = get_local_id(0);
|
||||
FLOAT local sum[256];
|
||||
FLOAT4 out = (FLOAT4)0.0;
|
||||
const int reduce_num = get_local_size(0);
|
||||
|
||||
for (int h = idx; h < height; h+=reduce_num) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
out = out + in;
|
||||
}
|
||||
FLOAT* out_ptr = (FLOAT*)&out;
|
||||
for(int i = 1; i < channel; ++i){
|
||||
out.x += out_ptr[i];
|
||||
}
|
||||
sum[idx] = out.x;
|
||||
|
||||
sum[lid] = out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = reduce_num/2; i > 0; i /= 2){
|
||||
if (idx < i)
|
||||
sum[idx] = sum[idx] + sum[idx + i];
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (idx == 0) {
|
||||
out = sum[0];
|
||||
#ifdef GET_AVG
|
||||
out = out / inputBatch;
|
||||
#endif
|
||||
WI_F(output, (int2)(wc, bh), out);
|
||||
#else
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int channel_idx = get_global_id(2);
|
||||
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0]/(height*channel), 0.0, 0.0, 0.0));
|
||||
}
|
||||
}
|
||||
__kernel void reduct_general_sum_local(GLOBAL_SIZE_2_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
) {
|
||||
const int batch_idx = get_global_id(1);
|
||||
const int width_idx = get_global_id(2);
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, channel_idx);
|
||||
|
||||
const int idx = get_local_id(0);
|
||||
FLOAT local sum[256];
|
||||
FLOAT4 out = (FLOAT4)0.0;
|
||||
const int reduce_num = get_local_size(0);
|
||||
|
||||
for (int h = idx; h < height; h+=reduce_num) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
out = out + in;
|
||||
}
|
||||
FLOAT* out_ptr = (FLOAT*)&out;
|
||||
for(int i = 1; i < channel; ++i){
|
||||
out.x += out_ptr[i];
|
||||
}
|
||||
sum[idx] = out.x;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = reduce_num/2; i > 0; i /= 2){
|
||||
if (idx < i)
|
||||
sum[idx] = sum[idx] + sum[idx + i];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (idx == 0) {
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0], 0.0, 0.0, 0.0));
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void reduct_general_max_local(GLOBAL_SIZE_2_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
) {
|
||||
const int batch_idx = get_global_id(1);
|
||||
const int width_idx = get_global_id(2);
|
||||
|
||||
const int idx = get_local_id(0);
|
||||
FLOAT local sum[256];
|
||||
FLOAT4 out = (FLOAT4)(-MAXFLOAT);
|
||||
const int reduce_num = get_local_size(0);
|
||||
|
||||
for (int h = idx; h < height; h+=reduce_num) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
out = max(out, in);
|
||||
}
|
||||
FLOAT* out_ptr = (FLOAT*)&out;
|
||||
for(int i = 1; i < channel; ++i){
|
||||
out.x = max(out.x, out_ptr[i]);
|
||||
}
|
||||
sum[idx] = out.x;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = reduce_num/2; i > 0; i /= 2){
|
||||
if (idx < i)
|
||||
sum[idx] = max(sum[idx], sum[idx + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (idx == 0) {
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0], 0.0, 0.0, 0.0));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
__kernel void reduct_general_min_local(GLOBAL_SIZE_2_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
) {
|
||||
const int batch_idx = get_global_id(1);
|
||||
const int width_idx = get_global_id(2);
|
||||
|
||||
const int idx = get_local_id(0);
|
||||
FLOAT local sum[256];
|
||||
FLOAT4 out = (FLOAT4)(MAXFLOAT);
|
||||
|
||||
const int reduce_num = get_local_size(0);
|
||||
|
||||
for (int h = idx; h < height; h+=reduce_num) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
out = min(out, in);
|
||||
}
|
||||
FLOAT* out_ptr = (FLOAT*)&out;
|
||||
for(int i = 1; i < channel; ++i){
|
||||
out.x = min(out.x, out_ptr[i]);
|
||||
}
|
||||
sum[idx] = out.x;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = reduce_num/2; i > 0; i /= 2){
|
||||
if (idx < i)
|
||||
sum[idx] = min(sum[idx], sum[idx + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (idx == 0) {
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0], 0.0, 0.0, 0.0));
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void reduct_general_mul_local(GLOBAL_SIZE_2_DIMS
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
) {
|
||||
const int batch_idx = get_global_id(1);
|
||||
const int width_idx = get_global_id(2);
|
||||
|
||||
const int idx = get_local_id(0);
|
||||
FLOAT local sum[256];
|
||||
FLOAT4 out = (FLOAT4)1.0;
|
||||
|
||||
const int reduce_num = get_local_size(0);
|
||||
|
||||
for (int h = idx; h < height; h+=reduce_num) {
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(width_idx, batch_idx*height+h));
|
||||
out = out * in;
|
||||
}
|
||||
FLOAT* out_ptr = (FLOAT*)&out;
|
||||
for(int i = 1; i < channel; ++i){
|
||||
out.x *= out_ptr[i];
|
||||
}
|
||||
sum[idx] = out.x;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = reduce_num/2; i > 0; i /= 2){
|
||||
if (idx < i)
|
||||
sum[idx] = sum[idx] * sum[idx + i];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (idx == 0) {
|
||||
WI_F(output, (int2)(width_idx, batch_idx), (FLOAT4)(sum[0], 0.0, 0.0, 0.0));
|
||||
const int bh = height_idx;
|
||||
const int wc = channel_idx*inputWidth+width_idx;
|
||||
int batchOffset = inputChannelBlock * inputHeight * inputWidth;
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = 0; i < inputBatch; ++i){
|
||||
FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, i*inputHeight+bh));
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out = out / inputBatch;
|
||||
#endif
|
||||
WI_F(output, (int2)(wc, bh), out);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -9,31 +9,363 @@
|
|||
#define GLOBAL_SIZE_2_DIMS \
|
||||
__private const int global_size_dim0, __private const int global_size_dim1,
|
||||
|
||||
__kernel void reduct_buf(GLOBAL_SIZE_2_DIMS
|
||||
#define GLOBAL_SIZE_3_DIMS \
|
||||
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
|
||||
|
||||
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
|
||||
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
|
||||
return; \
|
||||
}
|
||||
|
||||
__kernel void reduct_width_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int batch,
|
||||
__private const int height,
|
||||
__private const int width,
|
||||
__private const int channel
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
const int batch_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
const int inp_offset = ((batch_idx * height + 0) * width + width_idx)*4;
|
||||
FLOAT4 out = vload4(0, input + inp_offset);
|
||||
for (int h = 1; h < height; h++) {
|
||||
FLOAT4 in = vload4(0, input + inp_offset + h*width*4);
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + 0)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + 0)*4;
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
|
||||
#if LOCAL_SIZE > 0
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT4 local sum[LOCAL_SIZE];
|
||||
for(int i = lid; i < inputWidth; i+=LOCAL_SIZE){
|
||||
FLOAT4 in = vload4(i, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
FLOAT* out_ptr = (FLOAT*)&out;
|
||||
for(int c = 1; c < channel; ++c){
|
||||
out.x = OPERATE(out.x, out_ptr[c]);
|
||||
sum[lid] = out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
out = sum[0];
|
||||
#else
|
||||
for(int i = 0; i < inputWidth; ++i){
|
||||
FLOAT4 in = vload4(i, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GET_AVG
|
||||
out.x = out.x / (height * channel);
|
||||
out = out / inputWidth;
|
||||
#endif
|
||||
vstore4(out, 0, output + outputOffset);
|
||||
}
|
||||
|
||||
|
||||
__kernel void reduct_height_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
#if LOCAL_SIZE > 0
|
||||
const int width_local_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int width_idx = get_group_id(0);
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * oututWidth + width_idx)*4;
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT4 local sum[LOCAL_SIZE];
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = lid; i < inputHeight; i+=LOCAL_SIZE){
|
||||
FLOAT4 in = vload4(i * inputWidth, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
sum[lid] = out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
out = sum[0];
|
||||
#else
|
||||
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||
|
||||
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * oututWidth + width_idx)*4;
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = 0; i < inputHeight; ++i){
|
||||
FLOAT4 in = vload4(i * inputWidth, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GET_AVG
|
||||
out = out / inputHeight;
|
||||
#endif
|
||||
vstore4(out, 0, output + outputOffset);
|
||||
}
|
||||
|
||||
__kernel void reduct_channel_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
#if LOCAL_SIZE > 0
|
||||
const int width_local_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_idx);
|
||||
const int width_idx = get_group_id(0);
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * oututWidth + width_idx)*4;
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT local sum[LOCAL_SIZE];
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
FLOAT4 in;
|
||||
FLOAT *inPtr = (FLOAT*)∈
|
||||
for(int i = lid; i < inputChannelBlock - 1; i += LOCAL_SIZE){
|
||||
in = vload4(i * inputWidth * inputHeight, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
out.x = OPERATE(out.x, out.y);
|
||||
out.x = OPERATE(out.x, out.z);
|
||||
out.x = OPERATE(out.x, out.w);
|
||||
sum[lid] = out.x;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
out.x = sum[0];
|
||||
in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < remain; ++j){
|
||||
out.x = OPERATE(out.x, inPtr[j]);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out.x = out.x / inputChannel;
|
||||
#endif
|
||||
output[outputOffset] = out.x;
|
||||
|
||||
#else
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * oututWidth + width_idx)*4;
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
|
||||
FLOAT out = (FLOAT)VALUE;
|
||||
FLOAT4 in;
|
||||
FLOAT *inPtr = (FLOAT*)∈
|
||||
for(int i = 0; i < inputChannelBlock - 1; ++i){
|
||||
in = vload4(i * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < 4; ++j){
|
||||
out = OPERATE(out, inPtr[j]);
|
||||
}
|
||||
}
|
||||
in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < remain; ++j){
|
||||
out = OPERATE(out, inPtr[j]);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out = out / inputChannel;
|
||||
#endif
|
||||
output[outputOffset] = out;
|
||||
#endif
|
||||
}
|
||||
|
||||
__kernel void reduct_channel_dim1_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
#if LOCAL_SIZE > 0
|
||||
const int width_local_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_idx);
|
||||
const int width_idx = get_group_id(0);
|
||||
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((batch_idx * outputHeight + height_idx) * oututWidth + width_idx);
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT local sum[LOCAL_SIZE];
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
FLOAT4 in;
|
||||
FLOAT *inPtr = (FLOAT*)∈
|
||||
for(int i = lid; i < inputChannelBlock - 1; i += LOCAL_SIZE){
|
||||
in = vload4(i * inputWidth * inputHeight, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
out.x = OPERATE(out.x, out.y);
|
||||
out.x = OPERATE(out.x, out.z);
|
||||
out.x = OPERATE(out.x, out.w);
|
||||
sum[lid] = out.x;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
out.x = sum[0];
|
||||
in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < remain; ++j){
|
||||
out.x = OPERATE(out.x, inPtr[j]);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out.x = out.x / inputChannel;
|
||||
#endif
|
||||
output[outputOffset] = out.x;
|
||||
|
||||
#else
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int batch_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
|
||||
const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((batch_idx * outputHeight + height_idx) * oututWidth + width_idx);
|
||||
int remain = inputChannel - (inputChannelBlock - 1) * 4;
|
||||
FLOAT out = (FLOAT)VALUE;
|
||||
FLOAT4 in;
|
||||
FLOAT *inPtr = (FLOAT*)∈
|
||||
for(int i = 0; i < inputChannelBlock - 1; ++i){
|
||||
in = vload4(i * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < 4; ++j){
|
||||
out = OPERATE(out, inPtr[j]);
|
||||
}
|
||||
}
|
||||
in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
|
||||
for(int j = 0; j < remain; ++j){
|
||||
out = OPERATE(out, inPtr[j]);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out = out / inputChannel;
|
||||
#endif
|
||||
output[outputOffset] = out;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
__kernel void reduct_batch_buf(GLOBAL_SIZE_3_DIMS
|
||||
__global const FLOAT* input,
|
||||
__global FLOAT* output,
|
||||
__private const int inputWidth,
|
||||
__private const int inputHeight,
|
||||
__private const int inputChannel,
|
||||
__private const int inputBatch,
|
||||
__private const int inputChannelBlock,
|
||||
__private const int oututWidth,
|
||||
__private const int outputHeight,
|
||||
__private const int outputChannel,
|
||||
__private const int outputChannelBlock
|
||||
) {
|
||||
#if LOCAL_SIZE > 0
|
||||
const int width_local_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, channel_idx);
|
||||
const int width_idx = get_group_id(0);
|
||||
|
||||
const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + width_idx)*4;
|
||||
int batchOffset = inputChannelBlock * inputHeight * inputWidth;
|
||||
const int lid = get_local_id(0);
|
||||
FLOAT4 local sum[LOCAL_SIZE];
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = lid; i < inputBatch; i+=LOCAL_SIZE){
|
||||
FLOAT4 in = vload4(i * batchOffset, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
sum[lid] = out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||
if (lid < i)
|
||||
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
out = sum[0];
|
||||
#ifdef GET_AVG
|
||||
out = out / inputBatch;
|
||||
#endif
|
||||
vstore4(out, 0, output + outputOffset);
|
||||
#else
|
||||
const int width_idx = get_global_id(0);
|
||||
const int height_idx = get_global_id(1);
|
||||
const int channel_idx = get_global_id(2);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, channel_idx);
|
||||
|
||||
const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4;
|
||||
const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + width_idx)*4;
|
||||
int batchOffset = inputChannelBlock * inputHeight * inputWidth;
|
||||
FLOAT4 out = (FLOAT4)VALUE;
|
||||
for(int i = 0; i < inputBatch; ++i){
|
||||
FLOAT4 in = vload4(i * batchOffset, input + offset);
|
||||
out = OPERATE(out, in);
|
||||
}
|
||||
#ifdef GET_AVG
|
||||
out = out / inputBatch;
|
||||
#endif
|
||||
vstore4(out, 0, output + outputOffset);
|
||||
#endif
|
||||
const int out_offset = batch_idx * width + width_idx;
|
||||
vstore4((FLOAT4)(out.x, 0.0, 0.0, 0.0), out_offset, output);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,36 @@
|
|||
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define GLOBAL_SIZE_2_DIMS \
__private const int global_size_dim0, __private const int global_size_dim1,

#define DEAL_NON_UNIFORM_DIM2(input1, input2) \
if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { \
return; \
}

__kernel void select_buf(GLOBAL_SIZE_2_DIMS
__global const FLOAT* select,
__global const FLOAT* input0,
__global const FLOAT* input1,
__global FLOAT* output
) {
const int idx = get_global_id(0);
const int idy = get_global_id(1);

DEAL_NON_UNIFORM_DIM2(idx, idy);
if ((int)select[idx]) {
#ifdef INSIZE1_EUQAL_1
output[idx] = input0[0];
#else
output[idx] = input0[idx];
#endif
} else {
#ifdef INSIZE2_EUQAL_1
output[idx] = input1[0];
#else
output[idx] = input1[idx];
#endif
}
}
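select_buf above is an elementwise ternary select; the INSIZE1_EUQAL_1 / INSIZE2_EUQAL_1 build options broadcast a single-element branch input to every output position. A host-side C++ sketch of the same behavior (illustrative only, not the OpenCL dispatch):

#include <vector>

// Reference for select_buf: out[i] = cond[i] ? input0[i or 0] : input1[i or 0] (sketch).
std::vector<float> selectRef(const std::vector<float>& cond,
                             const std::vector<float>& input0,
                             const std::vector<float>& input1) {
    std::vector<float> out(cond.size());
    const bool broadcast0 = input0.size() == 1;   // mirrors INSIZE1_EUQAL_1
    const bool broadcast1 = input1.size() == 1;   // mirrors INSIZE2_EUQAL_1
    for (size_t i = 0; i < cond.size(); ++i) {
        if ((int)cond[i]) {
            out[i] = broadcast0 ? input0[0] : input0[i];
        } else {
            out[i] = broadcast1 ? input1[0] : input1[i];
        }
    }
    return out;
}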
|
|
@ -15,90 +15,76 @@ __constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP |
|
|||
|
||||
|
||||
__kernel void softmax_channel(GLOBAL_SIZE_3_DIMS __read_only image2d_t input, __write_only image2d_t output, __private const int output_channels,
|
||||
__private const int remain_channels) {
|
||||
__private const int remain_channels, __private const int4 shape // NCHW
|
||||
) {
|
||||
|
||||
const int channel_block_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
const int batch_height_idx = get_global_id(2);
|
||||
const int width_idx = get_global_id(0);
|
||||
const int batch_height_idx = get_global_id(1);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(channel_block_idx, width_idx, batch_height_idx);
|
||||
|
||||
const int width = global_size_dim1;
|
||||
if (width_idx < shape.w && batch_height_idx < shape.x*shape.z) {
|
||||
|
||||
FLOAT float_max_value = -FLT_MAX;
|
||||
FLOAT4 float_max_value = (FLOAT4)-FLT_MAX;
|
||||
FLOAT4 input_data;
|
||||
for (short i = 0; i < global_size_dim0 - 1; ++i) {
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * global_size_dim1, batch_height_idx));
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.z);
|
||||
float_max_value = max(float_max_value, input_data.w);
|
||||
for (short i = 0; i < shape.y - 1; ++i) {
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * shape.w, batch_height_idx));
|
||||
float_max_value = max(float_max_value, input_data);
|
||||
}
|
||||
float_max_value.x = max(float_max_value.x, float_max_value.y);
|
||||
float_max_value.x = max(float_max_value.x, float_max_value.z);
|
||||
float_max_value.x = max(float_max_value.x, float_max_value.w);
|
||||
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + (global_size_dim0 - 1) * global_size_dim1 , batch_height_idx));
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + (shape.y - 1) * shape.w , batch_height_idx));
|
||||
if (remain_channels == 0) {
|
||||
float_max_value = max(float_max_value, input_data.w);
|
||||
float_max_value = max(float_max_value, input_data.z);
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.y);
|
||||
float_max_value.x = max(float_max_value.x, input_data.z);
|
||||
float_max_value.x = max(float_max_value.x, input_data.w);
|
||||
} else if (remain_channels == 1) {
|
||||
float_max_value = max(float_max_value, input_data.z);
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.z);
|
||||
float_max_value.x = max(float_max_value.x, input_data.y);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
} else if (remain_channels == 2) {
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.y);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
} else if (remain_channels == 3) {
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
}
|
||||
|
||||
FLOAT accum_result = 0;
|
||||
for (short i = 0; i < global_size_dim0 - 1; ++i) {
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * global_size_dim1, batch_height_idx));
|
||||
input_data = EXP(input_data - float_max_value);
|
||||
accum_result += input_data.x;
|
||||
accum_result += input_data.y;
|
||||
accum_result += input_data.z;
|
||||
accum_result += input_data.w;
|
||||
}
|
||||
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + (global_size_dim0 - 1) * global_size_dim1, batch_height_idx));
|
||||
input_data -= float_max_value;
|
||||
FLOAT4 accum_result = 0;
|
||||
for (short i = 0; i < shape.y - 1; ++i) {
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * shape.w, batch_height_idx));
|
||||
input_data = EXP(input_data - float_max_value.x);
|
||||
accum_result += input_data;
|
||||
}
|
||||
accum_result.x = accum_result.x + accum_result.y + accum_result.z + accum_result.w;
|
||||
|
||||
input_data = RI_F(input, SAMPLER, (int2)(width_idx + (shape.y - 1) * shape.w, batch_height_idx));
|
||||
input_data -= float_max_value.x;
|
||||
if (remain_channels == 0) {
|
||||
accum_result += EXP(input_data.w);
|
||||
accum_result += EXP(input_data.z);
|
||||
accum_result += EXP(input_data.y);
|
||||
accum_result += EXP(input_data.x);
|
||||
accum_result.x += EXP(input_data.w);
|
||||
accum_result.x += EXP(input_data.z);
|
||||
accum_result.x += EXP(input_data.y);
|
||||
accum_result.x += EXP(input_data.x);
|
||||
} else if (remain_channels == 1) {
|
||||
accum_result += EXP(input_data.z);
|
||||
accum_result += EXP(input_data.y);
|
||||
accum_result += EXP(input_data.x);
|
||||
accum_result.x += EXP(input_data.z);
|
||||
accum_result.x += EXP(input_data.y);
|
||||
accum_result.x += EXP(input_data.x);
|
||||
} else if (remain_channels == 2) {
|
||||
accum_result += EXP(input_data.y);
|
||||
accum_result += EXP(input_data.x);
|
||||
accum_result.x += EXP(input_data.y);
|
||||
accum_result.x += EXP(input_data.x);
|
||||
} else if (remain_channels == 3) {
|
||||
accum_result += EXP(input_data.x);
|
||||
}
|
||||
|
||||
int cur_out_width_pos = mad24(channel_block_idx, global_size_dim1, width_idx);
|
||||
input_data = RI_F(input, SAMPLER, (int2)(cur_out_width_pos, batch_height_idx)) - float_max_value;
|
||||
const int output_remain = output_channels - mul24(channel_block_idx, 4);
|
||||
|
||||
if (output_remain == 1) {
|
||||
input_data.x = EXP(input_data.x) / accum_result;
|
||||
} else if (output_remain == 2) {
|
||||
input_data.y = EXP(input_data.y) / accum_result;
|
||||
input_data.x = EXP(input_data.x) / accum_result;
|
||||
} else if (output_remain == 3) {
|
||||
input_data.z = EXP(input_data.z) / accum_result;
|
||||
input_data.y = EXP(input_data.y) / accum_result;
|
||||
input_data.x = EXP(input_data.x) / accum_result;
|
||||
} else{
|
||||
input_data = EXP(input_data) / accum_result;
|
||||
accum_result.x += EXP(input_data.x);
|
||||
}
|
||||
|
||||
for(int i = 0; i < shape.y; ++i){
|
||||
int cur_out_width_pos = mad24(i, shape.w, width_idx);
|
||||
input_data = RI_F(input, SAMPLER, (int2)(cur_out_width_pos, batch_height_idx)) - float_max_value.x;
|
||||
input_data = EXP(input_data) / accum_result.x;
|
||||
WI_F(output, (int2)(cur_out_width_pos, batch_height_idx), input_data);
|
||||
|
||||
}
|
||||
}
|
||||
}
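The reworked softmax_channel above is the standard numerically stable softmax over the channel axis: take the per-position maximum across channel blocks (folding the FLOAT4 lanes and the padded remainder), accumulate exp(x - max), then normalize every block by the sum. A scalar C++ reference of that three-pass scheme (sketch only):

#include <vector>
#include <cmath>
#include <algorithm>
#include <limits>

// Numerically stable softmax over one channel vector (reference for softmax_channel).
std::vector<float> softmaxRef(const std::vector<float>& x) {
    float maxValue = -std::numeric_limits<float>::max();
    for (float v : x) maxValue = std::max(maxValue, v);        // pass 1: max
    float sum = 0.0f;
    for (float v : x) sum += std::exp(v - maxValue);           // pass 2: exp-sum
    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        y[i] = std::exp(x[i] - maxValue) / sum;                // pass 3: normalize
    return y;
}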
|
||||
|
||||
__kernel void softmax_height(__read_only image2d_t input, __write_only image2d_t output,
|
||||
|
|
|
@ -19,87 +19,74 @@ __kernel void softmax_channel(GLOBAL_SIZE_3_DIMS
|
|||
__private const int remain_channels,
|
||||
__private const int4 shape) {//NCHW
|
||||
|
||||
const int channel_block_idx = get_global_id(0);
|
||||
const int width_idx = get_global_id(1);
|
||||
const int batch_height_idx = get_global_id(2);
|
||||
const int width_idx = get_global_id(0);
|
||||
const int batch_height_idx = get_global_id(1);
|
||||
|
||||
DEAL_NON_UNIFORM_DIM3(channel_block_idx, width_idx, batch_height_idx);
|
||||
if (width_idx < shape.w && batch_height_idx < shape.x*shape.z) {
|
||||
const int batch_idx = batch_height_idx / shape.z;
|
||||
const int height_idx = batch_height_idx % shape.z;
|
||||
const int offset = (((batch_idx*shape.y+0)*shape.z+height_idx)*shape.w+width_idx)*4;
|
||||
|
||||
FLOAT float_max_value = -FLT_MAX;
|
||||
FLOAT4 float_max_value = (FLOAT4)-FLT_MAX;
|
||||
FLOAT4 input_data;
|
||||
for (short i = 0; i < global_size_dim0 - 1; ++i) {
|
||||
for (short i = 0; i < shape.y - 1; ++i) {
|
||||
input_data = vload4(i*shape.z*shape.w, input+offset);
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.z);
|
||||
float_max_value = max(float_max_value, input_data.w);
|
||||
float_max_value = max(float_max_value, input_data);
|
||||
}
|
||||
|
||||
input_data = vload4((global_size_dim0 - 1)*shape.z*shape.w, input+offset);
|
||||
float_max_value.x = max(float_max_value.x, float_max_value.y);
|
||||
float_max_value.x = max(float_max_value.x, float_max_value.z);
|
||||
float_max_value.x = max(float_max_value.x, float_max_value.w);
|
||||
|
||||
input_data = vload4((shape.y - 1)*shape.z*shape.w, input+offset);
|
||||
if (remain_channels == 0) {
|
||||
float_max_value = max(float_max_value, input_data.w);
|
||||
float_max_value = max(float_max_value, input_data.z);
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.y);
|
||||
float_max_value.x = max(float_max_value.x, input_data.z);
|
||||
float_max_value.x = max(float_max_value.x, input_data.w);
|
||||
} else if (remain_channels == 1) {
|
||||
float_max_value = max(float_max_value, input_data.z);
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.z);
|
||||
float_max_value.x = max(float_max_value.x, input_data.y);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
} else if (remain_channels == 2) {
|
||||
float_max_value = max(float_max_value, input_data.y);
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.y);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
} else if (remain_channels == 3) {
|
||||
float_max_value = max(float_max_value, input_data.x);
|
||||
float_max_value.x = max(float_max_value.x, input_data.x);
|
||||
}
|
||||
|
||||
FLOAT accum_result = 0;
|
||||
for (short i = 0; i < global_size_dim0 - 1; ++i) {
|
||||
FLOAT4 accum_result = 0;
|
||||
for (short i = 0; i < shape.y - 1; ++i) {
|
||||
input_data = vload4(i*shape.z*shape.w, input+offset);;
|
||||
input_data = EXP(input_data - float_max_value);
|
||||
accum_result += input_data.x;
|
||||
accum_result += input_data.y;
|
||||
accum_result += input_data.z;
|
||||
accum_result += input_data.w;
|
||||
input_data = EXP(input_data - float_max_value.x);
|
||||
accum_result += input_data;
|
||||
}
|
||||
accum_result.x = accum_result.x + accum_result.y + accum_result.z + accum_result.w;
|
||||
|
||||
input_data = vload4((global_size_dim0 - 1)*shape.z*shape.w, input+offset);
|
||||
input_data -= float_max_value;
|
||||
input_data = vload4((shape.y - 1)*shape.z*shape.w, input+offset);
|
||||
input_data -= float_max_value.x;
|
||||
if (remain_channels == 0) {
|
||||
accum_result += EXP(input_data.w);
|
||||
accum_result += EXP(input_data.z);
|
||||
accum_result += EXP(input_data.y);
|
||||
accum_result += EXP(input_data.x);
|
||||
accum_result.x += EXP(input_data.w);
|
||||
accum_result.x += EXP(input_data.z);
|
||||
accum_result.x += EXP(input_data.y);
|
||||
accum_result.x += EXP(input_data.x);
|
||||
} else if (remain_channels == 1) {
|
||||
accum_result += EXP(input_data.z);
|
||||
accum_result += EXP(input_data.y);
|
||||
accum_result += EXP(input_data.x);
|
||||
accum_result.x += EXP(input_data.z);
|
||||
accum_result.x += EXP(input_data.y);
|
||||
accum_result.x += EXP(input_data.x);
|
||||
} else if (remain_channels == 2) {
|
||||
accum_result += EXP(input_data.y);
|
||||
accum_result += EXP(input_data.x);
|
||||
accum_result.x += EXP(input_data.y);
|
||||
accum_result.x += EXP(input_data.x);
|
||||
} else if (remain_channels == 3) {
|
||||
accum_result += EXP(input_data.x);
|
||||
accum_result.x += EXP(input_data.x);
|
||||
}
|
||||
|
||||
input_data = vload4(channel_block_idx*shape.z*shape.w, input+offset) - float_max_value;
|
||||
const int output_remain = output_channels - mul24(channel_block_idx, 4);
|
||||
|
||||
if (output_remain == 1) {
|
||||
input_data.x = EXP(input_data.x) / accum_result;
|
||||
} else if (output_remain == 2) {
|
||||
input_data.y = EXP(input_data.y) / accum_result;
|
||||
input_data.x = EXP(input_data.x) / accum_result;
|
||||
} else if (output_remain == 3) {
|
||||
input_data.z = EXP(input_data.z) / accum_result;
|
||||
input_data.y = EXP(input_data.y) / accum_result;
|
||||
input_data.x = EXP(input_data.x) / accum_result;
|
||||
} else{
|
||||
input_data = EXP(input_data) / accum_result;
|
||||
for(int i = 0; i < shape.y; ++i){
|
||||
input_data = vload4(i*shape.z*shape.w, input+offset) - float_max_value.x;
|
||||
input_data = EXP(input_data) / accum_result.x;
|
||||
vstore4(input_data, i*shape.z*shape.w, output+offset);
|
||||
}
|
||||
}
|
||||
|
||||
vstore4(input_data, channel_block_idx*shape.z*shape.w, output+offset);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -18,12 +18,7 @@ ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : Co
|
|||
MNN_PRINT("start ReductionExecution init !\n");
#endif
mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
auto reduct = op->main_as_ReductionParam();
if (nullptr != reduct->dim()) {
for (int i = 0; i < reduct->dim()->size(); ++i) {
mAxis.push_back(reduct->dim()->data()[i]);
}
}
mAxis = op->main_as_ReductionParam()->dim()->data()[0];
switch (op->main_as_ReductionParam()->operation()) {
case ReductionType_MEAN:
mReductType = 0;
|
||||
|
@ -49,110 +44,150 @@ ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : Co
|
|||
#endif
}

int ReductionExecution::getLocalSize(int size, int maxGroupSize){
int local_size = 1;
while(local_size * 2 <= maxGroupSize && local_size * 2 <= size){
local_size *= 2;
}
return local_size;
}
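getLocalSize returns the largest power of two that exceeds neither the reduced dimension nor the work-group limit, so the halving-tree loops in the kernels always start from a power-of-two LOCAL_SIZE. A small usage sketch mirroring the function (example values only):

#include <cassert>

// Largest power of two <= min(size, maxGroupSize); mirrors ReductionExecution::getLocalSize.
static int getLocalSizeRef(int size, int maxGroupSize) {
    int local_size = 1;
    while (local_size * 2 <= maxGroupSize && local_size * 2 <= size) {
        local_size *= 2;
    }
    return local_size;
}

int main() {
    assert(getLocalSizeRef(1000, 256) == 256);  // capped by the work-group limit
    assert(getLocalSizeRef(100, 256)  == 64);   // capped by the reduced dimension
    assert(getLocalSizeRef(1, 256)    == 1);    // nothing to parallelize
    return 0;
}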
|
||||
|
||||
ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
|
||||
MNN_ASSERT(mAxis.size() == 1);
|
||||
MNN_ASSERT(mAxis[0] == 1);
|
||||
|
||||
auto runtime = mOpenCLBackend->getOpenCLRuntime();
|
||||
startRecord(runtime, mRecording);
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||
//N=outside H=axis W=inside C=1
|
||||
MNN_ASSERT(inputShape[3] == 1);
|
||||
if(inputShape[1] >= 256) {
|
||||
if(mAxis < 0){
|
||||
mAxis = input->dimensions() + mAxis;
|
||||
}
|
||||
int inside = 1;
|
||||
int outside = 1;
|
||||
for(int i = 0; i < mAxis; ++i){
|
||||
outside *= input->length(i);
|
||||
}
|
||||
for(int i = mAxis + 1; i < input->dimensions(); ++i){
|
||||
inside *= input->length(i);
|
||||
}
|
||||
int dim = input->length(mAxis);
|
||||
int local_size = 0;
|
||||
auto MaxWorkItems = runtime->getMaxWorkItemSizes();
|
||||
|
||||
if(dim >= 16){
|
||||
mUseLocal = true;
|
||||
}
|
||||
if(!mUseLocal) {
|
||||
mGlobalWorkSize = {static_cast<uint32_t>(inputShape[0]), static_cast<uint32_t>(inputShape[2])};
|
||||
mLocalWorkSize = {1, 1, 1};
|
||||
|
||||
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||
std::vector<int> outputShape = tensorShapeFormat(output);
|
||||
|
||||
int batch = inputShape.at(0);
|
||||
int inputHeight = inputShape.at(1);
|
||||
int inputWidth = inputShape.at(2);
|
||||
int inputChannels = inputShape.at(3);
|
||||
int inputChannelBlocks = (inputChannels + 3) / 4;
|
||||
int outputBatch = outputShape.at(0);
|
||||
int outputHeight = outputShape.at(1);
|
||||
int outputWidth = outputShape.at(2);
|
||||
int outputChannels = outputShape.at(3);
|
||||
int outputChannelBlocks = (outputChannels + 3) / 4;
|
||||
|
||||
std::set<std::string> buildOption;
|
||||
switch (mReductType) {
|
||||
case 0:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_mean", {});
|
||||
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
|
||||
buildOption.emplace("-DGET_AVG");
|
||||
buildOption.emplace("-DVALUE=0");
|
||||
break;
|
||||
case 1:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_max", {});
|
||||
buildOption.emplace("-DOPERATE(a,b)=max(a,b)");
|
||||
buildOption.emplace("-DVALUE=-FLT_MAX");
|
||||
break;
|
||||
case 2:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_min", {});
|
||||
buildOption.emplace("-DOPERATE(a,b)=min(a,b)");
|
||||
buildOption.emplace("-DVALUE=FLT_MAX");
|
||||
break;
|
||||
case 3:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_mul", {});
|
||||
buildOption.emplace("-DOPERATE(a,b)=(a*b)");
|
||||
buildOption.emplace("-DVALUE=1");
|
||||
break;
|
||||
case 4:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_sum", {});
|
||||
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
|
||||
buildOption.emplace("-DVALUE=0");
|
||||
break;
|
||||
default:
|
||||
MNN_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
} else { //useLocal
|
||||
uint32_t global_x = 8;
|
||||
int size = inputShape[1];
|
||||
if (size >= 1024) {
|
||||
global_x = 256;
|
||||
} else if(size >= 512) {
|
||||
global_x = 128;
|
||||
} else if (size >= 256) {
|
||||
global_x = 64;
|
||||
} else if (size >= 128) {
|
||||
global_x = 32;
|
||||
} else if (size >= 64) {
|
||||
global_x = 16;
|
||||
} else if (size >= 32) {
|
||||
global_x = 8;
|
||||
}
|
||||
mGlobalWorkSize = {global_x, static_cast<uint32_t>(inputShape[0]), static_cast<uint32_t>(inputShape[2])};
|
||||
mLocalWorkSize = {global_x, 1, 1 };
|
||||
|
||||
switch (mReductType) {
|
||||
case 0:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_mean_local", {});
|
||||
break;
|
||||
case 1:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_max_local", {});
|
||||
break;
|
||||
case 2:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_min_local", {});
|
||||
break;
|
||||
case 3:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_mul_local", {});
|
||||
break;
|
||||
case 4:
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_sum_local", {});
|
||||
break;
|
||||
default:
|
||||
MNN_ASSERT(false);
|
||||
break;
|
||||
mGlobalWorkSize = {
|
||||
static_cast<uint32_t>(outputWidth),
|
||||
static_cast<uint32_t>(outputHeight),
|
||||
static_cast<uint32_t>(outputBatch * outputChannelBlocks)
|
||||
};
|
||||
|
||||
if(mUseLocal){
|
||||
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
|
||||
local_size = getLocalSize(inputWidth, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_width", buildOption);
|
||||
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
|
||||
local_size = getLocalSize(inputHeight, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_height", buildOption);
|
||||
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
|
||||
local_size = getLocalSize(inputChannelBlocks - 1, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_channel", buildOption);
|
||||
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
|
||||
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
|
||||
local_size = getLocalSize(batch, MaxWorkItems[0]);
|
||||
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_batch", buildOption);
|
||||
}
|
||||
mGlobalWorkSize[0] *= local_size;
|
||||
}else{
|
||||
buildOption.emplace("-DLOCAL_SIZE=0");
|
||||
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_width", buildOption);
|
||||
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_height", buildOption);
|
||||
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_channel", buildOption);
|
||||
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
|
||||
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
|
||||
mReduct1DKernel = runtime->buildKernel("reduction", "reduct_batch", buildOption);
|
||||
}
|
||||
}
|
||||
//printf("reduce axis:%d , %d %d %d %d, useLocal:%d\n", mAxis[0], inputShape[0], inputShape[1], inputShape[2], inputShape[3], mUseLocal);
|
||||
|
||||
mUnits.resize(1);
|
||||
uint32_t idx = 0;
|
||||
cl_int ret = CL_SUCCESS;
|
||||
if(mUseLocal) {
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||
} else {
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||
}
|
||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||
ret |= mReduct1DKernel.setArg(idx++, openCLImage(input));
|
||||
ret |= mReduct1DKernel.setArg(idx++, openCLImage(output));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[0]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[1]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[2]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[3]));
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputWidth);
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputHeight);
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputChannels);
|
||||
ret |= mReduct1DKernel.setArg(idx++, batch);
|
||||
ret |= mReduct1DKernel.setArg(idx++, inputChannelBlocks);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputWidth);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputHeight);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputChannels);
|
||||
ret |= mReduct1DKernel.setArg(idx++, outputChannelBlocks);
|
||||
MNN_CHECK_CL_SUCCESS(ret, "setArg ReductionExecution");
if(mUseLocal){
recordKernel3d(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
mLocalWorkSize = {static_cast<uint32_t>(local_size), 1, 1};
}else{
recordKernel2d(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
auto MaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mReduct1DKernel));
std::string kernelName = "reduct";
mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, MaxWorkGroupSize, runtime, kernelName, mReduct1DKernel).first;
}
recordKernel3d(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
endRecord(runtime, mRecording);
return NO_ERROR;
}
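`getLocalSize` (declared in the header further down) caps the work-group size used by the `-DLOCAL_SIZE` builds above. One plausible implementation, assuming the kernel performs a power-of-two tree reduction — a sketch, not necessarily MNN's exact code:

// Largest power of two that is <= size and <= maxGroupSize, so the in-kernel
// tree reduction can halve the number of active work-items at every step.
static int getLocalSizeSketch(int size, int maxGroupSize) {
    int localSize = 1;
    while (localSize * 2 <= size && localSize * 2 <= maxGroupSize) {
        localSize *= 2;
    }
    return localSize;
}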
@ -164,13 +199,7 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
#ifdef ENABLE_OPENCL_TIME_PROFILER
cl::Event event;
if(mUseLocal) {
run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
mOpenCLBackend->getOpenCLRuntime(), &event);
} else {
runKernel2D(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
mOpenCLBackend->getOpenCLRuntime(), &event);
}
run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event);
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
MNN_PRINT("kernel cost:%d us Reduct1D\n",costTime);
#else
@ -182,13 +211,7 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
#endif
return NO_ERROR;
}
if(mUseLocal) {
run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
mOpenCLBackend->getOpenCLRuntime());
} else {
runKernel2D(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
mOpenCLBackend->getOpenCLRuntime());
}
run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
#endif
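Under `ENABLE_OPENCL_TIME_PROFILER` the launch above passes a `cl::Event` and reads the elapsed time back through `getCostTime`. The underlying mechanism is standard OpenCL event profiling; a stripped-down equivalent using the OpenCL C++ bindings directly — `queue`, `kernel`, and the ranges are placeholders, and the command queue must be created with `CL_QUEUE_PROFILING_ENABLE`:

cl::Event event;
queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalRange, localRange,
                           nullptr, &event);        // placeholder queue/kernel/ranges
event.wait();                                        // make sure profiling info is available
cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
cl_ulong end   = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
double costUs  = (end - start) / 1000.0;             // timestamps are in nanoseconds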
#ifdef LOG_VERBOSE
@ -202,7 +225,6 @@ public:
virtual ~ReductionCreator() = default;
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const MNN::Op *op, Backend *backend) const override {
if (inputs[0]->getDimensionType() == Tensor::TENSORFLOW) {
auto openCLBackend = static_cast<OpenCLBackend *>(backend);
auto reduct = op->main_as_ReductionParam();
if (nullptr == reduct->dim()) {
@ -211,6 +233,12 @@ public:
if(reduct->dim()->size() != 1) {
return NULL;
}
auto axis = reduct->dim()->data()[0];
int dim = inputs[0]->length(axis);
std::vector<int> inputShape = tensorShapeFormat(inputs[0]);
if(dim == inputShape.at(3) && outputs[0]->buffer().dimensions == 1){
return NULL;
}
switch (op->main_as_ReductionParam()->operation()) {
case ReductionType_MEAN:
break;
@ -227,7 +255,6 @@ public:
break;
}
return new ReductionExecution(op, backend);
}
return NULL;
}
};

@ -28,11 +28,12 @@ public:
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
int getLocalSize(int size, int maxGroupSize);
cl::Kernel mReduct1DKernel;
OpenCLBackend *mOpenCLBackend;
MNN::DataType mdataType;
int mReductType;
std::vector<int> mAxis;
int mAxis;
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
std::vector<uint32_t> mLocalWorkSize{1, 1, 1};
bool mUseLocal = false;