mirror of https://github.com/alibaba/MNN.git
Merge pull request #2580 from alibaba/feature/sync
[MNN:Sync] Sync Internal 2.7.0
Commit: 9e3cc72952
@@ -715,9 +715,7 @@ elseif(CMAKE_SYSTEM_NAME MATCHES "^Android")
 else()
 endif()
 if (NOT MNN_BUILD_SHARED_LIBS)
-    if(APPLE)
-        set(MNN_DEPS -Wl,-all_load ${MNN_DEPS} -Wl,-noall_load)
-    elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
         # Static-link will not replace thread-related weak symbol in glibc with strong symbol
         # in pthread library, so we need use --whole-archive to pthread
         # https://stackoverflow.com/questions/35116327/when-g-static-link-pthread-cause-segmentation-fault-why
@@ -473,15 +473,22 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
             break;
         case UnaryOpOperation_LOG1P:
             if(mVectorize) {
-                ss << inpName << ".x=(log(1.0+" << operand << ".x));\n";
-                ss << inpName << ".y=(log(1.0+" << operand << ".y))";
-                if(mPrecision != BackendConfig::Precision_Low) {
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << ".x=(half)(log(1.0+(float)" << operand << ".x));\n";
+                    ss << inpName << ".y=(half)(log(1.0+(float)" << operand << ".y))";
+                } else {
+                    ss << inpName << ".x=(log(1.0+" << operand << ".x));\n";
+                    ss << inpName << ".y=(log(1.0+" << operand << ".y))";
                     ss << ";\n";
                     ss << inpName << ".z=(log(1.0+" << operand << ".z));\n";
                     ss << inpName << ".w=(log(1.0+" << operand << ".w))";
                 }
             } else {
-                ss << inpName << "=(log(1.0+" << operand << "))";
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << "=(log((half)1.0+" << operand << "))";
+                } else {
+                    ss << inpName << "=(log(1.0+" << operand << "))";
+                }
             }
             break;
        case UnaryOpOperation_FLOOR:
@@ -512,15 +519,22 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
             break;
         case UnaryOpOperation_SIGMOID:
             if(mVectorize) {
-                ss << inpName << ".x=(1.0/(1.0+exp(-" << operand << ".x)));\n";
-                ss << inpName << ".y=(1.0/(1.0+exp(-" << operand << ".y)))";
-                if(mPrecision != BackendConfig::Precision_Low) {
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << ".x=(half)(1.0/(1.0+(float)exp(-" << operand << ".x)));\n";
+                    ss << inpName << ".y=(half)(1.0/(1.0+(float)exp(-" << operand << ".y)))";
+                } else {
+                    ss << inpName << ".x=(1.0/(1.0+exp(-" << operand << ".x)));\n";
+                    ss << inpName << ".y=(1.0/(1.0+exp(-" << operand << ".y)))";
                     ss << ";\n";
                     ss << inpName << ".z=(1.0/(1.0+exp(-" << operand << ".z)));\n";
                     ss << inpName << ".w=(1.0/(1.0+exp(-" << operand << ".w)))";
                 }
             } else {
-                ss << inpName << "=(1.0/(1.0+exp(-" << operand << ")))";
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << "=(half)(1.0/(1.0+(float)exp(-" << operand << ")))";
+                } else {
+                    ss << inpName << "=(1.0/(1.0+exp(-" << operand << ")))";
+                }
             }
             break;
        case UnaryOpOperation_TANH:
@@ -538,15 +552,22 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
             break;
         case UnaryOpOperation_RECIPROCAL:
             if(mVectorize) {
-                ss << inpName << ".x=(1.0/" << operand << ".x);\n";
-                ss << inpName << ".y=(1.0/" << operand << ".y)";
-                if(mPrecision != BackendConfig::Precision_Low) {
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << ".x=(half)(1.0/(float)" << operand << ".x);\n";
+                    ss << inpName << ".y=(half)(1.0/(float)" << operand << ".y)";
+                } else {
+                    ss << inpName << ".x=(1.0/" << operand << ".x);\n";
+                    ss << inpName << ".y=(1.0/" << operand << ".y)";
                     ss << ";\n";
                     ss << inpName << ".z=(1.0/" << operand << ".z);\n";
                     ss << inpName << ".w=(1.0/" << operand << ".w)";
                 }
             } else {
-                ss << inpName << "=(1.0/" << operand << ")";
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << "=(half)(1.0/(float)" << operand << ")";
+                } else {
+                    ss << inpName << "=(1.0/" << operand << ")";
+                }
             }
             break;
        case UnaryOpOperation_LOG:
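The LOG1P, SIGMOID and RECIPROCAL branches above all follow the same pattern: under Precision_Low the generated expression upcasts the half operand to float, evaluates the math in float, and casts the result back to half. A minimal sketch of that string-building pattern, using hypothetical names (makeLog1pExpr is not an MNN helper):

```cpp
#include <iostream>
#include <sstream>
#include <string>

// Minimal sketch, not MNN code: build the LOG1P expression text for one lane.
// The low-precision form evaluates in float and narrows the result to half.
static std::string makeLog1pExpr(const std::string& dst, const std::string& src, bool lowPrecision) {
    std::ostringstream ss;
    if (lowPrecision) {
        ss << dst << "=(half)(log(1.0+(float)" << src << "))";
    } else {
        ss << dst << "=(log(1.0+" << src << "))";
    }
    return ss.str();
}

int main() {
    // Prints: out.x=(half)(log(1.0+(float)in0.x))
    std::cout << makeLog1pExpr("out.x", "in0.x", true) << "\n";
    return 0;
}
```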
@@ -564,17 +585,44 @@ std::string CUDATarget::codegen(std::vector<std::string>& inputs, const Command*
             break;
         case UnaryOpOperation_GELU:
             if(mVectorize) {
-                ss << inpName << ".x=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".x*" << operand << ".x*" << operand << ".x+" << operand + ".x))) * " << operand << ".x* 0.5f);\n";
-                ss << inpName << ".y=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".y*" << operand << ".y*" << operand << ".y+" << operand + ".y))) * " << operand << ".y* 0.5f)";
-                if(mPrecision != BackendConfig::Precision_Low) {
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << ".x=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << ".x*(float)" << operand << ".x*(float)" << operand << ".x+(float)" << operand + ".x))) * (float)" << operand << ".x* 0.5f);\n";
+                    ss << inpName << ".y=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << ".y*(float)" << operand << ".y*(float)" << operand << ".y+(float)" << operand + ".y))) * (float)" << operand << ".y* 0.5f)";
+                } else {
+                    ss << inpName << ".x=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".x*" << operand << ".x*" << operand << ".x+" << operand + ".x))) * " << operand << ".x* 0.5f);\n";
+                    ss << inpName << ".y=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".y*" << operand << ".y*" << operand << ".y+" << operand + ".y))) * " << operand << ".y* 0.5f)";
                     ss << ";\n";
-                    ss << inpName << ".z=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".z*" << operand << ".z*" << operand << ".z+" << operand + ".z))) * " << operand << ".z* 0.5f);\n";
-                    ss << inpName << ".w=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".w*" << operand << ".w*" << operand << ".w+" << operand + ".w))) * " << operand << ".w* 0.5f)";
+                    ss << inpName << ".z=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".z*" << operand << ".z*" << operand << ".z+" << operand + ".z))) * " << operand << ".z* 0.5f);\n";
+                    ss << inpName << ".w=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << ".w*" << operand << ".w*" << operand << ".w+" << operand + ".w))) * " << operand << ".w* 0.5f)";
                 }
             } else {
-                ss << inpName << "=(1.0f + tanh(0.79788458f * (0.044715f * " << operand << "*" << operand << "*" << operand << "+" << operand + "))) * " << operand << "* 0.5f)";
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << "=(half)((1.0f + tanh(0.79788458f * (0.044715f * (float)" << operand << "*(float)" << operand << "*(float)" << operand << "+(float)" << operand + "))) * (float)" << operand << "* 0.5f)";
+                } else {
+                    ss << inpName << "=((1.0f + tanh(0.79788458f * (0.044715f * " << operand << "*" << operand << "*" << operand << "+" << operand + "))) * " << operand << "* 0.5f)";
+                }
             }
             break;
+        case UnaryOpOperation_GELU_STANDARD:
+            if(mVectorize) {
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << ".x=(half)((erf((float)" << operand << ".x*0.7071067932881648f)+1.f)*(float)" << operand << ".x*0.5f);\n";
+                    ss << inpName << ".y=(half)((erf((float)" << operand << ".y*0.7071067932881648f)+1.f)*(float)" << operand << ".y*0.5f)";
+                } else {
+                    ss << inpName << ".x=((erf(" << operand << ".x*0.7071067932881648f)+1.f)*" << operand << ".x*0.5f);\n";
+                    ss << inpName << ".y=((erf(" << operand << ".y*0.7071067932881648f)+1.f)*" << operand << ".y*0.5f)";
+                    ss << ";\n";
+                    ss << inpName << ".z=((erf(" << operand << ".z*0.7071067932881648f)+1.f)*" << operand << ".z*0.5f);\n";
+                    ss << inpName << ".w=((erf(" << operand << ".w*0.7071067932881648f)+1.f)*" << operand << ".w*0.5f)";
+                }
+            } else {
+                if(mPrecision == BackendConfig::Precision_Low) {
+                    ss << inpName << "=(half)((erf((float)" << operand << "*0.7071067932881648f)+1.f)*(float)" << operand << "*0.5f)";
+                } else {
+                    ss << inpName << "=((erf(" << operand << "*0.7071067932881648f)+1.f)*" << operand << "*0.5f)";
+                }
+            }
+            break;
         default:
             MNN_PRINT("Error: CUDA CodeGen not support Unary type:%d\n", type);
             break;
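For reference, the two GELU variants generated above are the standard approximations; the constant 0.79788458 is sqrt(2/pi) and 0.7071067932881648 is 1/sqrt(2):

```latex
\mathrm{GELU}(x) \approx 0.5\,x\left(1+\tanh\!\left(\sqrt{\tfrac{2}{\pi}}\,\bigl(x+0.044715\,x^{3}\bigr)\right)\right),\qquad
\mathrm{GELU_{standard}}(x) = 0.5\,x\left(1+\operatorname{erf}\!\left(\tfrac{x}{\sqrt{2}}\right)\right)
```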
@@ -104,12 +104,9 @@ int main(int argc, char* argv[]) {
     for (int i = 0; i < 3; ++i) {
         outputs = module->onForward(inputs);
     }
-    globalExecutor->resetProfile();
     outputs = module->onForward(inputs);
-    globalExecutor->dumpProfile();
     {
         MNN::Timer autoTime;
-        globalExecutor->resetProfile();
         for (int i = 0; i < benchTime; ++i) {
             MNN::AutoTime _t(0, "Once time");
             // std::cout << i << std::endl;
@@ -42,9 +42,7 @@ int main(int argc, const char* argv[]) {
     for (int i = 0; i < 2; ++i) {
         {
             AUTOTIME;
-            Executor::getGlobalExecutor()->resetProfile();
             outputs = model->onForward({first, second});
-            Executor::getGlobalExecutor()->dumpProfile();
         }
         std::ostringstream fileNameOs;
         std::ostringstream dimInfo;
@@ -10,7 +10,7 @@
 - warm_up_count: number of warm-up runs
 - forwardtype: optional, defaults to 0 (CPU); valid values are 0->CPU, 1->Metal, 3->OpenCL, 6->OpenGL, 7->Vulkan
 - numberThread: optional, defaults to 4; the CPU thread count, or the run mode for GPU backends
-- precision: optional, defaults to 2 (precision_low)
+- precision: optional, defaults to 2; valid values are 0 (Normal), 1 (High), 2 (Low_FP16), 3 (Low_BF16)
 - weightSparsity: optional, defaults to 0.0; sparse computation is enabled when weightSparsity > 0.5 and the backend supports it
 - weightSparseBlockNumber: optional, defaults to 1; only takes effect when weightSparsity > 0.5 and sets the sparse-computation block size. Larger values help sparse acceleration; typical choices are 1, 4, 8, 16
 - testQuantizedModel: optional, defaults to 0, which tests only the float model; when set to 1, the quantized model is tested after the float model
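These precision values correspond to BackendConfig::PrecisionMode on the C++ side. A minimal sketch, under that assumption, of requesting FP16 (Precision_Low) when creating a session; the model path is a placeholder:

```cpp
#include <MNN/Interpreter.hpp>
#include <memory>

int main() {
    // Placeholder model path.
    std::shared_ptr<MNN::Interpreter> net(MNN::Interpreter::createFromFile("model.mnn"));
    MNN::ScheduleConfig conf;
    conf.numThread = 4;                                      // the numberThread option above
    MNN::BackendConfig bnConfig;
    bnConfig.precision = MNN::BackendConfig::Precision_Low;  // value 2: FP16 where the backend supports it
    conf.backendConfig = &bnConfig;
    auto session = net->createSession(conf);
    return session != nullptr ? 0 : 1;
}
```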
@@ -68,7 +68,7 @@ Avg= 5.570600 ms, OpSum = 7.059200 ms min= 3.863000 ms, max= 11.596001 ms
 ### Parameters
 `./ModuleBasic.out model dir [runMask forwardType runLoops numberThread precision_memory cacheFile]`
 - `model:str` path to the model file
-- `dir:str` folder with input/output information, which can be generated with scripts such as fastTestOnnx.py / fastTestTf.py / fastTestTflite.py; see the correctness-check section of the model conversion docs.
+- `dir:str` folder with input/output information, which can be generated with scripts such as testMNNFromTf.py / testMNNFromOnnx.py / testMNNFromTflite.py; see the correctness-check section of the model conversion docs.
 - `runMask:int` defaults to 0; a bit mask of feature switches. To enable several features, add the corresponding mask values together (cases that cannot be combined are noted separately); see the runMask breakdown below
 - `forwardType:int` compute device for inference; valid values are 0 (CPU), 1 (Metal), 2 (CUDA), 3 (OpenCL), 6 (OpenGL), 7 (Vulkan), 9 (TensorRT); optional, defaults to `0`
 - `runLoops:int` number of loops for the performance test; optional, defaults to `0`, i.e. no performance test
@@ -456,49 +456,3 @@ Matrix:
 0.0000000 0.0000000 1.0000000
 ```
-
-## winogradGenerateCL.out
-### Description
-Generates the Winograd transform matrices and the corresponding OpenCL conversion code
-### Parameters
-`./winogradExample.out unit kernelSize`
-- `unit:int` tile size
-- `kernelSize:int` convolution kernel size
-### Example
-```bash
-$ ./winogradGenerateCL.out 2 2
-A
-1.0000000 0.0000000
-1.0000000 0.5000000
-0.0000000 1.0000000
-B
-1.0000000 0.0000000 -0.0000000
--2.0000000 2.0000000 -0.5000000
-0.0000000 0.0000000 1.0000000
-G
-1.0000000 0.0000000
-1.0000000 0.5000000
-0.0000000 1.0000000
-Generate winogradTransformSource2_2_0.5.cl
-Generate winogradTransformDest2_2_0.5.cl
-```
-
-## winogradGenerateGLSL.out
-### Description
-Generates the Winograd transform matrices and the corresponding OpenGL conversion code
-### Parameters
-`./winogradExample.out unit kernelSize`
-- `unit:int` tile size
-- `kernelSize:int` convolution kernel size
-### Example
-```bash
-$ ./winogradGenerateGLSL.out 1 2
-A
-1.0000000
-B
-1.0000000 -0.0000000
-0.0000000 1.0000000
-G
-1.0000000
-Generate winogradTransformSource1_2_0.5.comp
-Generate winogradTransformDest1_2_0.5.comp
-```
@@ -13,11 +13,7 @@ if(MNN_CUDA_PROFILE)
 endif()

 file(GLOB_RECURSE MNN_EXPR_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.*")
-option(MNN_EXPR_ENABLE_PROFILER "Support profile Expr's op cost" OFF)
 option(MNN_EXPR_SHAPE_EAGER "Force compute Expr's shape directly cost" OFF)
-IF (MNN_EXPR_ENABLE_PROFILER)
-    add_definitions(-DMNN_EXPR_ENABLE_PROFILER)
-ENDIF()
 IF (MNN_EXPR_SHAPE_EAGER)
     add_definitions(-DMNN_EXPR_SHAPE_EAGER)
 ENDIF()
@@ -21,55 +21,9 @@
 #ifdef MNN_EXPR_ENABLE_PROFILER
 #define MNN_EXPRESS_ERROR_REPORT
 #endif
-#define MNN_EXPRESS_OPEN_MEMORY_REUSE
 namespace MNN {
 namespace Express {
-#ifdef MNN_EXPR_ENABLE_PROFILER
-class Executor::Profiler {
-public:
-    void reset();
-    void dump() const;
-    void add(const std::string& opType, float timeInMs);
-    void addFlops(const std::string& opType, float flops);
-private:
-    std::map<std::string, float> mTimes;
-    std::map<std::string, float> mFlops;
-};
-void Executor::Profiler::reset() {
-    mTimes.clear();
-    mFlops.clear();
-}
-void Executor::Profiler::dump() const {
-    float sumValue = 0.0f;
-    for (auto iter : mTimes) {
-        MNN_PRINT("%s: %f ms\n", iter.first.c_str(), iter.second);
-        sumValue += iter.second;
-    }
-    MNN_PRINT("Total: %f ms\n", sumValue);
-    sumValue = 0.0f;
-    for (auto iter : mFlops) {
-        MNN_PRINT("%s: %f \n", iter.first.c_str(), iter.second);
-        sumValue += iter.second;
-    }
-    MNN_PRINT("Total flops: %f M\n", sumValue);
-}
-void Executor::Profiler::add(const std::string& opType, float timeInMs) {
-    auto iter = mTimes.find(opType);
-    if (iter == mTimes.end()) {
-        mTimes[opType] = timeInMs;
-        return;
-    }
-    iter->second += timeInMs;
-}
-void Executor::Profiler::addFlops(const std::string& opType, float flops) {
-    auto iter = mFlops.find(opType);
-    if (iter == mFlops.end()) {
-        mFlops[opType] = flops;
-        return;
-    }
-    iter->second += flops;
-}
-#endif
-
 void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& config, int numberThread) {
     std::lock_guard<std::mutex> _l(mMutex);
@@ -648,36 +602,12 @@ void Executor::makeCache(const std::vector<EXPRP>& expr, bool forceCPU) {
     //FUNC_PRINT(mCaches.size());
     _makeCache(expr, forceCPU);
 }
-void Executor::addOpCostTime(int op, float costTime) {
-#ifdef MNN_EXPR_ENABLE_PROFILER
-    auto opType = MNN::EnumNameOpType((OpType)op);
-    if (nullptr == opType) {
-        return;
-    }
-    mProfiler->add(opType, costTime);
-#endif
-}
-void Executor::addOpCostTime(const std::string& type, float costTime) {
-#ifdef MNN_EXPR_ENABLE_PROFILER
-    mProfiler->add(type, costTime);
-#endif
-}
-void Executor::addOpFlops(const std::string& type, float flops) {
-#ifdef MNN_EXPR_ENABLE_PROFILER
-    mProfiler->addFlops(type, flops);
-#endif
-}
-
 void Executor::resetProfile() {
-#ifdef MNN_EXPR_ENABLE_PROFILER
-    mProfiler->reset();
-#endif
+    // Depercated
 }
 void Executor::dumpProfile() {
-#ifdef MNN_EXPR_ENABLE_PROFILER
-    mProfiler->dump();
-#endif
+    // Depercated
 }

 bool Executor::registerSubGraph(const std::string& submoduleName, VARPS outputs, VARPS inputs) {
@@ -15,6 +15,7 @@
 #include "NMSModule.hpp"
 #include "Utils.hpp"
 #include "core/Backend.hpp"
+#include "core/WrapExecution.hpp"
 #include "utils/InitNet.hpp"
 #include "RuntimeAttr.hpp"
 #include "geometry/GeometryComputer.hpp"
@@ -490,7 +491,15 @@ static std::vector<SubModuleInfo> _createSubModuleInfo(std::shared_ptr<BufferSto
     return submodule;
 }

-static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, const SubModuleInfo& info, const std::map<std::string, SubGraph>& subs, std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Module::Config& config, std::shared_ptr<Schedule::ScheduleInfo> sharedConst, bool needGeometry) {
+struct ModuleRuntimeConfig {
+    bool needGeometry;
+    RuntimeInfo rt;
+    Backend::Info compute;
+    const BackendConfig* userConfig = nullptr;
+    Session::ModeGroup modes;
+};
+
+static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, const SubModuleInfo& info, const std::map<std::string, SubGraph>& subs, std::shared_ptr<Schedule::ScheduleInfo> sharedConst, const Module::Config& config, const ModuleRuntimeConfig& runtimeConfig) {
     auto net = flatbuffers::GetRoot<Net>(bufferStorage->buffer());
     if (1 == info.opList.size()) {
         auto op = net->oplists()->GetAs<Op>(info.opList[0]);
@@ -506,9 +515,8 @@ static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, co
         // MNN_ASSERT(false);
     }
     Schedule::ScheduleInfo scheduleInfo;
-    RuntimeInfo rt;
-    Session::ModeGroup modes;
     scheduleInfo.defaultBackend = sharedConst->defaultBackend;
+    scheduleInfo.constReplaceBackend = sharedConst->constReplaceBackend;
     scheduleInfo.allTensors = sharedConst->allTensors;
     initTensors(scheduleInfo.allTensors, net);
     std::vector<Schedule::OpCacheInfo> oplists;
@@ -522,34 +530,19 @@ static Module* _createSubModule(std::shared_ptr<BufferStorage> bufferStorage, co
     if (breakIndex >= 0) {
         scheduleInfo.needInputContentForShape = true;
     }
-    Backend::Info compute;
-    const BackendConfig* userConfig = nullptr;
-    if (nullptr == rtMgr) {
-        rt = Executor::getRuntime();
-        auto glo = ExecutorScope::Current();
-        compute.type = glo->getAttr()->firstType.first;
-        compute.numThread = glo->getAttr()->firstType.second;
-    } else {
-        modes = rtMgr->getInside()->modes;
-        rt = rtMgr->getInside()->mRuntime;
-        userConfig = &rtMgr->getInside()->mConfig;
-        compute.type = rt.first.begin()->first;
-        compute.numThread = 1;
-        // set external file info
-        if (!rtMgr->getInside()->mExternalFile.empty()) {
-            rt.first.begin()->second->setExternalFile(rtMgr->getInside()->mExternalFile);
-            rt.second->setExternalFile(rtMgr->getInside()->mExternalFile);
-        }
-    }
+    auto rt = runtimeConfig.rt;
+    auto modes = runtimeConfig.modes;
     Schedule::BackendCache bnCache;
-    if (nullptr != userConfig) {
-        bnCache.config = *userConfig;
+    Backend::Info compute = runtimeConfig.compute;
+    if (nullptr != runtimeConfig.userConfig) {
+        bnCache.config = *runtimeConfig.userConfig;
         compute.user = &bnCache.config;
     } else {
         compute.user = nullptr;
     }
     bnCache.info = std::move(compute);
-    bnCache.needComputeGeometry = needGeometry;
+    bnCache.needComputeGeometry = runtimeConfig.needGeometry;

     scheduleInfo.pipelineInfo.emplace_back(std::make_pair(std::move(bnCache), std::move(oplists)));

     std::vector<std::shared_ptr<BufferStorage>> buffers = {bufferStorage};
@@ -588,13 +581,38 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
     // Extra Const Tensors
     sharedConst.reset(new Schedule::ScheduleInfo);
     auto curExe = ExecutorScope::Current();
+    bool permitCodeGen = false;
     if (rtMgr && !rtMgr->getInside()->mExternalFile.empty()) {
         curExe->getRuntime().second->setExternalFile(rtMgr->getInside()->mExternalFile);
+        permitCodeGen = rtMgr->getInside()->modes.codegenMode == Interpreter::Session_Codegen_Enable;
     }
     std::shared_ptr<Backend> defaultBackend = curExe->getAttr()->constantBackend;
     std::vector<std::shared_ptr<Tensor>> allTensors;
     sharedConst->allTensors.resize(net->tensorName()->size());
     sharedConst->defaultBackend = defaultBackend;
+    std::shared_ptr<ModuleRuntimeConfig> modRuntimeCfgPtr(new ModuleRuntimeConfig);
+    ModuleRuntimeConfig& modRuntime = *modRuntimeCfgPtr;
+    modRuntime.needGeometry = needGeometry;
+    if (nullptr == rtMgr) {
+        modRuntime.rt = Executor::getRuntime();
+        auto glo = ExecutorScope::Current();
+        modRuntime.compute.type = glo->getAttr()->firstType.first;
+        modRuntime.compute.numThread = glo->getAttr()->firstType.second;
+    } else {
+        modRuntime.modes = rtMgr->getInside()->modes;
+        modRuntime.rt = rtMgr->getInside()->mRuntime;
+        modRuntime.userConfig = &rtMgr->getInside()->mConfig;
+        modRuntime.compute.type = modRuntime.rt.first.begin()->first;
+        modRuntime.compute.numThread = 1;
+        // set external file info
+        if (!rtMgr->getInside()->mExternalFile.empty()) {
+            modRuntime.rt.first.begin()->second->setExternalFile(rtMgr->getInside()->mExternalFile);
+            modRuntime.rt.second->setExternalFile(rtMgr->getInside()->mExternalFile);
+        }
+    }
+    auto& rt = modRuntime.rt;
+    auto firstRt = rt.first[modRuntime.compute.type];
+    sharedConst->constReplaceBackend.reset(firstRt->onCreate(modRuntime.userConfig));
     ErrorCode code = NO_ERROR;
     std::set<int> noneedComputeIndexes;
     initConstTensors(sharedConst->allTensors, net, defaultBackend.get(), code);
@@ -646,7 +664,7 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
     auto subModulesInfo = _createSubModuleInfo(bufferStorage, inputIndexes, outputIndexes, noneedComputeIndexes, sharedConst);
     std::vector<std::shared_ptr<Module>> subModules(subModulesInfo.size());
     for (int i=0; i<subModulesInfo.size(); ++i) {
-        subModules[i].reset(_createSubModule(bufferStorage, subModulesInfo[i], subGraphMap, rtMgr, *config, sharedConst, needGeometry));
+        subModules[i].reset(_createSubModule(bufferStorage, subModulesInfo[i], subGraphMap, sharedConst, *config, modRuntime));
     }
     auto result = new PipelineModule;
     result->mInputSize = inputs.size();
@@ -702,8 +720,45 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
     }
     result->registerModel(subModules);
     result->mSharedConst = sharedConst;
+    if (!permitCodeGen) {
+        // Prereplace const tensor
+        auto curBackend = sharedConst->constReplaceBackend.get();
+        if (sharedConst->constReplaceBackend->type() != sharedConst->defaultBackend->type()) {
+            for (auto& t : sharedConst->allTensors) {
+                if (nullptr == t.get()) {
+                    continue;
+                }
+                auto des = TensorUtils::getDescribe(t.get());
+                if (des->isMutable) {
+                    continue;
+                }
+                if (!WrapExecution::needWrap(t.get(), curBackend)) {
+                    continue;
+                }
+                if (des->stageMask & Tensor::InsideDescribe::GEOMETRY_STAGE) {
+                    continue;
+                }
+                if (des->stageMask & Tensor::InsideDescribe::CONVERTED_STAGE) {
+                    continue;
+                }
+                std::shared_ptr<Tensor> wrapTensor = WrapExecution::makeCopyTensor(t.get(), curBackend);
+                auto outDes = TensorUtils::getDescribe(wrapTensor.get());
+                outDes->usage = des->usage;
+                auto tempRes = curBackend->onAcquireBuffer(wrapTensor.get(), Backend::STATIC);
+                if (!tempRes) {
+                    continue;
+                }
+                outDes->setBackend(curBackend);
+                curBackend->onCopyBuffer(t.get(), wrapTensor.get());
+                outDes->stageMask |= Tensor::InsideDescribe::CONVERTED_STAGE;
+                TensorUtils::getDescribeOrigin(t.get())->mContent = TensorUtils::getDescribeOrigin(wrapTensor.get())->mContent;
+                t->buffer().host = wrapTensor->buffer().host;
+                t->buffer().device = wrapTensor->buffer().device;
+                t->buffer().dim = TensorUtils::getDescribe(wrapTensor.get())->dims;
+            }
+        }
+    }
     return result;
 }

 Module* PipelineModule::clone(CloneContext* ctx) const {
@@ -430,6 +430,8 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
                 outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.second;
             } else if (backend == mResource->mSharedConst->defaultBackend.get()) {
                 outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = mResource->mSharedConst->defaultBackend;
+            } else if (backend == mResource->mSharedConst->constReplaceBackend.get()) {
+                outputs[mResource->mOutputFromTensor[i]]->expr().first->inside()->mHoldBackend = mResource->mSharedConst->constReplaceBackend;
             }
         }
     }
@@ -195,6 +195,7 @@ public:
         MAX_TUNING_NUMBER = 0,
         // Strictly check model file or not, default 1. if set 0, will not check model file valid/invalid
         STRICT_CHECK_MODEL = 1,
+        MEM_ALLOCATOR_TYPE = 2,
     };
     /**
      * @brief The API shoud be called before create session.
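A hedged usage sketch for the new hint; the assumption here (not shown in this hunk) is that value 1 selects the deferred dynamic allocator introduced later in this sync and 0 keeps the eager default:

```cpp
#include <MNN/Interpreter.hpp>
#include <memory>

int main() {
    std::shared_ptr<MNN::Interpreter> net(MNN::Interpreter::createFromFile("model.mnn")); // placeholder path
    // Assumption: 1 = deferred allocator, 0 = eager allocator.
    net->setSessionHint(MNN::Interpreter::MEM_ALLOCATOR_TYPE, 1);
    MNN::ScheduleConfig conf;
    auto session = net->createSession(conf);
    return session != nullptr ? 0 : 1;
}
```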
@@ -68,7 +68,7 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
 #define STR_IMP(x) #x
 #define STR(x) STR_IMP(x)
 #define MNN_VERSION_MAJOR 2
-#define MNN_VERSION_MINOR 6
-#define MNN_VERSION_PATCH 3
+#define MNN_VERSION_MINOR 7
+#define MNN_VERSION_PATCH 0
 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
 #endif /* MNNDefine_h */
@@ -68,11 +68,6 @@ public:
     struct SubGraph;
     bool registerSubGraph(const std::string& submoduleName, VARPS outputs, VARPS inputs);
     std::shared_ptr<SubGraph> findSubGraph(const std::string& submoduleName);
-    /**Internal Usage Begin*/
-    void addOpCostTime(int op, float costTime);
-    void addOpCostTime(const std::string& type, float costTime);
-    void addOpFlops(const std::string& type, float flops);
-    /**Internal Usage End*/
     static RuntimeInfo getRuntime();
     void setCallBack(TensorCallBackWithInfo&& before, TensorCallBackWithInfo&& after);
     const DebugTools* getDebugTools() const {
@@ -50,7 +50,7 @@ ErrorCode CastWrapExecution::onExecute(const std::vector<Tensor*>& inputs, const
 }

 CPURuntime::CPURuntime(const Backend::Info& info) {
-    mStaticAllocator.reset(new BufferAllocator(BufferAllocator::Allocator::createDefault()));
+    mStaticAllocator.reset(new EagerBufferAllocator(BufferAllocator::Allocator::createDefault()));
     mThreadNumber = info.numThread;
     mThreadNumber = std::max(1, mThreadNumber);
     mThreadNumber = std::min(mThreadNumber, MAX_THREAD_NUMBER);
@@ -64,6 +64,7 @@ CPURuntime::CPURuntime(const Backend::Info& info) {
         mMemory = info.user->memory;
         mFlags = info.user->flags;
     }
+    mAllocator = info.allocator;

 #ifdef _OPENMP
     switch (mPower) {
@@ -218,7 +219,11 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
     mMemory = memory;
     mRuntime = const_cast<CPURuntime*>(runtime);
     std::shared_ptr<BufferAllocator::Allocator> defaultAlloc(BufferAllocator::Allocator::createRecurse(runtime->mStaticAllocator.get()));
-    mDynamicAllocator.reset(new BufferAllocator(defaultAlloc));
+    if (mRuntime->getAllocatorType() == Runtime::Allocator_Defer) {
+        mDynamicAllocator.reset(new DeferBufferAllocator(defaultAlloc));
+    } else {
+        mDynamicAllocator.reset(new EagerBufferAllocator(defaultAlloc));
+    }
     mStaticAllocator = runtime->mStaticAllocator;
     mPrecisionMode = precision;
     mCoreFunctions = MNNGetCoreFunctions();
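The split above is between an eager allocator (each alloc() hands back real memory immediately) and a deferred one (alloc() during resize only records a chunk; compute(), called from onResizeEnd below, lays the chunks out and assigns addresses). A toy sketch of the deferred idea, not MNN's DeferBufferAllocator:

```cpp
#include <cstddef>
#include <vector>

// Toy sketch only: record allocation requests, lay them out later.
class ToyDeferAllocator {
public:
    size_t alloc(size_t size) {              // returns a chunk id; no memory yet
        mSizes.push_back(size);
        return mSizes.size() - 1;
    }
    void compute() {                         // naive layout: stack chunks end to end
        size_t offset = 0;
        mOffsets.resize(mSizes.size());
        for (size_t i = 0; i < mSizes.size(); ++i) {
            mOffsets[i] = offset;
            offset += mSizes[i];
        }
        mPool.resize(offset);
    }
    void* ptr(size_t id) {                   // only valid after compute()
        return mPool.data() + mOffsets[id];
    }
private:
    std::vector<size_t> mSizes, mOffsets;
    std::vector<char> mPool;
};
```

A real implementation also tracks frees so chunks with disjoint lifetimes can share memory; the toy above skips that.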
@@ -238,24 +243,14 @@ void CPUBackend::onExecuteEnd() const {
     mRuntime->onConcurrencyEnd();
 }

-class CPUMemObj : public Backend::MemObj {
-public:
-    CPUMemObj(BufferAllocator* allocator, std::pair<void*, int> points, int size) {
-        mPoint = std::move(points);
-        mAllocator = allocator;
-        mSize = size;
-    }
-    virtual ~ CPUMemObj() {
-        mAllocator->free(mPoint);
-    }
-    inline int getSize() const {
-        return mSize;
-    }
-private:
-    BufferAllocator* mAllocator;
-    std::pair<void*, int> mPoint;
-    int mSize;
-};
+void CPUBackend::onResizeBegin() {
+    mDynamicAllocator->reset();
+}
+
+void CPUBackend::onResizeEnd() {
+    getCache()->release();
+    mDynamicAllocator->compute();
+}

 Backend::MemObj* CPUBackend::allocBuffer(int size, Tensor* dest, StorageType storageType) {
     auto originMem = TensorUtils::getDescribe(dest)->mem.get();
@@ -277,35 +272,41 @@ Backend::MemObj* CPUBackend::allocBuffer(int size, Tensor* dest, StorageType sto
     // }
     auto& buffer = dest->buffer();
     auto des = TensorUtils::getDescribe(dest);
-    std::pair<void*, int> points;
+    MemChunk chunk;
     switch (storageType) {
         case STATIC: {
-            points = mStaticAllocator->alloc(size, false);
+            chunk = mStaticAllocator->alloc(size, false);
             break;
         }
         case DYNAMIC: {
-            points = mDynamicAllocator->alloc(size, false);
+            chunk = mDynamicAllocator->alloc(size, false);
            break;
         }
         case DYNAMIC_SEPERATE: {
-            points = mDynamicAllocator->alloc(size, true);
+            chunk = mDynamicAllocator->alloc(size, true);
             break;
         }
         default:
             MNN_ASSERT(false);
             break;
     }
-    if (nullptr == points.first) {
+    if (chunk.invalid()) {
         MNN_ERROR("Alloc buffer error for cpu backend\n");
         return nullptr;
     }

     Backend::MemObj* res = nullptr;

     if (storageType == STATIC) {
-        res = new CPUMemObj(mStaticAllocator.get(), points, size);
+        res = new CPUMemObj(mStaticAllocator.get(), chunk, size);
     } else {
-        res = new CPUMemObj(mDynamicAllocator.get(), points, size);
+        res = new CPUMemObj(mDynamicAllocator.get(), chunk, size);
+        chunk.attach(dest);
+    }
+    if (chunk.ptr()) {
+        buffer.host = chunk.ptr();
     }
-    buffer.host = (uint8_t*)points.first + points.second;
     des->extra.offset = 0;
     return res;
 }
@@ -13,10 +13,10 @@
 #include <memory>
 #include "core/Backend.hpp"
 #include "core/Execution.hpp"
+#include "core/BufferAllocator.hpp"
 #include "MNN_generated.h"

 namespace MNN {
-class BufferAllocator;
 class CPURuntime : public Runtime {
 public:
     friend class CPUBackend;
@@ -35,7 +35,7 @@ public:


 private:
-    std::shared_ptr<BufferAllocator> mStaticAllocator;
+    std::shared_ptr<EagerBufferAllocator> mStaticAllocator;
     int mThreadNumber;
     mutable int mTaskIndex;
     BackendConfig::MemoryMode mMemory;
@@ -47,11 +47,31 @@ private:
     float mFlops = 0.0f;
     static Backend*(*gExtraCreate)(const Runtime* runtime);
     size_t mFlags = 0;
+    int mAllocator = 0;
 };
 struct CoreFunctions;
 struct CoreInt8Functions;

 class CPUResizeCache;
+class CPUMemObj : public Backend::MemObj {
+public:
+    CPUMemObj(BufferAllocator* allocator, MemChunk chunk, int size) : mAllocator(allocator), mChunk(chunk), mSize(size) {}
+    virtual ~ CPUMemObj() {
+        if (mAllocator) {
+            mAllocator->free(mChunk);
+        }
+    }
+    virtual MemChunk chunk() {
+        return mChunk;
+    }
+    inline int getSize() const {
+        return mSize;
+    }
+private:
+    BufferAllocator* mAllocator;
+    MemChunk mChunk;
+    int mSize;
+};
 class CPUBackend : public Backend {
 public:
     CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, BackendConfig::MemoryMode memory, MNNForwardType type = MNN_FORWARD_CPU, size_t flags = 0);
@@ -69,6 +89,9 @@ public:

     virtual void onExecuteBegin() const override;
     virtual void onExecuteEnd() const override;
+
+    virtual void onResizeBegin() override;
+    virtual void onResizeEnd() override;

     const CoreFunctions* functions() const {
         return mCoreFunctions;
@@ -91,7 +114,7 @@ public:
         return mRuntime->mThreadNumber;
     }

-    BufferAllocator* getBufferAllocator() const {
+    BufferAllocator* getBufferAllocator(bool defer_allocator = true) const {
         return mDynamicAllocator.get();
     }

@@ -120,7 +143,7 @@ protected:
     const CoreFunctions* mCoreFunctions;
     const CoreInt8Functions* mInt8CoreFunctions;
 private:
-    std::shared_ptr<BufferAllocator> mStaticAllocator;
+    std::shared_ptr<EagerBufferAllocator> mStaticAllocator;
     std::shared_ptr<BufferAllocator> mDynamicAllocator;
     CPURuntime* mRuntime;
     BackendConfig::PrecisionMode mPrecisionMode;
@@ -208,9 +208,9 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect
             }
         }
     };
-    auto biasP = inputs[2]->host<uint8_t>();
-    auto weightP = inputs[1]->host<uint8_t>();
     mExecutor = [=](const uint8_t* srcOrigin, uint8_t* dstOrigin, int tId) {
+        auto biasP = inputs[2]->host<uint8_t>();
+        auto weightP = inputs[1]->host<uint8_t>();
         for (int index = tId; index < total; index += numberThread) {
             int dz = index / batch;
             auto dst_z = dstOrigin + dst_z_step * index * bytes;
@@ -241,6 +241,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
     CPUDeconvolutionBasic::onResize(inputs, outputs);
     auto core = static_cast<CPUBackend*>(backend())->functions();
     auto gcore = static_cast<CPUBackend*>(backend())->int8Functions();
+    int bytes = core->bytes;
     auto input = inputs[0];
     auto output = outputs[0];
     auto oc = output->channel();
@@ -270,6 +271,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
     mPostFunctions.clear();
     auto plane = width * height * batch;
     const int maxDepth = 5;
+    auto allocator = static_cast<CPUBackend*>(backend())->getBufferAllocator();
     //int zeroPoint = 0;

     auto biasPtr = inputs[2]->host<float>();
@@ -284,6 +286,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
     auto zeroPoint = outputQuant[1];

     AutoRelease<Tensor> tempInput(Tensor::createDevice<float>({icC4, plane, core->pack}));
+    bool needReleaseTempInput = true;
     int outi8 = 0;
     if (CPUBackend::getDataType(output) == DataType_DT_INT8 || output->getType().bytes() == 1) {
         outi8 = 1;
@@ -306,28 +309,28 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
             return OUT_OF_MEMORY;
         }
         mMatMul.reset(new StrassenMatrixComputor(backend(), true, maxDepth));
-        tempInput->buffer().host = (uint8_t*)inputPtr;
+        // tempInput->buffer().host = (uint8_t*)inputPtr;
+        needReleaseTempInput = false;
+        TensorUtils::getDescribe(tempInput.get())->mem.reset(new CPUMemObj(nullptr, TensorUtils::getDescribe(input)->mem->chunk(), 0));
         mMatMul->onEncode({tempInput.get(), inputs[1]}, {mTempOutput.get()});
     }
-    auto colBufferPtr = mTempOutput->host<uint8_t>();
     auto threadNumber = ((CPUBackend*)backend())->threadNumber();
     std::vector<float> scales(core->pack * src_height * src_width * batch, scale);
-    std::shared_ptr<Tensor> OutputFloat(Tensor::createDevice<float>({batch, src_height, src_width, ocC4 * core->pack}));
-    auto res = backend()->onAcquireBuffer(OutputFloat.get(), Backend::DYNAMIC);
-    if (!res) {
+    auto outputFp32Ptr = allocator->alloc(batch * src_height * src_width * ocC4 * core->pack * bytes);
+    if (outputFp32Ptr.invalid()) {
         return OUT_OF_MEMORY;
     }
-    auto outputFp32Ptr = OutputFloat->host<uint8_t>();

-    mPostFunctions.emplace_back(std::make_pair([colBufferPtr, ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY,
+    mPostFunctions.emplace_back(std::make_pair([ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY,
         strideX, threadNumber, src_width, src_height, plane, biasPtr, this, core, gcore, batch, outi8, scales,
         minValue, maxValue, zeroPoint, outputFp32Ptr](uint8_t* outputPtr, int tId) {
+        auto colBufferPtr = mTempOutput->host<uint8_t>();
         auto unitBytes = core->pack * core->bytes;
         auto tempOutPtr = outputPtr;
         auto float2Int8_step = src_height * src_width * batch;
         if (outi8) {
-            tempOutPtr = outputFp32Ptr;
+            tempOutPtr = outputFp32Ptr.ptr();
         }
         for (int z = (tId); z < ocC4; z += threadNumber) {
             auto dstZ = tempOutPtr + z * src_height * src_width * batch * unitBytes;
@@ -367,7 +370,16 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector<Tensor*>& inputs, c
             }
         }
     }, threadNumber));
-    if (tempInput->host<float>() != inputPtr) {
+    /*
+    if (TensorUtils::getDescribe(tempInput.get())->mem->chunk().offset() != TensorUtils::getDescribe(input)->mem->chunk().offset()) {
+        backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC);
+    }
+    if (tempInput->host<float>() != inputPtr) {
+        backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC);
+    }
+    */
+    allocator->free(outputFp32Ptr);
+    if (needReleaseTempInput) {
         backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC);
     }
     backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
@@ -7,51 +7,26 @@
 //

 #include <cmath>
+#include "backend/cpu/CPULayerNorm.hpp"
+#include "backend/cpu/CPUBackend.hpp"
+#include "backend/cpu/compute/CommonOptFunction.h"
 #include "core/Execution.hpp"
 #include "core/Concurrency.h"
 #include "core/OpCommonUtils.hpp"
-#include "backend/cpu/CPUBackend.hpp"
-#include "backend/cpu/compute/CommonOptFunction.h"
 #include "MNN_generated.h"


 namespace MNN {

-class CPULayerNorm : public Execution {
-public:
-    explicit CPULayerNorm(const MNN::Op* op, Backend* backend);
-    virtual ~CPULayerNorm();
-
-    ErrorCode onExecute(const std::vector<Tensor*> &inputs, // NOLINT
-                        const std::vector<Tensor*> &outputs) override;
-
-    ErrorCode onResize(const std::vector<Tensor*> &inputs, // NOLINT
-                       const std::vector<Tensor*> &outputs) override;
-private:
-    bool allocGammaBeta(int size);
-private:
-    int axis_size = 0;
-    int inner_size_ = 1;
-    int outter_size_ = 1;
-    int group_ = 1;
-    float epsilon_ = 0.001;
-
-    std::unique_ptr<Tensor> gamma_;
-    std::unique_ptr<Tensor> beta_;
-    bool has_gamma_beta_ = false;
-};
-
 bool CPULayerNorm::allocGammaBeta(int size) {
-    has_gamma_beta_ = true;
-    gamma_.reset(Tensor::createDevice<float>({size}));
-    auto status = backend()->onAcquireBuffer(gamma_.get(), Backend::STATIC);
+    mIniGammaBeta = true;
+    mGamma.reset(Tensor::createDevice<float>({size}));
+    auto status = backend()->onAcquireBuffer(mGamma.get(), Backend::STATIC);
     if (!status) {
         MNN_ERROR("Out of memory when gamma is acquired in CPULayerNorm.\n");
         return false;
     }
-    beta_.reset(Tensor::createDevice<float>({size}));
-    status = backend()->onAcquireBuffer(beta_.get(), Backend::STATIC);
+    mBeta.reset(Tensor::createDevice<float>({size}));
+    status = backend()->onAcquireBuffer(mBeta.get(), Backend::STATIC);
     if (!status) {
         MNN_ERROR("Out of memory when beta is acquired in CPULayerNorm.\n");
         return false;
@@ -59,17 +34,16 @@ bool CPULayerNorm::allocGammaBeta(int size) {
     return true;
 }

-CPULayerNorm::CPULayerNorm(const MNN::Op* op, Backend* backend)
-    : Execution(backend) {
+CPULayerNorm::CPULayerNorm(const MNN::Op* op, Backend* backend) : Execution(backend) {
     const auto* layer_norm_param = op->main_as_LayerNorm();
-    axis_size = layer_norm_param->axis()->size();
-    group_ = layer_norm_param->group();
-    epsilon_ = layer_norm_param->epsilon();
+    mAxis = layer_norm_param->axis()->size();
+    mGroup = layer_norm_param->group();
+    mEpsilon = layer_norm_param->epsilon();

     if (USE_EXTERNAL_DATA(layer_norm_param)) {
-        auto size = layer_norm_param->external()->Get(1);
+        int32_t size = static_cast<int32_t>(layer_norm_param->external()->Get(1));
         allocGammaBeta(size);
-        OpCommonUtils::loadExternalDatas(backend, {gamma_->host<char>(), beta_->host<char>()}, layer_norm_param->external()->data());
+        OpCommonUtils::loadExternalDatas(backend, {mGamma->host<char>(), mBeta->host<char>()}, layer_norm_param->external()->data());
         return;
     }

@ -80,23 +54,44 @@ CPULayerNorm::CPULayerNorm(const MNN::Op* op, Backend* backend)
|
||||||
}
|
}
|
||||||
allocGammaBeta(size);
|
allocGammaBeta(size);
|
||||||
const float* gamma_data = layer_norm_param->gamma()->data();
|
const float* gamma_data = layer_norm_param->gamma()->data();
|
||||||
memcpy(gamma_->host<float>(), gamma_data, size * sizeof(float));
|
memcpy(mGamma->host<float>(), gamma_data, size * sizeof(float));
|
||||||
const float* beta_data = layer_norm_param->beta()->data();
|
const float* beta_data = layer_norm_param->beta()->data();
|
||||||
memcpy(beta_->host<float>(), beta_data, size * sizeof(float));
|
memcpy(mBeta->host<float>(), beta_data, size * sizeof(float));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorCode CPULayerNorm::onExecute(const std::vector<Tensor*> &inputs,
|
ErrorCode CPULayerNorm::onExecute(const std::vector<Tensor*> &inputs,
|
||||||
const std::vector<Tensor*> &outputs) {
|
const std::vector<Tensor*> &outputs) {
|
||||||
const float* gamma = has_gamma_beta_ ? gamma_->host<float>() : nullptr;
|
const float* gamma = mIniGammaBeta ? mGamma->host<float>() : nullptr;
|
||||||
const float* beta = has_gamma_beta_ ? beta_->host<float>() : nullptr;
|
const float* beta = mIniGammaBeta ? mBeta->host<float>() : nullptr;
|
||||||
|
|
||||||
|
if (mInpZero.data()) {
|
||||||
|
auto core = static_cast<CPUBackend*>(backend())->int8Functions();
|
||||||
|
|
||||||
|
const int8_t* input = inputs[0]->host<int8_t>();
|
||||||
|
int8_t* output = outputs[0]->host<int8_t>();
|
||||||
|
MNN_CONCURRENCY_BEGIN(tId, mOutterSize) {
|
||||||
|
QuanPrePostParameters params;
|
||||||
|
params.maxValue = mMaxMinValue[0];
|
||||||
|
params.minValue = mMaxMinValue[1];
|
||||||
|
params.inputScale = mInpScale.data();
|
||||||
|
params.outputScale = mOutScale.data();
|
||||||
|
params.inputZeroPoint = mInpZero.data();
|
||||||
|
params.outputZeroPoint = mOutZero.data();
|
||||||
|
const int8_t* inner_input = input + tId * mInnerSize;
|
||||||
|
int8_t* inner_output = output + tId * mInnerSize;
|
||||||
|
core->MNNNormInt8(inner_output, inner_input, gamma, beta, mEpsilon, mInnerSize, ¶ms);
|
||||||
|
}
|
||||||
|
MNN_CONCURRENCY_END();
|
||||||
|
return NO_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
const float* input = inputs.at(0)->host<float>();
|
const float* input = inputs.at(0)->host<float>();
|
||||||
float* output = outputs.at(0)->host<float>();
|
float* output = outputs.at(0)->host<float>();
|
||||||
MNN_CONCURRENCY_BEGIN(tId, outter_size_) {
|
MNN_CONCURRENCY_BEGIN(tId, mOutterSize) {
|
||||||
const float* inner_input = input + tId * inner_size_;
|
const float* inner_input = input + tId * mInnerSize;
|
||||||
float* inner_output = output + tId * inner_size_;
|
float* inner_output = output + tId * mInnerSize;
|
||||||
MNNNorm(inner_output, inner_input, gamma, beta, epsilon_, inner_size_);
|
MNNNorm(inner_output, inner_input, gamma, beta, mEpsilon, mInnerSize);
|
||||||
}
|
}
|
||||||
MNN_CONCURRENCY_END();
|
MNN_CONCURRENCY_END();
|
||||||
return NO_ERROR;
|
return NO_ERROR;
|
||||||
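The float path above hands each row of mInnerSize elements to MNNNorm, with gamma/beta left null when the op carries no scale parameters. For reference, a plain scalar sketch of what such a per-row layer normalization computes; NaiveNorm is only an illustrative name, not the vectorized MNN kernel.

#include <cmath>
#include <cstddef>

// Scalar reference for one row of `size` floats:
// y = (x - mean) / sqrt(var + eps), optionally scaled by gamma and shifted by beta.
static void NaiveNorm(float* dst, const float* src, const float* gamma,
                      const float* beta, float epsilon, size_t size) {
    float mean = 0.f;
    for (size_t i = 0; i < size; ++i) {
        mean += src[i];
    }
    mean /= (float)size;
    float var = 0.f;
    for (size_t i = 0; i < size; ++i) {
        float d = src[i] - mean;
        var += d * d;
    }
    var /= (float)size;
    float invStd = 1.0f / std::sqrt(var + epsilon);
    for (size_t i = 0; i < size; ++i) {
        float v = (src[i] - mean) * invStd;
        dst[i] = (gamma != nullptr && beta != nullptr) ? v * gamma[i] + beta[i] : v;
    }
}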
|
@ -104,40 +99,53 @@ ErrorCode CPULayerNorm::onExecute(const std::vector<Tensor*> &inputs,
|
||||||
|
|
||||||
ErrorCode CPULayerNorm::onResize(const std::vector<Tensor*> &inputs,
|
ErrorCode CPULayerNorm::onResize(const std::vector<Tensor*> &inputs,
|
||||||
const std::vector<Tensor*> &outputs) {
|
const std::vector<Tensor*> &outputs) {
|
||||||
outter_size_ = 1;
|
mOutterSize = 1;
|
||||||
inner_size_ = 1;
|
mInnerSize = 1;
|
||||||
int rank = inputs.at(0)->dimensions();
|
int rank = inputs.at(0)->dimensions();
|
||||||
if (group_ > 1) {
|
if (mGroup > 1) {
|
||||||
outter_size_ = inputs.at(0)->length(0) * group_;
|
mOutterSize = inputs.at(0)->length(0) * mGroup;
|
||||||
for (int i = 1; i < rank; i++) {
|
for (int i = 1; i < rank; i++) {
|
||||||
inner_size_ *= inputs.at(0)->length(i);
|
mInnerSize *= inputs.at(0)->length(i);
|
||||||
}
|
}
|
||||||
inner_size_ /= group_;
|
mInnerSize /= mGroup;
|
||||||
return NO_ERROR;
|
return NO_ERROR;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < rank - axis_size; ++i) {
|
for (int i = 0; i < rank - mAxis; ++i) {
|
||||||
outter_size_ *= inputs.at(0)->length(i);
|
mOutterSize *= inputs.at(0)->length(i);
|
||||||
}
|
}
|
||||||
for (int i = rank - axis_size; i < rank; ++i) {
|
for (int i = rank - mAxis; i < rank; ++i) {
|
||||||
inner_size_ *= inputs.at(0)->length(i);
|
mInnerSize *= inputs.at(0)->length(i);
|
||||||
|
}
|
||||||
|
if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) {
|
||||||
|
mInpZero.resize(1);
|
||||||
|
mOutZero.resize(1);
|
||||||
|
mInpScale.resize(1);
|
||||||
|
mOutScale.resize(1);
|
||||||
|
mMaxMinValue.resize(2);
|
||||||
|
auto inpQuantAttr = TensorUtils::getDescribe(inputs[0])->quantAttr;
|
||||||
|
auto outQuantAttr = TensorUtils::getDescribe(outputs[0])->quantAttr;
|
||||||
|
mInpZero[0] = inpQuantAttr->zero;
|
||||||
|
mOutZero[0] = outQuantAttr->zero;
|
||||||
|
mInpScale[0] = inpQuantAttr->scale;
|
||||||
|
mOutScale[0] = outQuantAttr->scale == 0.f? 0.f : 1.0f / outQuantAttr->scale;
|
||||||
|
mMaxMinValue[0] = outQuantAttr->max;
|
||||||
|
mMaxMinValue[1] = outQuantAttr->min;
|
||||||
}
|
}
|
||||||
return NO_ERROR;
|
return NO_ERROR;
|
||||||
}
|
}
|
||||||
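In the int8 branch above, onResize caches the input scale and zero point, the reciprocal of the output scale (guarded against a zero scale), the output zero point, and the output clamp range; onExecute then forwards them to MNNNormInt8 through QuanPrePostParameters. A hedged sketch of the output-side requantization those values imply; RequantizeOne is illustrative only, and the exact rounding used inside MNNNormInt8 is an assumption.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Map a normalized float back to int8 using a precomputed 1/outputScale,
// an output zero point, and the clamp range taken from the tensor's quant attributes.
static int8_t RequantizeOne(float normalized, float outScaleRecip, float outZero,
                            float minValue, float maxValue) {
    float q = std::round(normalized * outScaleRecip) + outZero; // rounding mode assumed
    q = std::min(std::max(q, minValue), maxValue);              // clamp to [min, max]
    return (int8_t)q;
}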
|
|
||||||
CPULayerNorm::~CPULayerNorm() {
|
CPULayerNorm::~CPULayerNorm() {
|
||||||
if (gamma_.get()) {
|
if (mGamma.get()) {
|
||||||
backend()->onReleaseBuffer(gamma_.get(), Backend::STATIC);
|
backend()->onReleaseBuffer(mGamma.get(), Backend::STATIC);
|
||||||
}
|
}
|
||||||
if (beta_.get()) {
|
if (mBeta.get()) {
|
||||||
backend()->onReleaseBuffer(beta_.get(), Backend::STATIC);
|
backend()->onReleaseBuffer(mBeta.get(), Backend::STATIC);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
class CPULayerNormCreator : public CPUBackend::Creator {
|
class CPULayerNormCreator : public CPUBackend::Creator {
|
||||||
public:
|
public:
|
||||||
Execution* onCreate(const std::vector<Tensor*>& inputs,
|
Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op, Backend* backend) const override {
|
||||||
const std::vector<Tensor*>& outputs,
|
|
||||||
const MNN::Op* op, Backend* backend) const override {
|
|
||||||
return new CPULayerNorm(op, backend);
|
return new CPULayerNorm(op, backend);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
@ -0,0 +1,41 @@
|
||||||
|
//
|
||||||
|
// CPULayerNorm.hpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2023/07/11
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#ifndef CPULayerNorm_hpp
|
||||||
|
#define CPULayerNorm_hpp
|
||||||
|
|
||||||
|
#include "core/Execution.hpp"
|
||||||
|
#include "core/Macro.h"
|
||||||
|
namespace MNN {
|
||||||
|
class CPULayerNorm : public Execution {
|
||||||
|
public:
|
||||||
|
explicit CPULayerNorm(const MNN::Op* op, Backend* backend);
|
||||||
|
virtual ~CPULayerNorm();
|
||||||
|
|
||||||
|
ErrorCode onExecute(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
|
||||||
|
ErrorCode onResize(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
|
||||||
|
private:
|
||||||
|
bool allocGammaBeta(int size);
|
||||||
|
private:
|
||||||
|
int mAxis = 0;
|
||||||
|
int mInnerSize = 1;
|
||||||
|
int mOutterSize = 1;
|
||||||
|
int mGroup = 1;
|
||||||
|
float mEpsilon = 0.001;
|
||||||
|
std::unique_ptr<Tensor> mGamma;
|
||||||
|
std::unique_ptr<Tensor> mBeta;
|
||||||
|
bool mIniGammaBeta = false;
|
||||||
|
// LayerNormInt8 parameters.
|
||||||
|
std::vector<float> mInpScale;
|
||||||
|
std::vector<float> mOutScale;
|
||||||
|
std::vector<ssize_t> mInpZero;
|
||||||
|
std::vector<ssize_t> mOutZero;
|
||||||
|
std::vector<ssize_t> mMaxMinValue;
|
||||||
|
};
|
||||||
|
} // namespace MNN
|
||||||
|
#endif /* CPULayerNorm_hpp */
|
|
@ -14,6 +14,7 @@
|
||||||
#include "core/Macro.h"
|
#include "core/Macro.h"
|
||||||
#include "core/Concurrency.h"
|
#include "core/Concurrency.h"
|
||||||
#include "core/BufferAllocator.hpp"
|
#include "core/BufferAllocator.hpp"
|
||||||
|
#include "core/TensorUtils.hpp"
|
||||||
#include "math/Vec.hpp"
|
#include "math/Vec.hpp"
|
||||||
|
|
||||||
|
|
||||||
|
@ -94,40 +95,36 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
|
||||||
auto ATPtrAlloc = bufferAlloc->alloc(UP_DIV(l, core->pack) * e * core->pack * core->bytes);
|
auto ATPtrAlloc = bufferAlloc->alloc(UP_DIV(l, core->pack) * e * core->pack * core->bytes);
|
||||||
auto BTPtrAlloc = bufferAlloc->alloc(UP_DIV(h, hP) * UP_DIV(l, lP) * lP * hP * core->bytes);
|
auto BTPtrAlloc = bufferAlloc->alloc(UP_DIV(h, hP) * UP_DIV(l, lP) * lP * hP * core->bytes);
|
||||||
auto CTPtrAlloc = bufferAlloc->alloc(UP_DIV(h, core->pack) * e * core->pack * core->bytes);
|
auto CTPtrAlloc = bufferAlloc->alloc(UP_DIV(h, core->pack) * e * core->pack * core->bytes);
|
||||||
if (nullptr == ATPtrAlloc.first || nullptr == BTPtrAlloc.first || nullptr == CTPtrAlloc.first) {
|
if (ATPtrAlloc.invalid() || BTPtrAlloc.invalid() || CTPtrAlloc.invalid()) {
|
||||||
return OUT_OF_MEMORY;
|
return OUT_OF_MEMORY;
|
||||||
}
|
}
|
||||||
auto BTPtr = (uint8_t*)BTPtrAlloc.first + BTPtrAlloc.second;
|
|
||||||
auto ATPtr = (uint8_t*)ATPtrAlloc.first + ATPtrAlloc.second;
|
|
||||||
auto CTPtr = (uint8_t*)CTPtrAlloc.first + CTPtrAlloc.second;
|
|
||||||
|
|
||||||
float* BTempPtr = (float*)BTPtr;
|
|
||||||
int numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1;
|
int numberThread = mSupportMultiThread ? ((CPUBackend*)backend())->threadNumber() : 1;
|
||||||
mPreFunctions.emplace_back(std::make_pair([BTempPtr, l, h, this, core] (int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
mPreFunctions.emplace_back(std::make_pair([BTPtrAlloc, l, h, this, core] (int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
||||||
core->MNNPackForMatMul_B(BTempPtr, BPtr, h, l, mTransposeB);
|
core->MNNPackForMatMul_B((float*)BTPtrAlloc.ptr(), BPtr, h, l, mTransposeB);
|
||||||
} , 1));
|
} , 1));
|
||||||
if (mTransposeA) {
|
if (mTransposeA) {
|
||||||
// l, e -> lC4, e, 4
|
// l, e -> lC4, e, 4
|
||||||
mPreFunctions.emplace_back(std::make_pair([ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
mPreFunctions.emplace_back(std::make_pair([ATPtrAlloc, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
||||||
int offset[] = {
|
int offset[] = {
|
||||||
e, e
|
e, e
|
||||||
};
|
};
|
||||||
core->MNNPackCUnit((float*)ATPtr, APtr, e, l, offset);
|
core->MNNPackCUnit((float*)ATPtrAlloc.ptr(), APtr, e, l, offset);
|
||||||
}, 1));
|
}, 1));
|
||||||
} else {
|
} else {
|
||||||
// e, l -> lC4, e, 4
|
// e, l -> lC4, e, 4
|
||||||
mPreFunctions.emplace_back(std::make_pair(
|
mPreFunctions.emplace_back(std::make_pair(
|
||||||
[ATPtr, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
[ATPtrAlloc, e, l, core](int tId, const float* APtr, const float* BPtr, const float* Bias) {
|
||||||
int offset[] = {
|
int offset[] = {
|
||||||
e, e
|
e, e
|
||||||
};
|
};
|
||||||
core->MNNPackCUnitTranspose((float*)ATPtr, APtr, e, l, offset);
|
core->MNNPackCUnitTranspose((float*)ATPtrAlloc.ptr(), APtr, e, l, offset);
|
||||||
}, 1));
|
}, 1));
|
||||||
}
|
}
|
||||||
bool useBias = false;
|
bool useBias = false;
|
||||||
uint8_t* biasPtr = nullptr;
|
|
||||||
std::vector<float> postParameters;
|
std::vector<float> postParameters;
|
||||||
std::pair<void*, int> bdestAlloc = std::make_pair(nullptr, 0);
|
MemChunk bdestAlloc;
|
||||||
|
bool bdestNeedFree = false;
|
||||||
if (inputs.size() > 2) {
|
if (inputs.size() > 2) {
|
||||||
auto bias = inputs[2];
|
auto bias = inputs[2];
|
||||||
useBias = true;
|
useBias = true;
|
||||||
|
@ -136,19 +133,20 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
|
||||||
mStrassenUseBiasDirectly = false;
|
mStrassenUseBiasDirectly = false;
|
||||||
// Padding to align of 4
|
// Padding to align of 4
|
||||||
bdestAlloc = bufferAlloc->alloc(UP_DIV(biasLength, core->pack) * core->pack * core->bytes);
|
bdestAlloc = bufferAlloc->alloc(UP_DIV(biasLength, core->pack) * core->pack * core->bytes);
|
||||||
if (bdestAlloc.first == nullptr) {
|
bdestNeedFree = true;
|
||||||
|
if (bdestAlloc.invalid()) {
|
||||||
return OUT_OF_MEMORY;
|
return OUT_OF_MEMORY;
|
||||||
}
|
}
|
||||||
auto bdest = (float*)((uint8_t*)bdestAlloc.first + bdestAlloc.second);
|
|
||||||
mPreFunctions.emplace_back(std::make_pair(
|
mPreFunctions.emplace_back(std::make_pair(
|
||||||
[biasLength, bdest, core](int tId, const float* APtr, const float* BPtr, const float* borigin) {
|
[biasLength, bdestAlloc, core](int tId, const float* APtr, const float* BPtr, const float* borigin) {
|
||||||
::memset(bdest, 0, UP_DIV(biasLength, core->pack) * core->bytes * core->pack);
|
::memset(bdestAlloc.ptr(), 0, UP_DIV(biasLength, core->pack) * core->bytes * core->pack);
|
||||||
::memcpy(bdest, borigin, biasLength * core->bytes);
|
::memcpy(bdestAlloc.ptr(), borigin, biasLength * core->bytes);
|
||||||
}, 1));
|
}, 1));
|
||||||
biasPtr = (uint8_t*)bdest;
|
|
||||||
} else {
|
} else {
|
||||||
mStrassenUseBiasDirectly = true;
|
mStrassenUseBiasDirectly = true;
|
||||||
biasPtr = bias->host<uint8_t>();
|
if (TensorUtils::getDescribe(bias)->mem.get()) {
|
||||||
|
bdestAlloc = TensorUtils::getDescribe(bias)->mem->chunk();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
postParameters = {
|
postParameters = {
|
||||||
1.0f,
|
1.0f,
|
||||||
|
@ -157,29 +155,29 @@ ErrorCode CPUMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vec
|
||||||
std::numeric_limits<float>().max(),
|
std::numeric_limits<float>().max(),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
auto code = mComputer->onEncode(e, l, h, e * core->pack, UP_DIV(l, lP) * lP * hP, e * core->pack, ATPtr, BTPtr, CTPtr, useBias, biasPtr, postParameters);
|
auto code = mComputer->onEncode(e, l, h, e * core->pack, UP_DIV(l, lP) * lP * hP, e * core->pack, ATPtrAlloc, BTPtrAlloc, CTPtrAlloc, useBias, bdestAlloc, postParameters);
|
||||||
if (NO_ERROR != code) {
|
if (NO_ERROR != code) {
|
||||||
return code;
|
return code;
|
||||||
}
|
}
|
||||||
if (bdestAlloc.first != nullptr) {
|
if (bdestNeedFree) {
|
||||||
bufferAlloc->free(bdestAlloc);
|
bufferAlloc->free(bdestAlloc);
|
||||||
}
|
}
|
||||||
// hC4, e, 4 -> e, h
|
// hC4, e, 4 -> e, h
|
||||||
if (mTransposeC) {
|
if (mTransposeC) {
|
||||||
mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, core](
|
mPostFunctions.emplace_back(std::make_pair([CTPtrAlloc, e, h, core](
|
||||||
int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) {
|
int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) {
|
||||||
int offset[] = {
|
int offset[] = {
|
||||||
e, e
|
e, e
|
||||||
};
|
};
|
||||||
core->MNNUnpackCUnitTranspose(CPtr, (float*)CTPtr, e, h, offset);
|
core->MNNUnpackCUnitTranspose(CPtr, (float*)CTPtrAlloc.ptr(), e, h, offset);
|
||||||
}, 1));
|
}, 1));
|
||||||
} else {
|
} else {
|
||||||
mPostFunctions.emplace_back(std::make_pair([CTPtr, e, h, core](
|
mPostFunctions.emplace_back(std::make_pair([CTPtrAlloc, e, h, core](
|
||||||
int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) {
|
int tId, const float* APtr, const float* BPtr, const float* biasPtr, float* CPtr) {
|
||||||
int offset[] = {
|
int offset[] = {
|
||||||
e, e
|
e, e
|
||||||
};
|
};
|
||||||
core->MNNUnpackCUnit(CPtr, (float*)CTPtr, e, h, offset);
|
core->MNNUnpackCUnit(CPtr, (float*)CTPtrAlloc.ptr(), e, h, offset);
|
||||||
}, 1));
|
}, 1));
|
||||||
}
|
}
|
||||||
bufferAlloc->free(ATPtrAlloc);
|
bufferAlloc->free(ATPtrAlloc);
|
||||||
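The hunk above moves CPUMatMul from capturing raw (pointer, offset) pairs to capturing MemChunk handles and calling ptr() inside the pre/post lambdas, so the address is resolved when the lambda actually runs rather than at resize time. A minimal sketch of that idea under an assumed, simplified chunk type; FakeChunk below is a stand-in, not the real MemChunk from core/BufferAllocator.hpp.

#include <cstdint>
#include <cstddef>
#include <functional>
#include <vector>

// Simplified stand-in for a relocatable-pool handle: the base pointer is read
// through the pool every time ptr() is called, so a lambda that captured the
// chunk by value keeps working after the pool is rebased.
struct FakeChunk {
    uint8_t** poolBase = nullptr; // owned by the allocator
    size_t offset = 0;
    uint8_t* ptr() const { return *poolBase + offset; }
    bool invalid() const { return poolBase == nullptr; }
};

int main() {
    std::vector<uint8_t> pool(1024);
    uint8_t* base = pool.data();
    FakeChunk chunk{&base, 128};

    // Capture the chunk (not a raw pointer), the way the pre-functions above do.
    std::function<void()> preFunc = [chunk]() { chunk.ptr()[0] = 42; };

    pool.resize(4096);  // the allocator may reallocate its storage...
    base = pool.data(); // ...and only has to update the shared base.
    preFunc();          // still writes into the live buffer
    return 0;
}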
|
|
|
@ -55,8 +55,6 @@ public:
|
||||||
padWidth = padHeight = 0;
|
padWidth = padHeight = 0;
|
||||||
}
|
}
|
||||||
auto totalDepth = input->batch() * UP_DIV(input->channel(), core->pack);
|
auto totalDepth = input->batch() * UP_DIV(input->channel(), core->pack);
|
||||||
auto inputData = input->host<uint8_t>();
|
|
||||||
auto outputData = output->host<uint8_t>();
|
|
||||||
auto inputPlaneStride = core->pack * input->width() * input->height();
|
auto inputPlaneStride = core->pack * input->width() * input->height();
|
||||||
auto outputPlaneStride = core->pack * output->width() * output->height();
|
auto outputPlaneStride = core->pack * output->width() * output->height();
|
||||||
int threadNumber = ((CPUBackend *)backend())->threadNumber();
|
int threadNumber = ((CPUBackend *)backend())->threadNumber();
|
||||||
|
@ -67,6 +65,8 @@ public:
|
||||||
}
|
}
|
||||||
mFunction = std::make_pair(threadNumber, [=](int tId) {
|
mFunction = std::make_pair(threadNumber, [=](int tId) {
|
||||||
for (int channel = (int)tId; channel < totalDepth; channel += threadNumber) {
|
for (int channel = (int)tId; channel < totalDepth; channel += threadNumber) {
|
||||||
|
auto inputData = input->host<uint8_t>();
|
||||||
|
auto outputData = output->host<uint8_t>();
|
||||||
// run
|
// run
|
||||||
mCompute(inputData + channel * inputPlaneStride * mBytes, input->width(), input->height(),
|
mCompute(inputData + channel * inputPlaneStride * mBytes, input->width(), input->height(),
|
||||||
outputData + outputPlaneStride * channel * mBytes, output->width(), output->height(), kernelWidth,
|
outputData + outputPlaneStride * channel * mBytes, output->width(), output->height(), kernelWidth,
|
||||||
|
|
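The pooling change above follows the same pattern: input->host<uint8_t>() and output->host<uint8_t>() are now read inside the per-thread lambda at execute time instead of being baked in at resize time, so the kernel still targets the right memory if the dynamic allocator remaps tensor storage between onResize and onExecute. That motivation is inferred from the surrounding MemChunk changes in this commit, not stated in the hunk itself.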
|
@ -11,6 +11,7 @@
|
||||||
#include "backend/cpu/CPUBackend.hpp"
|
#include "backend/cpu/CPUBackend.hpp"
|
||||||
#include "core/Concurrency.h"
|
#include "core/Concurrency.h"
|
||||||
#include "CPUTensorConvert.hpp"
|
#include "CPUTensorConvert.hpp"
|
||||||
|
#include "core/TensorUtils.hpp"
|
||||||
//#define MNN_OPEN_TIME_TRACE
|
//#define MNN_OPEN_TIME_TRACE
|
||||||
#include <MNN/AutoTime.hpp>
|
#include <MNN/AutoTime.hpp>
|
||||||
namespace MNN {
|
namespace MNN {
|
||||||
|
@ -101,131 +102,129 @@ static void pickBoxes(const std::vector<score_box_t> &boxes, std::vector<long> &
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorCode CPUProposal::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
ErrorCode CPUProposal::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||||
// score transform space
|
auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
|
||||||
auto &score = inputs[0];
|
mScoreBuffer = bufferAlloc->alloc(TensorUtils::getRawSize(inputs[0]) * inputs[0]->getType().bytes());
|
||||||
memcpy(mScore.buffer().dim, score->buffer().dim, sizeof(halide_dimension_t) * score->buffer().dimensions);
|
if (mScoreBuffer.invalid()) {
|
||||||
backend()->onAcquireBuffer(&mScore, Backend::DYNAMIC);
|
return OUT_OF_MEMORY;
|
||||||
|
}
|
||||||
// release temp buffer space
|
// release temp buffer space
|
||||||
backend()->onReleaseBuffer(&mScore, Backend::DYNAMIC);
|
bufferAlloc->free(mScoreBuffer);
|
||||||
|
return NO_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
auto &imInfo = inputs[2];
|
ErrorCode CPUProposal::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||||
|
// score transform space
|
||||||
|
auto score = inputs[0];
|
||||||
|
auto boxes = inputs[1];
|
||||||
|
auto imInfo = inputs[2];
|
||||||
auto featStride = mProposal->featStride();
|
auto featStride = mProposal->featStride();
|
||||||
auto preNmsTopN = mProposal->preNmsTopN();
|
auto preNmsTopN = mProposal->preNmsTopN();
|
||||||
auto nmsThreshold = mProposal->nmsThreshold();
|
auto nmsThreshold = mProposal->nmsThreshold();
|
||||||
auto afterNmsTopN = mProposal->afterNmsTopN();
|
auto afterNmsTopN = mProposal->afterNmsTopN();
|
||||||
auto minSize = mProposal->minSize();
|
auto minSize = mProposal->minSize();
|
||||||
|
|
||||||
auto boxes = inputs[1];
|
float* tmpScorePtr = (float*)mScoreBuffer.ptr();
|
||||||
|
// download
|
||||||
|
MNNUnpackC4Origin(tmpScorePtr, score->host<float>(), score->width() * score->height(), score->channel(), score->width() * score->height());
|
||||||
|
|
||||||
mRun = [=]() {
|
auto scrWidth = score->width(), scrHeight = score->height(), scrSize = scrWidth * scrHeight;
|
||||||
// download
|
auto boxWidth = boxes->width(), boxHeight = boxes->height(), boxSize = boxWidth * boxHeight;
|
||||||
MNNUnpackC4Origin(mScore.host<float>(), score->host<float>(), score->width() * score->height(), score->channel(), score->width() * score->height());
|
auto imH = imInfo->host<float>()[0]; // NC/4HW4
|
||||||
|
auto imW = imInfo->host<float>()[1]; // NC/4HW4
|
||||||
|
|
||||||
auto scrWidth = score->width(), scrHeight = score->height(), scrSize = scrWidth * scrHeight;
|
// generate proposals from box deltas and shifted anchors
|
||||||
auto boxWidth = boxes->width(), boxHeight = boxes->height(), boxSize = boxWidth * boxHeight;
|
// remove predicted boxes with either height or width < threshold
|
||||||
auto imH = imInfo->host<float>()[0]; // NC/4HW4
|
auto anchorWidth = 4;
|
||||||
auto imW = imInfo->host<float>()[1]; // NC/4HW4
|
auto anchorHeight = mAnchors.size() / 4;
|
||||||
|
std::vector<score_box_t> proposalBoxes;
|
||||||
|
float imScale = imInfo->host<float>()[2]; // NC/4HW4
|
||||||
|
float minBoxSize = minSize * imScale;
|
||||||
|
proposalBoxes.reserve(boxSize * anchorHeight);
|
||||||
|
|
||||||
// generate proposals from box deltas and shifted anchors
|
{
|
||||||
// remove predicted boxes with either height or width < threshold
|
for (int ah = 0; ah < anchorHeight; ++ah) {
|
||||||
auto anchorWidth = 4;
|
auto boxPtr = boxes->host<float>() + ah * 4 * boxSize;
|
||||||
auto anchorHeight = mAnchors.size() / 4;
|
auto scorePtr = tmpScorePtr + (ah + anchorHeight) * scrSize;
|
||||||
std::vector<score_box_t> proposalBoxes;
|
|
||||||
float imScale = imInfo->host<float>()[2]; // NC/4HW4
|
|
||||||
float minBoxSize = minSize * imScale;
|
|
||||||
proposalBoxes.reserve(boxSize * anchorHeight);
|
|
||||||
|
|
||||||
{
|
// shifted anchor
|
||||||
for (int ah = 0; ah < anchorHeight; ++ah) {
|
const auto anchor = mAnchors.get() + ah * anchorWidth;
|
||||||
auto boxPtr = boxes->host<float>() + ah * 4 * boxSize;
|
float anchorY = anchor[1];
|
||||||
auto scorePtr = mScore.host<float>() + (ah + anchorHeight) * scrSize;
|
float anchorW = anchor[2] - anchor[0];
|
||||||
|
float anchorH = anchor[3] - anchor[1];
|
||||||
|
|
||||||
// shifted anchor
|
for (int sh = 0; sh < scrHeight; sh++) {
|
||||||
const auto anchor = mAnchors.get() + ah * anchorWidth;
|
float anchorX = anchor[0];
|
||||||
float anchorY = anchor[1];
|
auto boxPtrH = boxPtr + sh * 4 * boxWidth;
|
||||||
float anchorW = anchor[2] - anchor[0];
|
|
||||||
float anchorH = anchor[3] - anchor[1];
|
|
||||||
|
|
||||||
for (int sh = 0; sh < scrHeight; sh++) {
|
for (int sw = 0; sw < scrWidth; sw++) {
|
||||||
float anchorX = anchor[0];
|
auto box = boxPtrH + 4 * sw;
|
||||||
auto boxPtrH = boxPtr + sh * 4 * boxWidth;
|
// apply center size
|
||||||
|
float cx = anchorX + anchorW * 0.5f + anchorW * box[0];
|
||||||
|
float cy = anchorY + anchorH * 0.5f + anchorH * box[1];
|
||||||
|
float w = anchorW * exp(box[2]);
|
||||||
|
float h = anchorH * exp(box[3]);
|
||||||
|
|
||||||
for (int sw = 0; sw < scrWidth; sw++) {
|
float minX = std::max(std::min(cx - w * 0.5f, imW - 1), 0.f);
|
||||||
auto box = boxPtrH + 4 * sw;
|
float minY = std::max(std::min(cy - h * 0.5f, imH - 1), 0.f);
|
||||||
// apply center size
|
float maxX = std::max(std::min(cx + w * 0.5f, imW - 1), 0.f);
|
||||||
float cx = anchorX + anchorW * 0.5f + anchorW * box[0];
|
float maxY = std::max(std::min(cy + h * 0.5f, imH - 1), 0.f);
|
||||||
float cy = anchorY + anchorH * 0.5f + anchorH * box[1];
|
if (maxX - minX + 1 >= minBoxSize && maxY - minY + 1 >= minBoxSize) {
|
||||||
float w = anchorW * exp(box[2]);
|
proposalBoxes.emplace_back(box_rect(minX, minY, maxX, maxY, scorePtr[sh * scrWidth + sw]));
|
||||||
float h = anchorH * exp(box[3]);
|
|
||||||
|
|
||||||
float minX = std::max(std::min(cx - w * 0.5f, imW - 1), 0.f);
|
|
||||||
float minY = std::max(std::min(cy - h * 0.5f, imH - 1), 0.f);
|
|
||||||
float maxX = std::max(std::min(cx + w * 0.5f, imW - 1), 0.f);
|
|
||||||
float maxY = std::max(std::min(cy + h * 0.5f, imH - 1), 0.f);
|
|
||||||
if (maxX - minX + 1 >= minBoxSize && maxY - minY + 1 >= minBoxSize) {
|
|
||||||
proposalBoxes.emplace_back(box_rect(minX, minY, maxX, maxY, scorePtr[sh * scrWidth + sw]));
|
|
||||||
}
|
|
||||||
anchorX += featStride;
|
|
||||||
}
|
}
|
||||||
anchorY += featStride;
|
anchorX += featStride;
|
||||||
}
|
}
|
||||||
|
anchorY += featStride;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
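The nested loop above decodes every 4-value box delta against a shifted anchor in center-size form and clips the result to the image rectangle. The same arithmetic with the scanning indices stripped out, for reference; DecodeAnchor is an illustrative helper, not part of MNN.

#include <algorithm>
#include <cmath>

struct DecodedBox { float xmin, ymin, xmax, ymax; };

// Center-size decoding used by the proposal loop above: the anchor gives the
// base center/size, the deltas shift the center and scale the size, and the
// result is clipped to the image rectangle.
static DecodedBox DecodeAnchor(float anchorX, float anchorY, float anchorW, float anchorH,
                               const float box[4], float imW, float imH) {
    float cx = anchorX + anchorW * 0.5f + anchorW * box[0];
    float cy = anchorY + anchorH * 0.5f + anchorH * box[1];
    float w  = anchorW * std::exp(box[2]);
    float h  = anchorH * std::exp(box[3]);
    DecodedBox r;
    r.xmin = std::max(std::min(cx - w * 0.5f, imW - 1), 0.f);
    r.ymin = std::max(std::min(cy - h * 0.5f, imH - 1), 0.f);
    r.xmax = std::max(std::min(cx + w * 0.5f, imW - 1), 0.f);
    r.ymax = std::max(std::min(cy + h * 0.5f, imH - 1), 0.f);
    return r;
}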
|
|
||||||
{
|
{
|
||||||
// sort all (proposal, score) pairs by score from highest to lowest
|
// sort all (proposal, score) pairs by score from highest to lowest
|
||||||
// take top preNmsTopN
|
// take top preNmsTopN
|
||||||
auto compareFunction = [](const score_box_t &a, const score_box_t &b) {
|
auto compareFunction = [](const score_box_t &a, const score_box_t &b) {
|
||||||
return box_score(a) > box_score(b);
|
return box_score(a) > box_score(b);
|
||||||
};
|
};
|
||||||
if (0 < preNmsTopN && preNmsTopN < (int)proposalBoxes.size()) {
|
if (0 < preNmsTopN && preNmsTopN < (int)proposalBoxes.size()) {
|
||||||
std::partial_sort(proposalBoxes.begin(), proposalBoxes.begin() + preNmsTopN, proposalBoxes.end(),
|
std::partial_sort(proposalBoxes.begin(), proposalBoxes.begin() + preNmsTopN, proposalBoxes.end(),
|
||||||
compareFunction);
|
compareFunction);
|
||||||
proposalBoxes.resize(preNmsTopN);
|
proposalBoxes.resize(preNmsTopN);
|
||||||
} else {
|
} else {
|
||||||
std::sort(proposalBoxes.begin(), proposalBoxes.end(), compareFunction);
|
std::sort(proposalBoxes.begin(), proposalBoxes.end(), compareFunction);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// apply nms with nmsThreshold
|
// apply nms with nmsThreshold
|
||||||
// take afterNmsTopN
|
// take afterNmsTopN
|
||||||
std::vector<long> picked;
|
std::vector<long> picked;
|
||||||
picked.reserve(afterNmsTopN);
|
picked.reserve(afterNmsTopN);
|
||||||
{
|
{
|
||||||
pickBoxes(proposalBoxes, picked, nmsThreshold, afterNmsTopN);
|
pickBoxes(proposalBoxes, picked, nmsThreshold, afterNmsTopN);
|
||||||
|
}
|
||||||
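pickBoxes, defined earlier in CPUProposal.cpp and untouched by this hunk, applies greedy non-maximum suppression to the score-sorted list. A minimal sketch of that scheme, assuming an IoU overlap test; the exact overlap formula pickBoxes uses is not visible in this diff.

#include <algorithm>
#include <vector>

struct Box { float xmin, ymin, xmax, ymax, score; };

static float IoU(const Box& a, const Box& b) {
    float ix = std::max(0.f, std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin) + 1);
    float iy = std::max(0.f, std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin) + 1);
    float inter = ix * iy;
    float areaA = (a.xmax - a.xmin + 1) * (a.ymax - a.ymin + 1);
    float areaB = (b.xmax - b.xmin + 1) * (b.ymax - b.ymin + 1);
    return inter / (areaA + areaB - inter);
}

// Greedy NMS over boxes already sorted by descending score: keep a box only if
// it does not overlap a previously kept box beyond the threshold, and stop once
// `maxPicked` boxes have been accepted.
static std::vector<long> GreedyNms(const std::vector<Box>& sorted, float threshold, int maxPicked) {
    std::vector<long> picked;
    for (long i = 0; i < (long)sorted.size() && (int)picked.size() < maxPicked; ++i) {
        bool keep = true;
        for (long j : picked) {
            if (IoU(sorted[i], sorted[j]) > threshold) { keep = false; break; }
        }
        if (keep) {
            picked.push_back(i);
        }
    }
    return picked;
}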
|
|
||||||
|
int pickedCount = std::min((int)picked.size(), afterNmsTopN);
|
||||||
|
|
||||||
|
// return the top proposals
|
||||||
|
int roiStep = outputs[0]->buffer().dim[0].stride, scoreStep = 0;
|
||||||
|
auto roiPtr = outputs[0]->host<float>(), scoresPtr = (float *)NULL;
|
||||||
|
memset(roiPtr, 0, outputs[0]->size());
|
||||||
|
|
||||||
|
if (outputs.size() > 1) {
|
||||||
|
scoreStep = outputs[1]->buffer().dim[0].stride;
|
||||||
|
scoresPtr = outputs[1]->host<float>();
|
||||||
|
memset(scoresPtr, 0, outputs[1]->size());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < pickedCount; i++, scoresPtr += scoreStep) {
|
||||||
|
auto box = proposalBoxes[picked[i]];
|
||||||
|
roiPtr[i * 4 + 0] = 0;
|
||||||
|
roiPtr[i * 4 + 1] = box_rect_xmin(box);
|
||||||
|
roiPtr[i * 4 + 2] = box_rect_ymin(box);
|
||||||
|
roiPtr[i * 4 + 3] = box_rect_xmax(box);
|
||||||
|
roiPtr[i * 4 + outputs[0]->length(0) * 4] = box_rect_ymax(box);
|
||||||
|
if (scoresPtr) {
|
||||||
|
scoresPtr[0] = box_score(box);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
int pickedCount = std::min((int)picked.size(), afterNmsTopN);
|
|
||||||
|
|
||||||
// return the top proposals
|
|
||||||
int roiStep = outputs[0]->buffer().dim[0].stride, scoreStep = 0;
|
|
||||||
auto roiPtr = outputs[0]->host<float>(), scoresPtr = (float *)NULL;
|
|
||||||
memset(roiPtr, 0, outputs[0]->size());
|
|
||||||
|
|
||||||
if (outputs.size() > 1) {
|
|
||||||
scoreStep = outputs[1]->buffer().dim[0].stride;
|
|
||||||
scoresPtr = outputs[1]->host<float>();
|
|
||||||
memset(scoresPtr, 0, outputs[1]->size());
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < pickedCount; i++, scoresPtr += scoreStep) {
|
|
||||||
auto box = proposalBoxes[picked[i]];
|
|
||||||
roiPtr[i * 4 + 0] = 0;
|
|
||||||
roiPtr[i * 4 + 1] = box_rect_xmin(box);
|
|
||||||
roiPtr[i * 4 + 2] = box_rect_ymin(box);
|
|
||||||
roiPtr[i * 4 + 3] = box_rect_xmax(box);
|
|
||||||
roiPtr[i * 4 + outputs[0]->length(0) * 4] = box_rect_ymax(box);
|
|
||||||
if (scoresPtr) {
|
|
||||||
scoresPtr[0] = box_score(box);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
return NO_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
ErrorCode CPUProposal::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
|
||||||
mRun();
|
|
||||||
return NO_ERROR;
|
return NO_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,7 @@
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include "core/AutoStorage.h"
|
#include "core/AutoStorage.h"
|
||||||
#include "core/Execution.hpp"
|
#include "core/Execution.hpp"
|
||||||
|
#include "core/BufferAllocator.hpp"
|
||||||
#include "MNN_generated.h"
|
#include "MNN_generated.h"
|
||||||
|
|
||||||
namespace MNN {
|
namespace MNN {
|
||||||
|
@ -26,8 +27,7 @@ public:
|
||||||
private:
|
private:
|
||||||
const Proposal *mProposal;
|
const Proposal *mProposal;
|
||||||
AutoStorage<float> mAnchors;
|
AutoStorage<float> mAnchors;
|
||||||
Tensor mScore;
|
MemChunk mScoreBuffer;
|
||||||
std::function<void()> mRun;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace MNN
|
} // namespace MNN
|
||||||
|
|
|
@ -68,7 +68,7 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
|
||||||
}
|
}
|
||||||
Tensor::InsideDescribe::Region newRegion;
|
Tensor::InsideDescribe::Region newRegion;
|
||||||
OpCommonUtils::turnToPackRegion(slice, newRegion, output, core->pack, true);
|
OpCommonUtils::turnToPackRegion(slice, newRegion, output, core->pack, true);
|
||||||
mFastBlit.emplace_back(std::make_pair(slice.origin->host<void>(), std::move(newRegion)));
|
mFastBlit.emplace_back(std::make_pair(slice.origin, std::move(newRegion)));
|
||||||
}
|
}
|
||||||
return NO_ERROR;
|
return NO_ERROR;
|
||||||
}
|
}
|
||||||
|
@ -98,12 +98,12 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
|
||||||
for (int i=0; i< des->regions.size(); ++i) {
|
for (int i=0; i< des->regions.size(); ++i) {
|
||||||
auto& slice = des->regions[i];
|
auto& slice = des->regions[i];
|
||||||
auto origin = slice.origin;
|
auto origin = slice.origin;
|
||||||
if (nullptr == origin || nullptr == origin->host<void>()) {
|
if (nullptr == origin /*|| nullptr == origin->host<void>()*/) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// if tensor is not NC4HW4 or has been merged, don't need deal
// if the tensor is not NC4HW4 or has already been merged, no conversion is needed
|
// if the tensor is not NC4HW4 or has already been merged, no conversion is needed
|
// if tensor is not NC4HW4 or has been merged, don't need deal
|
||||||
if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
|
if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
|
||||||
mTempInputCopy.emplace_back(std::make_pair(origin->host<void>(), &slice));
|
mTempInputCopy.emplace_back(std::make_pair(origin, &slice));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// if NC4HW4's C%4 == 0, change convert to transpose and fuse it
|
// if NC4HW4's C%4 == 0, change convert to transpose and fuse it
|
||||||
|
@ -132,12 +132,13 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
|
||||||
bool merge = TensorUtils::fuseRegion(regionTmp, *newSlice);
|
bool merge = TensorUtils::fuseRegion(regionTmp, *newSlice);
|
||||||
if (merge) {
|
if (merge) {
|
||||||
// cache the merged tensor
|
// cache the merged tensor
|
||||||
mTempInputCopy.emplace_back(std::make_pair(origin->host<void>(), newSlice.get()));
|
mTempInputCopy.emplace_back(std::make_pair(origin, newSlice.get()));
|
||||||
mCacheRegions.emplace_back(newSlice);
|
mCacheRegions.emplace_back(newSlice);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
auto cache = static_cast<CPUBackend*>(backend())->getCache();
|
auto cache = static_cast<CPUBackend*>(backend())->getCache();
|
||||||
|
#if 1
|
||||||
auto tempTensor = cache->findCacheTensor(origin, midFormat);
|
auto tempTensor = cache->findCacheTensor(origin, midFormat);
|
||||||
//MNN_ASSERT(CPUBackend::getBytes(backend(), origin) == 4);
|
//MNN_ASSERT(CPUBackend::getBytes(backend(), origin) == 4);
|
||||||
if (nullptr == tempTensor) {
|
if (nullptr == tempTensor) {
|
||||||
|
@ -159,7 +160,23 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
|
||||||
if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) {
|
if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) {
|
||||||
forRelease.emplace_back(tempTensor);
|
forRelease.emplace_back(tempTensor);
|
||||||
}
|
}
|
||||||
mTempInputCopy.emplace_back(std::make_pair(tempTensor->host<void>(), &slice));
|
#else
|
||||||
|
std::shared_ptr<Tensor> newTensor(new Tensor);
|
||||||
|
TensorUtils::copyShape(origin, newTensor.get());
|
||||||
|
TensorUtils::getDescribe(newTensor.get())->dimensionFormat = midFormat;
|
||||||
|
TensorUtils::getDescribe(newTensor.get())->quantAttr = TensorUtils::getDescribe(origin)->quantAttr;
|
||||||
|
newTensor->buffer().type = origin->getType();
|
||||||
|
TensorUtils::setLinearLayout(newTensor.get());
|
||||||
|
mTempInput.insert(std::make_pair(origin, newTensor.get()));
|
||||||
|
auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC);
|
||||||
|
if (!res) {
|
||||||
|
return OUT_OF_MEMORY;
|
||||||
|
}
|
||||||
|
auto tempTensor = newTensor.get();
|
||||||
|
backend()->onReleaseBuffer(tempTensor, Backend::DYNAMIC);
|
||||||
|
cache->pushCacheTensor(newTensor, origin, midFormat);
|
||||||
|
#endif
|
||||||
|
mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice));
|
||||||
}
|
}
|
||||||
for (auto t : forRelease) {
|
for (auto t : forRelease) {
|
||||||
backend()->onReleaseBuffer(t, Backend::DYNAMIC);
|
backend()->onReleaseBuffer(t, Backend::DYNAMIC);
|
||||||
|
@ -175,7 +192,7 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
|
||||||
if (region->size[0] * region->size[1] * region->size[2] < thredHold) {
|
if (region->size[0] * region->size[1] * region->size[2] < thredHold) {
|
||||||
return NO_ERROR;
|
return NO_ERROR;
|
||||||
}
|
}
|
||||||
auto ptr = mTempInputCopy[0].first;
|
auto tensorPtr = mTempInputCopy[0].first;
|
||||||
int pos = -1;
|
int pos = -1;
|
||||||
for (int i=0; i<3; ++i) {
|
for (int i=0; i<3; ++i) {
|
||||||
if (region->size[i] > 1) {
|
if (region->size[i] > 1) {
|
||||||
|
@ -212,7 +229,7 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
|
||||||
for (int v=pos+1; v<3; ++v) {
|
for (int v=pos+1; v<3; ++v) {
|
||||||
cacheReg.size[v] = region->size[v];
|
cacheReg.size[v] = region->size[v];
|
||||||
}
|
}
|
||||||
mTempInputCopy.emplace_back(std::make_pair(ptr, cacheRegPtr.get()));
|
mTempInputCopy.emplace_back(std::make_pair(tensorPtr, cacheRegPtr.get()));
|
||||||
mCacheRegions.emplace_back(cacheRegPtr);
|
mCacheRegions.emplace_back(cacheRegPtr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -318,7 +335,7 @@ void CPURaster::executeFaster(const std::vector<Tensor *> &inputs, const std::ve
|
||||||
auto& iter = mFastBlit[u];
|
auto& iter = mFastBlit[u];
|
||||||
auto& slice = iter.second;
|
auto& slice = iter.second;
|
||||||
//Offset use byte
//Offsets are counted in bytes
|
//Offsets are counted in bytes
|
//Offset use byte
|
||||||
auto srcPtr = (uint8_t*)iter.first + slice.src.offset * bytes;
|
auto srcPtr = iter.first->host<uint8_t>() + slice.src.offset * bytes;
|
||||||
auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes;
|
auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes;
|
||||||
if (slice.src.stride[1] == slice.size[2] && slice.dst.stride[1] == slice.size[2] && slice.src.stride[2] == 1) {
|
if (slice.src.stride[1] == slice.size[2] && slice.dst.stride[1] == slice.size[2] && slice.src.stride[2] == 1) {
|
||||||
for (int z=0; z<slice.size[0]; ++z) {
|
for (int z=0; z<slice.size[0]; ++z) {
|
||||||
|
@ -543,6 +560,11 @@ void CPURaster::tensorConvert(Tensor* input, Tensor* output, int bytes) {
|
||||||
|
|
||||||
|
|
||||||
ErrorCode CPURaster::onExecute(const std::vector<Tensor *> &____inputs, const std::vector<Tensor *> &outputs) {
|
ErrorCode CPURaster::onExecute(const std::vector<Tensor *> &____inputs, const std::vector<Tensor *> &outputs) {
|
||||||
|
if (nullptr != mTempOutput) {
|
||||||
|
mOutputPtr = mTempOutput->host<void>();
|
||||||
|
} else {
|
||||||
|
mOutputPtr = outputs[0]->host<void>();
|
||||||
|
}
|
||||||
if (mFast) {
|
if (mFast) {
|
||||||
executeFaster(____inputs, outputs);
|
executeFaster(____inputs, outputs);
|
||||||
return NO_ERROR;
|
return NO_ERROR;
|
||||||
|
@ -607,7 +629,7 @@ ErrorCode CPURaster::onExecute(const std::vector<Tensor *> &____inputs, const st
|
||||||
for (int u=tId; u<mTempInputCopy.size(); u+=threadNum) {
|
for (int u=tId; u<mTempInputCopy.size(); u+=threadNum) {
|
||||||
auto& iter = mTempInputCopy[u];
|
auto& iter = mTempInputCopy[u];
|
||||||
auto& slice = *(iter.second);
|
auto& slice = *(iter.second);
|
||||||
auto srcPtr = (uint8_t*)iter.first + slice.src.offset * bytes;
|
auto srcPtr = iter.first->host<uint8_t>() + slice.src.offset * bytes;
|
||||||
auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes;
|
auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes;
|
||||||
_blit(slice, bytes, srcPtr, dstPtr, proc);
|
_blit(slice, bytes, srcPtr, dstPtr, proc);
|
||||||
}
|
}
|
||||||
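Each mTempInputCopy entry now pairs the source Tensor with its Region, and _blit (not shown in this hunk) copies a three-level strided view from source to destination. A scalar sketch of such a copy, assuming Region-style size/stride semantics with the byte offsets already applied by the caller as above; StridedBlit is illustrative, not the MNN routine.

#include <cstdint>
#include <cstring>

// Three-level strided copy in the spirit of _blit: size[] counts elements per
// level, srcStride/dstStride[] are element strides, and `bytes` is the element
// size. The fast path in executeFaster collapses the inner copy into one memcpy
// when the innermost stride is 1 and rows are contiguous.
static void StridedBlit(uint8_t* dst, const uint8_t* src, const int size[3],
                        const int srcStride[3], const int dstStride[3], int bytes) {
    for (int z = 0; z < size[0]; ++z) {
        for (int y = 0; y < size[1]; ++y) {
            const uint8_t* s = src + (z * srcStride[0] + y * srcStride[1]) * bytes;
            uint8_t* d = dst + (z * dstStride[0] + y * dstStride[1]) * bytes;
            for (int x = 0; x < size[2]; ++x) {
                ::memcpy(d + x * dstStride[2] * bytes, s + x * srcStride[2] * bytes, bytes);
            }
        }
    }
}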
|
@ -752,13 +774,12 @@ public:
|
||||||
}
|
}
|
||||||
auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
|
auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
|
||||||
if (mMaxCacheSize > 0 || mMaxFuseBufferSize > 0) {
|
if (mMaxCacheSize > 0 || mMaxFuseBufferSize > 0) {
|
||||||
auto buffer = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(threadNumber * (mMaxCacheSize + mMaxFuseBufferSize));
|
mCacheBuffer = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(threadNumber * (mMaxCacheSize + mMaxFuseBufferSize));
|
||||||
if (nullptr == buffer.first) {
|
if (mCacheBuffer.invalid()) {
|
||||||
return OUT_OF_MEMORY;
|
return OUT_OF_MEMORY;
|
||||||
}
|
}
|
||||||
mCacheBuffer = (uint8_t*)buffer.first + buffer.second;
|
|
||||||
mFuseBuffer = mCacheBuffer + threadNumber * mMaxCacheSize;
|
mFuseBuffer = mCacheBuffer + threadNumber * mMaxCacheSize;
|
||||||
static_cast<CPUBackend*>(backend())->getBufferAllocator()->free(buffer);
|
static_cast<CPUBackend*>(backend())->getBufferAllocator()->free(mCacheBuffer);
|
||||||
}
|
}
|
||||||
return NO_ERROR;
|
return NO_ERROR;
|
||||||
}
|
}
|
||||||
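Allocating the cache/fuse scratch in onResize and freeing it on the very next line looks odd at first sight; it appears to be the usual MNN dynamic-allocator idiom, where the free only returns the chunk to the resize-time memory plan for reuse by later ops while the MemChunk kept in mCacheBuffer stays addressable through ptr() during onExecute. This reading is inferred from how the allocator is used elsewhere in the commit, not stated here.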
|
@ -887,7 +908,7 @@ public:
|
||||||
auto dstOrigin = (uint8_t*)mContainer[tId].stackPtr[cmd->indexes()->data()[0]];
|
auto dstOrigin = (uint8_t*)mContainer[tId].stackPtr[cmd->indexes()->data()[0]];
|
||||||
auto dst = dstOrigin;
|
auto dst = dstOrigin;
|
||||||
if (cmd->fuse() >= 0) {
|
if (cmd->fuse() >= 0) {
|
||||||
dst = fuseBuffer;
|
dst = fuseBuffer.ptr();
|
||||||
}
|
}
|
||||||
do {
|
do {
|
||||||
if (OpType_UnaryOp == op->type()) {
|
if (OpType_UnaryOp == op->type()) {
|
||||||
|
@ -921,7 +942,7 @@ public:
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Blit to cache
|
// Blit to cache
|
||||||
auto srcCache = mCacheBuffer + mMaxCacheSize * tId;
|
auto srcCache = mCacheBuffer.ptr() + mMaxCacheSize * tId;
|
||||||
for (int z=0; z<cmd->size()->data()[0]; ++z) {
|
for (int z=0; z<cmd->size()->data()[0]; ++z) {
|
||||||
auto srcZ = src + z * cmd->view()->GetAs<View>(1)->stride()->data()[0] * bytes;
|
auto srcZ = src + z * cmd->view()->GetAs<View>(1)->stride()->data()[0] * bytes;
|
||||||
auto dstZ = dst + z * outputStride[0] * bytes;
|
auto dstZ = dst + z * outputStride[0] * bytes;
|
||||||
|
@ -978,7 +999,7 @@ public:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
auto cache0 = mCacheBuffer + mMaxCacheSize * tId;
|
auto cache0 = mCacheBuffer.ptr() + mMaxCacheSize * tId;
|
||||||
auto cache1 = cache0 + cmd->size()->data()[2] * bytes;
|
auto cache1 = cache0 + cmd->size()->data()[2] * bytes;
|
||||||
for (int z=0; z<cmd->size()->data()[0]; ++z) {
|
for (int z=0; z<cmd->size()->data()[0]; ++z) {
|
||||||
auto src0Z = src0 + z * stride1[0] * bytes;
|
auto src0Z = src0 + z * stride1[0] * bytes;
|
||||||
|
@ -1080,9 +1101,8 @@ private:
|
||||||
const LoopParam* mLoop;
|
const LoopParam* mLoop;
|
||||||
std::vector<Tensor*> mStack;
|
std::vector<Tensor*> mStack;
|
||||||
std::vector<ThreadContainer> mContainer;
|
std::vector<ThreadContainer> mContainer;
|
||||||
uint8_t* mCacheBuffer = nullptr;
|
MemChunk mCacheBuffer, mFuseBuffer;
|
||||||
int mMaxCacheSize = 0;
|
int mMaxCacheSize = 0;
|
||||||
uint8_t* mFuseBuffer = nullptr;
|
|
||||||
int mMaxFuseBufferSize = 0;
|
int mMaxFuseBufferSize = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -28,8 +28,8 @@ public:
|
||||||
void tensorConvert(Tensor* input, Tensor* output, int bytes);
|
void tensorConvert(Tensor* input, Tensor* output, int bytes);
|
||||||
private:
|
private:
|
||||||
std::map<Tensor*, Tensor*> mTempInput;
|
std::map<Tensor*, Tensor*> mTempInput;
|
||||||
std::vector<std::pair<void*, Tensor::InsideDescribe::Region*>> mTempInputCopy;
|
std::vector<std::pair<const Tensor*, Tensor::InsideDescribe::Region*>> mTempInputCopy;
|
||||||
std::vector<std::pair<void*, Tensor::InsideDescribe::Region>> mFastBlit;
|
std::vector<std::pair<const Tensor*, Tensor::InsideDescribe::Region>> mFastBlit;
|
||||||
std::shared_ptr<Tensor> mTempOutput;
|
std::shared_ptr<Tensor> mTempOutput;
|
||||||
void* mOutputPtr;
|
void* mOutputPtr;
|
||||||
bool mNeedZero = false;
|
bool mNeedZero = false;
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
#include "CPUResizeCache.hpp"
|
#include "CPUResizeCache.hpp"
|
||||||
|
#include "../../core/TensorUtils.hpp"
|
||||||
|
|
||||||
namespace MNN {
|
namespace MNN {
|
||||||
Tensor* CPUResizeCache::findCacheTensor(const Tensor* src, MNN_DATA_FORMAT format) const {
|
Tensor* CPUResizeCache::findCacheTensor(const Tensor* src, MNN_DATA_FORMAT format) const {
|
||||||
auto iter = mFormatCache.find(std::make_pair(src, format));
|
auto iter = mFormatCache.find(std::make_pair(src, format));
|
||||||
|
@ -14,5 +16,9 @@ void CPUResizeCache::pushCacheTensor(std::shared_ptr<Tensor> dst, const Tensor*
|
||||||
void CPUResizeCache::reset() {
|
void CPUResizeCache::reset() {
|
||||||
mFormatCache.clear();
|
mFormatCache.clear();
|
||||||
}
|
}
|
||||||
|
void CPUResizeCache::release() {
|
||||||
|
for (auto iter : mFormatCache) {
|
||||||
|
TensorUtils::getDescribe(iter.second.get())->mem.reset(nullptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
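The new CPUResizeCache::release() drops only the backing memory of the cached tensors (mem.reset(nullptr)) while keeping the map entries, whereas reset() clears the mapping itself. The apparent intent is to let a later resize re-acquire buffers for the same (tensor, format) keys without rebuilding the cache; this is an inference from the code, not a documented contract.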
|
|
|
@ -19,6 +19,7 @@ public:
|
||||||
// Return cache tensor
|
// Return cache tensor
|
||||||
void pushCacheTensor(std::shared_ptr<Tensor> dst, const Tensor* src, MNN_DATA_FORMAT format);
|
void pushCacheTensor(std::shared_ptr<Tensor> dst, const Tensor* src, MNN_DATA_FORMAT format);
|
||||||
void reset();
|
void reset();
|
||||||
|
void release();
|
||||||
private:
|
private:
|
||||||
std::map<std::pair<const Tensor*, MNN_DATA_FORMAT>, std::shared_ptr<Tensor>> mFormatCache;
|
std::map<std::pair<const Tensor*, MNN_DATA_FORMAT>, std::shared_ptr<Tensor>> mFormatCache;
|
||||||
};
|
};
|
||||||
|
|
|
@ -647,7 +647,7 @@ L1Loop:
|
||||||
ld1 {v4.8b}, [x1], #8 // src: k:6,7
|
ld1 {v4.8b}, [x1], #8 // src: k:6,7
|
||||||
ld1 {v4.s}[2], [x1]
|
ld1 {v4.s}[2], [x1]
|
||||||
|
|
||||||
mov v9.4s, v16.4s
|
mov v9.16b, v16.16b
|
||||||
sxtl2 v6.8h, v4.16b
|
sxtl2 v6.8h, v4.16b
|
||||||
|
|
||||||
tbl v7.16b, {v2.16b, v3.16b}, v24.16b // src0
|
tbl v7.16b, {v2.16b, v3.16b}, v24.16b // src0
|
||||||
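The mov vN.4s, vM.4s -> mov vN.16b, vM.16b rewrites here and in the hunks that follow do not change behaviour: a vector register move copies all 128 bits regardless of the arrangement, and the architectural MOV (vector) alias of ORR is only defined for the .8b/.16b byte arrangements, so the .16b spelling is the form stricter assemblers accept. This rationale is inferred from the instruction encoding, not from the commit message.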
|
|
|
@ -84,14 +84,14 @@ LoopE8:
|
||||||
sxtl2 v11.4s, v1.8h
|
sxtl2 v11.4s, v1.8h
|
||||||
scvtf v0.4s, v8.4s
|
scvtf v0.4s, v8.4s
|
||||||
scvtf v1.4s, v9.4s
|
scvtf v1.4s, v9.4s
|
||||||
mov v8.4s, v14.4s
|
mov v8.16b, v14.16b
|
||||||
mov v9.4s, v15.4s
|
mov v9.16b, v15.16b
|
||||||
fmla v8.4s, v0.4s, v12.4s
|
fmla v8.4s, v0.4s, v12.4s
|
||||||
fmla v9.4s, v1.4s, v13.4s
|
fmla v9.4s, v1.4s, v13.4s
|
||||||
scvtf v0.4s, v10.4s
|
scvtf v0.4s, v10.4s
|
||||||
scvtf v1.4s, v11.4s
|
scvtf v1.4s, v11.4s
|
||||||
mov v10.4s, v14.4s
|
mov v10.16b, v14.16b
|
||||||
mov v11.4s, v15.4s
|
mov v11.16b, v15.16b
|
||||||
fmla v10.4s, v0.4s, v12.4s
|
fmla v10.4s, v0.4s, v12.4s
|
||||||
fmla v11.4s, v1.4s, v13.4s
|
fmla v11.4s, v1.4s, v13.4s
|
||||||
ld1 {v0.4s, v1.4s}, [x15], x11
|
ld1 {v0.4s, v1.4s}, [x15], x11
|
||||||
|
@ -153,14 +153,14 @@ LoopE8:
|
||||||
sxtl2 v11.4s, v1.8h
|
sxtl2 v11.4s, v1.8h
|
||||||
scvtf v0.4s, v8.4s
|
scvtf v0.4s, v8.4s
|
||||||
scvtf v1.4s, v9.4s
|
scvtf v1.4s, v9.4s
|
||||||
mov v8.4s, v14.4s
|
mov v8.16b, v14.16b
|
||||||
mov v9.4s, v15.4s
|
mov v9.16b, v15.16b
|
||||||
fmla v8.4s, v0.4s, v12.4s
|
fmla v8.4s, v0.4s, v12.4s
|
||||||
fmla v9.4s, v1.4s, v13.4s
|
fmla v9.4s, v1.4s, v13.4s
|
||||||
scvtf v0.4s, v10.4s
|
scvtf v0.4s, v10.4s
|
||||||
scvtf v1.4s, v11.4s
|
scvtf v1.4s, v11.4s
|
||||||
mov v10.4s, v14.4s
|
mov v10.16b, v14.16b
|
||||||
mov v11.4s, v15.4s
|
mov v11.16b, v15.16b
|
||||||
fmla v10.4s, v0.4s, v12.4s
|
fmla v10.4s, v0.4s, v12.4s
|
||||||
fmla v11.4s, v1.4s, v13.4s
|
fmla v11.4s, v1.4s, v13.4s
|
||||||
ld1 {v0.4s, v1.4s}, [x15], x11
|
ld1 {v0.4s, v1.4s}, [x15], x11
|
||||||
|
@ -321,14 +321,14 @@ LoopE8:
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v14.4s
|
mov v8.16b, v14.16b
|
||||||
mov v9.4s, v14.4s
|
mov v9.16b, v14.16b
|
||||||
fmla v8.4s, v12.4s, v4.4s
|
fmla v8.4s, v12.4s, v4.4s
|
||||||
fmla v9.4s, v13.4s, v4.4s
|
fmla v9.4s, v13.4s, v4.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v14.4s
|
mov v10.16b, v14.16b
|
||||||
mov v11.4s, v14.4s
|
mov v11.16b, v14.16b
|
||||||
fmla v10.4s, v12.4s, v4.4s
|
fmla v10.4s, v12.4s, v4.4s
|
||||||
fmla v11.4s, v13.4s, v4.4s
|
fmla v11.4s, v13.4s, v4.4s
|
||||||
|
|
||||||
|
@ -405,14 +405,14 @@ LoopE8:
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v14.4s
|
mov v8.16b, v14.16b
|
||||||
mov v9.4s, v14.4s
|
mov v9.16b, v14.16b
|
||||||
fmla v8.4s, v12.4s, v4.4s
|
fmla v8.4s, v12.4s, v4.4s
|
||||||
fmla v9.4s, v13.4s, v4.4s
|
fmla v9.4s, v13.4s, v4.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v14.4s
|
mov v10.16b, v14.16b
|
||||||
mov v11.4s, v14.4s
|
mov v11.16b, v14.16b
|
||||||
fmla v10.4s, v12.4s, v4.4s
|
fmla v10.4s, v12.4s, v4.4s
|
||||||
fmla v11.4s, v13.4s, v4.4s
|
fmla v11.4s, v13.4s, v4.4s
|
||||||
|
|
||||||
|
@ -564,14 +564,14 @@ blt E1
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v26.4s
|
mov v8.16b, v26.16b
|
||||||
mov v9.4s, v27.4s
|
mov v9.16b, v27.16b
|
||||||
fmla v8.4s, v12.4s, v24.4s
|
fmla v8.4s, v12.4s, v24.4s
|
||||||
fmla v9.4s, v13.4s, v25.4s
|
fmla v9.4s, v13.4s, v25.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v26.4s
|
mov v10.16b, v26.16b
|
||||||
mov v11.4s, v27.4s
|
mov v11.16b, v27.16b
|
||||||
fmla v10.4s, v12.4s, v24.4s
|
fmla v10.4s, v12.4s, v24.4s
|
||||||
fmla v11.4s, v13.4s, v25.4s
|
fmla v11.4s, v13.4s, v25.4s
|
||||||
|
|
||||||
|
@ -616,14 +616,14 @@ blt E1
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v26.4s
|
mov v8.16b, v26.16b
|
||||||
mov v9.4s, v27.4s
|
mov v9.16b, v27.16b
|
||||||
fmla v8.4s, v12.4s, v24.4s
|
fmla v8.4s, v12.4s, v24.4s
|
||||||
fmla v9.4s, v13.4s, v25.4s
|
fmla v9.4s, v13.4s, v25.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v26.4s
|
mov v10.16b, v26.16b
|
||||||
mov v11.4s, v27.4s
|
mov v11.16b, v27.16b
|
||||||
fmla v10.4s, v12.4s, v24.4s
|
fmla v10.4s, v12.4s, v24.4s
|
||||||
fmla v11.4s, v13.4s, v25.4s
|
fmla v11.4s, v13.4s, v25.4s
|
||||||
ld1 {v0.4s}, [x15], x11
|
ld1 {v0.4s}, [x15], x11
|
||||||
|
@ -721,7 +721,7 @@ blt E1
|
||||||
mvni v9.4s, #6
|
mvni v9.4s, #6
|
||||||
add v3.4s, v3.4s, v9.4s
|
add v3.4s, v3.4s, v9.4s
|
||||||
scvtf v3.4s, v3.4s
|
scvtf v3.4s, v3.4s
|
||||||
mov v4.4s, v2.4s
|
mov v4.16b, v2.16b
|
||||||
fmla v4.4s, v3.4s, v1.4s
|
fmla v4.4s, v3.4s, v1.4s
|
||||||
|
|
||||||
ld1 {v0.4s}, [x15], x11
|
ld1 {v0.4s}, [x15], x11
|
||||||
|
@ -756,16 +756,16 @@ blt E1
|
||||||
ld1 {v0.4s}, [x15], x11
|
ld1 {v0.4s}, [x15], x11
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v14.4s
|
mov v8.16b, v14.16b
|
||||||
mov v9.4s, v14.4s
|
mov v9.16b, v14.16b
|
||||||
fmla v8.4s, v12.4s, v4.4s
|
fmla v8.4s, v12.4s, v4.4s
|
||||||
ld1 {v1.4s}, [x15], x11
|
ld1 {v1.4s}, [x15], x11
|
||||||
fmla v9.4s, v13.4s, v4.4s
|
fmla v9.4s, v13.4s, v4.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
ld1 {v2.4s}, [x15], x11
|
ld1 {v2.4s}, [x15], x11
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v14.4s
|
mov v10.16b, v14.16b
|
||||||
mov v11.4s, v14.4s
|
mov v11.16b, v14.16b
|
||||||
ld1 {v3.4s}, [x15], x11
|
ld1 {v3.4s}, [x15], x11
|
||||||
fmla v10.4s, v12.4s, v4.4s
|
fmla v10.4s, v12.4s, v4.4s
|
||||||
fmla v11.4s, v13.4s, v4.4s
|
fmla v11.4s, v13.4s, v4.4s
|
||||||
|
@ -810,7 +810,7 @@ blt E1
|
||||||
mvni v9.4s, #6
|
mvni v9.4s, #6
|
||||||
add v3.4s, v3.4s, v9.4s
|
add v3.4s, v3.4s, v9.4s
|
||||||
scvtf v3.4s, v3.4s
|
scvtf v3.4s, v3.4s
|
||||||
mov v4.4s, v2.4s
|
mov v4.16b, v2.16b
|
||||||
fmla v4.4s, v3.4s, v1.4s
|
fmla v4.4s, v3.4s, v1.4s
|
||||||
|
|
||||||
ld1 {v0.4s}, [x15], x11
|
ld1 {v0.4s}, [x15], x11
|
||||||
|
@ -840,14 +840,14 @@ blt E1
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v14.4s
|
mov v8.16b, v14.16b
|
||||||
mov v9.4s, v14.4s
|
mov v9.16b, v14.16b
|
||||||
fmla v8.4s, v12.4s, v4.4s
|
fmla v8.4s, v12.4s, v4.4s
|
||||||
fmla v9.4s, v13.4s, v4.4s
|
fmla v9.4s, v13.4s, v4.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v14.4s
|
mov v10.16b, v14.16b
|
||||||
mov v11.4s, v14.4s
|
mov v11.16b, v14.16b
|
||||||
fmla v10.4s, v12.4s, v4.4s
|
fmla v10.4s, v12.4s, v4.4s
|
||||||
fmla v11.4s, v13.4s, v4.4s
|
fmla v11.4s, v13.4s, v4.4s
|
||||||
// ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x15], x11
|
// ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x15], x11
|
||||||
|
@ -953,14 +953,14 @@ LoopE1:
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v26.4s
|
mov v8.16b, v26.16b
|
||||||
mov v9.4s, v27.4s
|
mov v9.16b, v27.16b
|
||||||
fmla v8.4s, v12.4s, v24.4s
|
fmla v8.4s, v12.4s, v24.4s
|
||||||
fmla v9.4s, v13.4s, v25.4s
|
fmla v9.4s, v13.4s, v25.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v26.4s
|
mov v10.16b, v26.16b
|
||||||
mov v11.4s, v27.4s
|
mov v11.16b, v27.16b
|
||||||
fmla v10.4s, v12.4s, v24.4s
|
fmla v10.4s, v12.4s, v24.4s
|
||||||
fmla v11.4s, v13.4s, v25.4s
|
fmla v11.4s, v13.4s, v25.4s
|
||||||
ld1 {v0.s}[0], [x15], x11
|
ld1 {v0.s}[0], [x15], x11
|
||||||
|
@ -989,14 +989,14 @@ LoopE1:
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v26.4s
|
mov v8.16b, v26.16b
|
||||||
mov v9.4s, v27.4s
|
mov v9.16b, v27.16b
|
||||||
fmla v8.4s, v12.4s, v24.4s
|
fmla v8.4s, v12.4s, v24.4s
|
||||||
fmla v9.4s, v13.4s, v25.4s
|
fmla v9.4s, v13.4s, v25.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v26.4s
|
mov v10.16b, v26.16b
|
||||||
mov v11.4s, v27.4s
|
mov v11.16b, v27.16b
|
||||||
fmla v10.4s, v12.4s, v24.4s
|
fmla v10.4s, v12.4s, v24.4s
|
||||||
fmla v11.4s, v13.4s, v25.4s
|
fmla v11.4s, v13.4s, v25.4s
|
||||||
ld1 {v0.s}[0], [x15], x11
|
ld1 {v0.s}[0], [x15], x11
|
||||||
|
@ -1059,14 +1059,14 @@ LoopE1:
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v14.4s
|
mov v8.16b, v14.16b
|
||||||
mov v9.4s, v14.4s
|
mov v9.16b, v14.16b
|
||||||
fmla v8.4s, v12.4s, v4.4s
|
fmla v8.4s, v12.4s, v4.4s
|
||||||
fmla v9.4s, v13.4s, v4.4s
|
fmla v9.4s, v13.4s, v4.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v14.4s
|
mov v10.16b, v14.16b
|
||||||
mov v11.4s, v14.4s
|
mov v11.16b, v14.16b
|
||||||
fmla v10.4s, v12.4s, v4.4s
|
fmla v10.4s, v12.4s, v4.4s
|
||||||
fmla v11.4s, v13.4s, v4.4s
|
fmla v11.4s, v13.4s, v4.4s
|
||||||
|
|
||||||
|
@ -1102,14 +1102,14 @@ LoopE1:
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v15.4s
|
mov v8.16b, v15.16b
|
||||||
mov v9.4s, v15.4s
|
mov v9.16b, v15.16b
|
||||||
fmla v8.4s, v12.4s, v4.4s
|
fmla v8.4s, v12.4s, v4.4s
|
||||||
fmla v9.4s, v13.4s, v4.4s
|
fmla v9.4s, v13.4s, v4.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v15.4s
|
mov v10.16b, v15.16b
|
||||||
mov v11.4s, v15.4s
|
mov v11.16b, v15.16b
|
||||||
fmla v10.4s, v12.4s, v4.4s
|
fmla v10.4s, v12.4s, v4.4s
|
||||||
fmla v11.4s, v13.4s, v4.4s
|
fmla v11.4s, v13.4s, v4.4s
|
||||||
|
|
||||||
|
|
|
@ -74,14 +74,14 @@ LoopE8:
|
||||||
sxtl2 v11.4s, v1.8h
|
sxtl2 v11.4s, v1.8h
|
||||||
scvtf v0.4s, v8.4s
|
scvtf v0.4s, v8.4s
|
||||||
scvtf v1.4s, v9.4s
|
scvtf v1.4s, v9.4s
|
||||||
mov v8.4s, v14.4s
|
mov v8.16b, v14.16b
|
||||||
mov v9.4s, v15.4s
|
mov v9.16b, v15.16b
|
||||||
fmla v8.4s, v0.4s, v12.4s
|
fmla v8.4s, v0.4s, v12.4s
|
||||||
fmla v9.4s, v1.4s, v13.4s
|
fmla v9.4s, v1.4s, v13.4s
|
||||||
scvtf v0.4s, v10.4s
|
scvtf v0.4s, v10.4s
|
||||||
scvtf v1.4s, v11.4s
|
scvtf v1.4s, v11.4s
|
||||||
mov v10.4s, v14.4s
|
mov v10.16b, v14.16b
|
||||||
mov v11.4s, v15.4s
|
mov v11.16b, v15.16b
|
||||||
fmla v10.4s, v0.4s, v12.4s
|
fmla v10.4s, v0.4s, v12.4s
|
||||||
fmla v11.4s, v1.4s, v13.4s
|
fmla v11.4s, v1.4s, v13.4s
|
||||||
ld1 {v0.4s, v1.4s}, [x15], x11
|
ld1 {v0.4s, v1.4s}, [x15], x11
|
||||||
|
@ -137,14 +137,14 @@ LoopE8:
|
||||||
sxtl2 v11.4s, v1.8h
|
sxtl2 v11.4s, v1.8h
|
||||||
scvtf v0.4s, v8.4s
|
scvtf v0.4s, v8.4s
|
||||||
scvtf v1.4s, v9.4s
|
scvtf v1.4s, v9.4s
|
||||||
mov v8.4s, v14.4s
|
mov v8.16b, v14.16b
|
||||||
mov v9.4s, v15.4s
|
mov v9.16b, v15.16b
|
||||||
fmla v8.4s, v0.4s, v12.4s
|
fmla v8.4s, v0.4s, v12.4s
|
||||||
fmla v9.4s, v1.4s, v13.4s
|
fmla v9.4s, v1.4s, v13.4s
|
||||||
scvtf v0.4s, v10.4s
|
scvtf v0.4s, v10.4s
|
||||||
scvtf v1.4s, v11.4s
|
scvtf v1.4s, v11.4s
|
||||||
mov v10.4s, v14.4s
|
mov v10.16b, v14.16b
|
||||||
mov v11.4s, v15.4s
|
mov v11.16b, v15.16b
|
||||||
fmla v10.4s, v0.4s, v12.4s
|
fmla v10.4s, v0.4s, v12.4s
|
||||||
fmla v11.4s, v1.4s, v13.4s
|
fmla v11.4s, v1.4s, v13.4s
|
||||||
ld1 {v0.4s, v1.4s}, [x15], x11
|
ld1 {v0.4s, v1.4s}, [x15], x11
|
||||||
|
@ -294,14 +294,14 @@ LoopE8:
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v14.4s
|
mov v8.16b, v14.16b
|
||||||
mov v9.4s, v14.4s
|
mov v9.16b, v14.16b
|
||||||
fmla v8.4s, v12.4s, v4.4s
|
fmla v8.4s, v12.4s, v4.4s
|
||||||
fmla v9.4s, v13.4s, v4.4s
|
fmla v9.4s, v13.4s, v4.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v14.4s
|
mov v10.16b, v14.16b
|
||||||
mov v11.4s, v14.4s
|
mov v11.16b, v14.16b
|
||||||
fmla v10.4s, v12.4s, v4.4s
|
fmla v10.4s, v12.4s, v4.4s
|
||||||
fmla v11.4s, v13.4s, v4.4s
|
fmla v11.4s, v13.4s, v4.4s
|
||||||
|
|
||||||
|
@ -371,14 +371,14 @@ LoopE8:
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v14.4s
|
mov v8.16b, v14.16b
|
||||||
mov v9.4s, v14.4s
|
mov v9.16b, v14.16b
|
||||||
fmla v8.4s, v12.4s, v4.4s
|
fmla v8.4s, v12.4s, v4.4s
|
||||||
fmla v9.4s, v13.4s, v4.4s
|
fmla v9.4s, v13.4s, v4.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v14.4s
|
mov v10.16b, v14.16b
|
||||||
mov v11.4s, v14.4s
|
mov v11.16b, v14.16b
|
||||||
fmla v10.4s, v12.4s, v4.4s
|
fmla v10.4s, v12.4s, v4.4s
|
||||||
fmla v11.4s, v13.4s, v4.4s
|
fmla v11.4s, v13.4s, v4.4s
|
||||||
|
|
||||||
|
@ -520,14 +520,14 @@ blt E1
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v26.4s
|
mov v8.16b, v26.16b
|
||||||
mov v9.4s, v27.4s
|
mov v9.16b, v27.16b
|
||||||
fmla v8.4s, v12.4s, v24.4s
|
fmla v8.4s, v12.4s, v24.4s
|
||||||
fmla v9.4s, v13.4s, v25.4s
|
fmla v9.4s, v13.4s, v25.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v26.4s
|
mov v10.16b, v26.16b
|
||||||
mov v11.4s, v27.4s
|
mov v11.16b, v27.16b
|
||||||
fmla v10.4s, v12.4s, v24.4s
|
fmla v10.4s, v12.4s, v24.4s
|
||||||
fmla v11.4s, v13.4s, v25.4s
|
fmla v11.4s, v13.4s, v25.4s
|
||||||
// st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0]
|
// st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0]
|
||||||
|
@ -567,14 +567,14 @@ blt E1
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v26.4s
|
mov v8.16b, v26.16b
|
||||||
mov v9.4s, v27.4s
|
mov v9.16b, v27.16b
|
||||||
fmla v8.4s, v12.4s, v24.4s
|
fmla v8.4s, v12.4s, v24.4s
|
||||||
fmla v9.4s, v13.4s, v25.4s
|
fmla v9.4s, v13.4s, v25.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v26.4s
|
mov v10.16b, v26.16b
|
||||||
mov v11.4s, v27.4s
|
mov v11.16b, v27.16b
|
||||||
fmla v10.4s, v12.4s, v24.4s
|
fmla v10.4s, v12.4s, v24.4s
|
||||||
fmla v11.4s, v13.4s, v25.4s
|
fmla v11.4s, v13.4s, v25.4s
|
||||||
ld1 {v0.4s}, [x15], x11
|
ld1 {v0.4s}, [x15], x11
|
||||||
|
@ -669,16 +669,16 @@ blt E1
|
||||||
ld1 {v0.4s}, [x15], x11
|
ld1 {v0.4s}, [x15], x11
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v14.4s
|
mov v8.16b, v14.16b
|
||||||
mov v9.4s, v14.4s
|
mov v9.16b, v14.16b
|
||||||
fmla v8.4s, v12.4s, v4.4s
|
fmla v8.4s, v12.4s, v4.4s
|
||||||
ld1 {v1.4s}, [x15], x11
|
ld1 {v1.4s}, [x15], x11
|
||||||
fmla v9.4s, v13.4s, v4.4s
|
fmla v9.4s, v13.4s, v4.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
ld1 {v2.4s}, [x15], x11
|
ld1 {v2.4s}, [x15], x11
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v14.4s
|
mov v10.16b, v14.16b
|
||||||
mov v11.4s, v14.4s
|
mov v11.16b, v14.16b
|
||||||
ld1 {v3.4s}, [x15], x11
|
ld1 {v3.4s}, [x15], x11
|
||||||
fmla v10.4s, v12.4s, v4.4s
|
fmla v10.4s, v12.4s, v4.4s
|
||||||
fmla v11.4s, v13.4s, v4.4s
|
fmla v11.4s, v13.4s, v4.4s
|
||||||
|
@ -717,14 +717,14 @@ blt E1
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v14.4s
|
mov v8.16b, v14.16b
|
||||||
mov v9.4s, v14.4s
|
mov v9.16b, v14.16b
|
||||||
fmla v8.4s, v12.4s, v4.4s
|
fmla v8.4s, v12.4s, v4.4s
|
||||||
fmla v9.4s, v13.4s, v4.4s
|
fmla v9.4s, v13.4s, v4.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v14.4s
|
mov v10.16b, v14.16b
|
||||||
mov v11.4s, v14.4s
|
mov v11.16b, v14.16b
|
||||||
fmla v10.4s, v12.4s, v4.4s
|
fmla v10.4s, v12.4s, v4.4s
|
||||||
fmla v11.4s, v13.4s, v4.4s
|
fmla v11.4s, v13.4s, v4.4s
|
||||||
// ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x15], x11
|
// ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x15], x11
|
||||||
|
@ -819,14 +819,14 @@ LoopE1:
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v26.4s
|
mov v8.16b, v26.16b
|
||||||
mov v9.4s, v27.4s
|
mov v9.16b, v27.16b
|
||||||
fmla v8.4s, v12.4s, v24.4s
|
fmla v8.4s, v12.4s, v24.4s
|
||||||
fmla v9.4s, v13.4s, v25.4s
|
fmla v9.4s, v13.4s, v25.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v26.4s
|
mov v10.16b, v26.16b
|
||||||
mov v11.4s, v27.4s
|
mov v11.16b, v27.16b
|
||||||
fmla v10.4s, v12.4s, v24.4s
|
fmla v10.4s, v12.4s, v24.4s
|
||||||
fmla v11.4s, v13.4s, v25.4s
|
fmla v11.4s, v13.4s, v25.4s
|
||||||
ld1 {v0.s}[0], [x15], x11
|
ld1 {v0.s}[0], [x15], x11
|
||||||
|
@ -849,14 +849,14 @@ LoopE1:
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v26.4s
|
mov v8.16b, v26.16b
|
||||||
mov v9.4s, v27.4s
|
mov v9.16b, v27.16b
|
||||||
fmla v8.4s, v12.4s, v24.4s
|
fmla v8.4s, v12.4s, v24.4s
|
||||||
fmla v9.4s, v13.4s, v25.4s
|
fmla v9.4s, v13.4s, v25.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v26.4s
|
mov v10.16b, v26.16b
|
||||||
mov v11.4s, v27.4s
|
mov v11.16b, v27.16b
|
||||||
fmla v10.4s, v12.4s, v24.4s
|
fmla v10.4s, v12.4s, v24.4s
|
||||||
fmla v11.4s, v13.4s, v25.4s
|
fmla v11.4s, v13.4s, v25.4s
|
||||||
ld1 {v0.s}[0], [x15], x11
|
ld1 {v0.s}[0], [x15], x11
|
||||||
|
@ -909,14 +909,14 @@ LoopE1:
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v14.4s
|
mov v8.16b, v14.16b
|
||||||
mov v9.4s, v14.4s
|
mov v9.16b, v14.16b
|
||||||
fmla v8.4s, v12.4s, v4.4s
|
fmla v8.4s, v12.4s, v4.4s
|
||||||
fmla v9.4s, v13.4s, v4.4s
|
fmla v9.4s, v13.4s, v4.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v14.4s
|
mov v10.16b, v14.16b
|
||||||
mov v11.4s, v14.4s
|
mov v11.16b, v14.16b
|
||||||
fmla v10.4s, v12.4s, v4.4s
|
fmla v10.4s, v12.4s, v4.4s
|
||||||
fmla v11.4s, v13.4s, v4.4s
|
fmla v11.4s, v13.4s, v4.4s
|
||||||
|
|
||||||
|
@ -944,14 +944,14 @@ LoopE1:
|
||||||
sxtl2 v11.4s, v12.8h
|
sxtl2 v11.4s, v12.8h
|
||||||
scvtf v12.4s, v8.4s
|
scvtf v12.4s, v8.4s
|
||||||
scvtf v13.4s, v9.4s
|
scvtf v13.4s, v9.4s
|
||||||
mov v8.4s, v15.4s
|
mov v8.16b, v15.16b
|
||||||
mov v9.4s, v15.4s
|
mov v9.16b, v15.16b
|
||||||
fmla v8.4s, v12.4s, v4.4s
|
fmla v8.4s, v12.4s, v4.4s
|
||||||
fmla v9.4s, v13.4s, v4.4s
|
fmla v9.4s, v13.4s, v4.4s
|
||||||
scvtf v12.4s, v10.4s
|
scvtf v12.4s, v10.4s
|
||||||
scvtf v13.4s, v11.4s
|
scvtf v13.4s, v11.4s
|
||||||
mov v10.4s, v15.4s
|
mov v10.16b, v15.16b
|
||||||
mov v11.4s, v15.4s
|
mov v11.16b, v15.16b
|
||||||
fmla v10.4s, v12.4s, v4.4s
|
fmla v10.4s, v12.4s, v4.4s
|
||||||
fmla v11.4s, v13.4s, v4.4s
|
fmla v11.4s, v13.4s, v4.4s
|
||||||
|
|
||||||
|
|
|
@ -68,9 +68,9 @@ LoopH:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v0.4s, v1.4s
|
scvtf v0.4s, v1.4s
|
||||||
scvtf v1.4s, v2.4s
|
scvtf v1.4s, v2.4s
|
||||||
mov v2.4s, v7.4s
|
mov v2.16b, v7.16b
|
||||||
fmla v2.4s, v1.4s, v5.4s
|
fmla v2.4s, v1.4s, v5.4s
|
||||||
mov v1.4s, v6.4s
|
mov v1.16b, v6.16b
|
||||||
fmla v1.4s, v0.4s, v4.4s
|
fmla v1.4s, v0.4s, v4.4s
|
||||||
|
|
||||||
ld1 {v0.4s}, [x15], #16
|
ld1 {v0.4s}, [x15], #16
|
||||||
|
@ -108,9 +108,9 @@ LoopH:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v0.4s, v1.4s
|
scvtf v0.4s, v1.4s
|
||||||
scvtf v1.4s, v2.4s
|
scvtf v1.4s, v2.4s
|
||||||
mov v2.4s, v7.4s
|
mov v2.16b, v7.16b
|
||||||
fmla v2.4s, v1.4s, v5.4s
|
fmla v2.4s, v1.4s, v5.4s
|
||||||
mov v1.4s, v6.4s
|
mov v1.16b, v6.16b
|
||||||
fmla v1.4s, v0.4s, v4.4s
|
fmla v1.4s, v0.4s, v4.4s
|
||||||
|
|
||||||
ld1 {v0.4s}, [x15], #16
|
ld1 {v0.4s}, [x15], #16
|
||||||
|
@ -164,9 +164,9 @@ LoopH:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v0.4s, v1.4s
|
scvtf v0.4s, v1.4s
|
||||||
scvtf v1.4s, v2.4s
|
scvtf v1.4s, v2.4s
|
||||||
mov v2.4s, v7.4s
|
mov v2.16b, v7.16b
|
||||||
fmla v2.4s, v1.4s, v5.4s
|
fmla v2.4s, v1.4s, v5.4s
|
||||||
mov v1.4s, v6.4s
|
mov v1.16b, v6.16b
|
||||||
fmla v1.4s, v0.4s, v4.4s
|
fmla v1.4s, v0.4s, v4.4s
|
||||||
|
|
||||||
ld1 {v0.4s}, [x15], #16
|
ld1 {v0.4s}, [x15], #16
|
||||||
|
@ -204,9 +204,9 @@ LoopH:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v0.4s, v1.4s
|
scvtf v0.4s, v1.4s
|
||||||
scvtf v1.4s, v2.4s
|
scvtf v1.4s, v2.4s
|
||||||
mov v2.4s, v7.4s
|
mov v2.16b, v7.16b
|
||||||
fmla v2.4s, v1.4s, v5.4s
|
fmla v2.4s, v1.4s, v5.4s
|
||||||
mov v1.4s, v6.4s
|
mov v1.16b, v6.16b
|
||||||
fmla v1.4s, v0.4s, v4.4s
|
fmla v1.4s, v0.4s, v4.4s
|
||||||
|
|
||||||
ld1 {v0.4s}, [x15], #16
|
ld1 {v0.4s}, [x15], #16
|
||||||
|
@ -386,8 +386,8 @@ LoopHRemain:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v1.4s, v1.4s
|
scvtf v1.4s, v1.4s
|
||||||
scvtf v2.4s, v2.4s
|
scvtf v2.4s, v2.4s
|
||||||
mov v3.4s, v21.4s
|
mov v3.16b, v21.16b
|
||||||
mov v4.4s, v21.4s
|
mov v4.16b, v21.16b
|
||||||
fmla v3.4s, v1.4s, v20.4s
|
fmla v3.4s, v1.4s, v20.4s
|
||||||
fmla v4.4s, v2.4s, v20.4s
|
fmla v4.4s, v2.4s, v20.4s
|
||||||
|
|
||||||
|
@ -428,8 +428,8 @@ LoopHRemain:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v1.4s, v1.4s
|
scvtf v1.4s, v1.4s
|
||||||
scvtf v2.4s, v2.4s
|
scvtf v2.4s, v2.4s
|
||||||
mov v3.4s, v21.4s
|
mov v3.16b, v21.16b
|
||||||
mov v4.4s, v21.4s
|
mov v4.16b, v21.16b
|
||||||
fmla v3.4s, v1.4s, v20.4s
|
fmla v3.4s, v1.4s, v20.4s
|
||||||
fmla v4.4s, v2.4s, v20.4s
|
fmla v4.4s, v2.4s, v20.4s
|
||||||
|
|
||||||
|
@ -483,8 +483,8 @@ LoopHRemain:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v1.4s, v1.4s
|
scvtf v1.4s, v1.4s
|
||||||
scvtf v2.4s, v2.4s
|
scvtf v2.4s, v2.4s
|
||||||
mov v3.4s, v21.4s
|
mov v3.16b, v21.16b
|
||||||
mov v4.4s, v21.4s
|
mov v4.16b, v21.16b
|
||||||
fmla v3.4s, v1.4s, v20.4s
|
fmla v3.4s, v1.4s, v20.4s
|
||||||
fmla v4.4s, v2.4s, v20.4s
|
fmla v4.4s, v2.4s, v20.4s
|
||||||
ld1 {v0.4s, v1.4s, v2.4s}, [x15], #48
|
ld1 {v0.4s, v1.4s, v2.4s}, [x15], #48
|
||||||
|
@ -520,8 +520,8 @@ LoopHRemain:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v1.4s, v1.4s
|
scvtf v1.4s, v1.4s
|
||||||
scvtf v2.4s, v2.4s
|
scvtf v2.4s, v2.4s
|
||||||
mov v3.4s, v21.4s
|
mov v3.16b, v21.16b
|
||||||
mov v4.4s, v21.4s
|
mov v4.16b, v21.16b
|
||||||
fmla v3.4s, v1.4s, v20.4s
|
fmla v3.4s, v1.4s, v20.4s
|
||||||
fmla v4.4s, v2.4s, v20.4s
|
fmla v4.4s, v2.4s, v20.4s
|
||||||
|
|
||||||
|
|
|
@ -59,9 +59,9 @@ LoopH:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v0.4s, v1.4s
|
scvtf v0.4s, v1.4s
|
||||||
scvtf v1.4s, v2.4s
|
scvtf v1.4s, v2.4s
|
||||||
mov v2.4s, v7.4s
|
mov v2.16b, v7.16b
|
||||||
fmla v2.4s, v1.4s, v5.4s
|
fmla v2.4s, v1.4s, v5.4s
|
||||||
mov v1.4s, v6.4s
|
mov v1.16b, v6.16b
|
||||||
fmla v1.4s, v0.4s, v4.4s
|
fmla v1.4s, v0.4s, v4.4s
|
||||||
|
|
||||||
ld1 {v0.4s}, [x15], #16
|
ld1 {v0.4s}, [x15], #16
|
||||||
|
@ -99,9 +99,9 @@ LoopH:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v0.4s, v1.4s
|
scvtf v0.4s, v1.4s
|
||||||
scvtf v1.4s, v2.4s
|
scvtf v1.4s, v2.4s
|
||||||
mov v2.4s, v7.4s
|
mov v2.16b, v7.16b
|
||||||
fmla v2.4s, v1.4s, v5.4s
|
fmla v2.4s, v1.4s, v5.4s
|
||||||
mov v1.4s, v6.4s
|
mov v1.16b, v6.16b
|
||||||
fmla v1.4s, v0.4s, v4.4s
|
fmla v1.4s, v0.4s, v4.4s
|
||||||
|
|
||||||
ld1 {v0.4s}, [x15], #16
|
ld1 {v0.4s}, [x15], #16
|
||||||
|
@ -145,9 +145,9 @@ LoopH:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v0.4s, v1.4s
|
scvtf v0.4s, v1.4s
|
||||||
scvtf v1.4s, v2.4s
|
scvtf v1.4s, v2.4s
|
||||||
mov v2.4s, v7.4s
|
mov v2.16b, v7.16b
|
||||||
fmla v2.4s, v1.4s, v5.4s
|
fmla v2.4s, v1.4s, v5.4s
|
||||||
mov v1.4s, v6.4s
|
mov v1.16b, v6.16b
|
||||||
fmla v1.4s, v0.4s, v4.4s
|
fmla v1.4s, v0.4s, v4.4s
|
||||||
|
|
||||||
ld1 {v0.4s}, [x15], #16
|
ld1 {v0.4s}, [x15], #16
|
||||||
|
@ -185,9 +185,9 @@ LoopH:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v0.4s, v1.4s
|
scvtf v0.4s, v1.4s
|
||||||
scvtf v1.4s, v2.4s
|
scvtf v1.4s, v2.4s
|
||||||
mov v2.4s, v7.4s
|
mov v2.16b, v7.16b
|
||||||
fmla v2.4s, v1.4s, v5.4s
|
fmla v2.4s, v1.4s, v5.4s
|
||||||
mov v1.4s, v6.4s
|
mov v1.16b, v6.16b
|
||||||
fmla v1.4s, v0.4s, v4.4s
|
fmla v1.4s, v0.4s, v4.4s
|
||||||
|
|
||||||
ld1 {v0.4s}, [x15], #16
|
ld1 {v0.4s}, [x15], #16
|
||||||
|
@ -357,8 +357,8 @@ LoopHRemain:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v1.4s, v1.4s
|
scvtf v1.4s, v1.4s
|
||||||
scvtf v2.4s, v2.4s
|
scvtf v2.4s, v2.4s
|
||||||
mov v3.4s, v21.4s
|
mov v3.16b, v21.16b
|
||||||
mov v4.4s, v21.4s
|
mov v4.16b, v21.16b
|
||||||
fmla v3.4s, v1.4s, v20.4s
|
fmla v3.4s, v1.4s, v20.4s
|
||||||
fmla v4.4s, v2.4s, v20.4s
|
fmla v4.4s, v2.4s, v20.4s
|
||||||
|
|
||||||
|
@ -399,8 +399,8 @@ LoopHRemain:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v1.4s, v1.4s
|
scvtf v1.4s, v1.4s
|
||||||
scvtf v2.4s, v2.4s
|
scvtf v2.4s, v2.4s
|
||||||
mov v3.4s, v21.4s
|
mov v3.16b, v21.16b
|
||||||
mov v4.4s, v21.4s
|
mov v4.16b, v21.16b
|
||||||
fmla v3.4s, v1.4s, v20.4s
|
fmla v3.4s, v1.4s, v20.4s
|
||||||
fmla v4.4s, v2.4s, v20.4s
|
fmla v4.4s, v2.4s, v20.4s
|
||||||
|
|
||||||
|
@ -448,8 +448,8 @@ LoopHRemain:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v1.4s, v1.4s
|
scvtf v1.4s, v1.4s
|
||||||
scvtf v2.4s, v2.4s
|
scvtf v2.4s, v2.4s
|
||||||
mov v3.4s, v21.4s
|
mov v3.16b, v21.16b
|
||||||
mov v4.4s, v21.4s
|
mov v4.16b, v21.16b
|
||||||
fmla v3.4s, v1.4s, v20.4s
|
fmla v3.4s, v1.4s, v20.4s
|
||||||
fmla v4.4s, v2.4s, v20.4s
|
fmla v4.4s, v2.4s, v20.4s
|
||||||
ld1 {v0.4s, v1.4s, v2.4s}, [x15], #48
|
ld1 {v0.4s, v1.4s, v2.4s}, [x15], #48
|
||||||
|
@ -485,8 +485,8 @@ LoopHRemain:
|
||||||
sxtl2 v2.4s, v0.8h
|
sxtl2 v2.4s, v0.8h
|
||||||
scvtf v1.4s, v1.4s
|
scvtf v1.4s, v1.4s
|
||||||
scvtf v2.4s, v2.4s
|
scvtf v2.4s, v2.4s
|
||||||
mov v3.4s, v21.4s
|
mov v3.16b, v21.16b
|
||||||
mov v4.4s, v21.4s
|
mov v4.16b, v21.16b
|
||||||
fmla v3.4s, v1.4s, v20.4s
|
fmla v3.4s, v1.4s, v20.4s
|
||||||
fmla v4.4s, v2.4s, v20.4s
|
fmla v4.4s, v2.4s, v20.4s
|
||||||
|
|
||||||
|
|
|
@ -187,7 +187,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& input
|
||||||
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||||
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT * mIm2ColCount, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
|
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT * mIm2ColCount, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
|
||||||
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
||||||
if (nullptr == mBlitInfo.first) {
|
if (mBlitInfo.invalid()) {
|
||||||
return OUT_OF_MEMORY;
|
return OUT_OF_MEMORY;
|
||||||
}
|
}
|
||||||
bufferAlloc->free(mBlitInfo);
|
bufferAlloc->free(mBlitInfo);
|
||||||
|
@ -236,7 +236,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inpu
|
||||||
auto col_buffer_size = col_buffer_unit_size * mIm2ColCount;
|
auto col_buffer_size = col_buffer_unit_size * mIm2ColCount;
|
||||||
auto threadFunction = [&](int tId) {
|
auto threadFunction = [&](int tId) {
|
||||||
auto colAddr = im2colPtr + tId * mTempIm2ColBuffer->stride(0);
|
auto colAddr = im2colPtr + tId * mTempIm2ColBuffer->stride(0);
|
||||||
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
|
auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
|
||||||
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
||||||
|
|
||||||
int32_t info[4];
|
int32_t info[4];
|
||||||
|
|
|
@ -31,7 +31,7 @@ protected:
|
||||||
std::shared_ptr<Tensor> mTempIm2ColBuffer;
|
std::shared_ptr<Tensor> mTempIm2ColBuffer;
|
||||||
std::shared_ptr<CPUConvolution::ResourceInt8> mResource;
|
std::shared_ptr<CPUConvolution::ResourceInt8> mResource;
|
||||||
CPUConvolution::MutableResourceInt8 mMutableResource;
|
CPUConvolution::MutableResourceInt8 mMutableResource;
|
||||||
std::pair<void*, int> mBlitInfo;
|
MemChunk mBlitInfo;
|
||||||
std::pair<size_t, size_t> mBlitInfoStride;
|
std::pair<size_t, size_t> mBlitInfoStride;
|
||||||
int mIm2ColCount;
|
int mIm2ColCount;
|
||||||
};
|
};
|
||||||
|
|
|
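The hunks above are one instance of a migration repeated throughout this commit: the blit-info handle moves from std::pair<void*, int> (raw pointer plus byte offset) to the MemChunk type, so allocation failure is now checked with invalid() and the working address comes from ptr() instead of combining first and second by hand. A minimal C++ sketch of the migrated pattern, assuming only the MemChunk interface implied by these hunks (alloc() returning a MemChunk, invalid(), ptr(), free()); the surrounding types are MNN internals, so this is not a standalone program:

    // Reserve the blit buffer during onResize; the allocator now hands back a MemChunk.
    mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
    if (mBlitInfo.invalid()) {          // was: nullptr == mBlitInfo.first
        return OUT_OF_MEMORY;
    }
    bufferAlloc->free(mBlitInfo);       // reserve-then-release, unchanged from the old flow

    // At execute time each thread derives its slice from ptr() instead of first + second.
    auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
    auto el     = (int32_t *)(srcPtr + mBlitInfoStride.second);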
@ -193,8 +193,9 @@ ErrorCode ConvInt8Winograd::onResize(const std::vector<Tensor *> &inputs, const
|
||||||
}
|
}
|
||||||
for (auto& unit : mUnits) {
|
for (auto& unit : mUnits) {
|
||||||
int sy = ALIMAX(unit.kyStart - mPadY, 0), sx = ALIMAX(unit.kxStart - mPadX, 0);
|
int sy = ALIMAX(unit.kyStart - mPadY, 0), sx = ALIMAX(unit.kxStart - mPadX, 0);
|
||||||
auto srcData = input->host<float>() + (sy * iw + sx) * UNIT;
|
auto srcChunk = TensorUtils::getDescribe(input)->mem->chunk() + (sy * iw + sx) * UNIT;
|
||||||
unit.input.reset(Tensor::create<float>({batch, ic, ih - sy, iw - sx}, srcData, Tensor::CAFFE_C4));
|
unit.input.reset(Tensor::createDevice<float>({batch, ic, ih - sy, iw - sx}, Tensor::CAFFE_C4));
|
||||||
|
TensorUtils::getDescribe(unit.input.get())->mem.reset(new CPUMemObj(nullptr, srcChunk, 0));
|
||||||
for (int i = 0; i < input->dimensions(); ++i) {
|
for (int i = 0; i < input->dimensions(); ++i) {
|
||||||
unit.input->setStride(i, input->stride(i));
|
unit.input->setStride(i, input->stride(i));
|
||||||
}
|
}
|
||||||
|
@ -296,6 +297,7 @@ ErrorCode ConvInt8Winograd::onExecute(const std::vector<Tensor *> &inputs, const
|
||||||
core->MNNInt8ScaleToFloat(mInputFloat->host<float>(), inputs[0]->host<int8_t>(), scale.data(), size / UNIT, inputQuant[1]);
|
core->MNNInt8ScaleToFloat(mInputFloat->host<float>(), inputs[0]->host<int8_t>(), scale.data(), size / UNIT, inputQuant[1]);
|
||||||
std::vector<Tensor*> tmp_outputs;
|
std::vector<Tensor*> tmp_outputs;
|
||||||
for (auto& unit : mUnits) {
|
for (auto& unit : mUnits) {
|
||||||
|
unit.input->buffer().host = TensorUtils::getDescribe(unit.input.get())->mem->chunk().ptr();
|
||||||
auto ret = unit.runner->onExecute({unit.input.get()}, {unit.output.get()});
|
auto ret = unit.runner->onExecute({unit.input.get()}, {unit.output.get()});
|
||||||
if (ret != NO_ERROR) {
|
if (ret != NO_ERROR) {
|
||||||
return ret;
|
return ret;
|
||||||
|
|
|
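In ConvInt8Winograd the per-unit input tensor no longer captures a raw host pointer at resize time: onResize records only the MemChunk offset (wrapped in a CPUMemObj), and onExecute re-resolves the host address from chunk().ptr() right before running the unit, so the backing buffer may move or be reallocated between the two calls. Condensed from the two hunks above into one sketch (the CPUMemObj constructor arguments are copied from the diff, not documented independently):

    // onResize: describe the sub-view by chunk + offset only, no host pointer yet.
    auto srcChunk = TensorUtils::getDescribe(input)->mem->chunk() + (sy * iw + sx) * UNIT;
    unit.input.reset(Tensor::createDevice<float>({batch, ic, ih - sy, iw - sx}, Tensor::CAFFE_C4));
    TensorUtils::getDescribe(unit.input.get())->mem.reset(new CPUMemObj(nullptr, srcChunk, 0));

    // onExecute: bind the concrete address just before running the unit.
    unit.input->buffer().host = TensorUtils::getDescribe(unit.input.get())->mem->chunk().ptr();
    auto ret = unit.runner->onExecute({unit.input.get()}, {unit.output.get()});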
@ -14,6 +14,7 @@
|
||||||
#include "ConvOpt.h"
|
#include "ConvOpt.h"
|
||||||
#include "core/Macro.h"
|
#include "core/Macro.h"
|
||||||
#include "CommonOptFunction.h"
|
#include "CommonOptFunction.h"
|
||||||
|
#include "core/TensorUtils.hpp"
|
||||||
|
|
||||||
namespace MNN {
|
namespace MNN {
|
||||||
Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon *common, Backend *b, const float *originWeight,
|
Convolution1x1Strassen::Convolution1x1Strassen(const Convolution2DCommon *common, Backend *b, const float *originWeight,
|
||||||
|
@ -88,8 +89,9 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
|
||||||
auto matrixSizeE = output->height() * output->width() * input->batch();
|
auto matrixSizeE = output->height() * output->width() * input->batch();
|
||||||
auto outputPlane = output->height() * output->width();
|
auto outputPlane = output->height() * output->width();
|
||||||
mUnits.clear();
|
mUnits.clear();
|
||||||
auto inputPtr = input->host<uint8_t>();
|
auto inputPtr = TensorUtils::getDescribe(input)->mem->chunk();
|
||||||
auto outputPtr = output->host<uint8_t>();
|
auto outputPtr = TensorUtils::getDescribe(output)->mem->chunk();
|
||||||
|
|
||||||
std::shared_ptr<char> __autoFunction;
|
std::shared_ptr<char> __autoFunction;
|
||||||
auto padY = mPadY;
|
auto padY = mPadY;
|
||||||
auto padX = mPadX;
|
auto padX = mPadX;
|
||||||
|
@ -124,9 +126,9 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
|
||||||
int l = ic;
|
int l = ic;
|
||||||
int h = oc;
|
int h = oc;
|
||||||
auto aPtr = inputPtr + core->pack * planeStart * bytes;
|
auto aPtr = inputPtr + core->pack * planeStart * bytes;
|
||||||
auto bPtr = weightTensor->host<uint8_t>();
|
auto bPtr = TensorUtils::getDescribe(weightTensor)->mem->chunk();
|
||||||
auto cPtr = outputPtr + core->pack * planeStart * bytes;
|
auto cPtr = outputPtr + core->pack * planeStart * bytes;
|
||||||
auto biasPtr = mResource->mBias->host<uint8_t>();
|
auto biasPtr = TensorUtils::getDescribe(mResource->mBias.get())->mem->chunk();
|
||||||
memoryPool->beginGroup();
|
memoryPool->beginGroup();
|
||||||
auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters);
|
auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters);
|
||||||
if (NO_ERROR != code) {
|
if (NO_ERROR != code) {
|
||||||
|
@ -168,9 +170,9 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector<Tensor *> &inputs,
|
||||||
int l = ic;
|
int l = ic;
|
||||||
int h = std::min(ocSize * core->pack, ocWeightSize * hPack);
|
int h = std::min(ocSize * core->pack, ocWeightSize * hPack);
|
||||||
auto aPtr = inputPtr;
|
auto aPtr = inputPtr;
|
||||||
auto bPtr = mResource->mWeight->host<uint8_t>() + hPack * icAlign * ocStartWeight * bytes;
|
auto bPtr = TensorUtils::getDescribe(mResource->mWeight.get())->mem->chunk() + hPack * icAlign * ocStartWeight * bytes;
|
||||||
auto cPtr = outputPtr + core->pack * matrixSizeE * ocStart * bytes;
|
auto cPtr = outputPtr + core->pack * matrixSizeE * ocStart * bytes;
|
||||||
auto biasPtr = mResource->mBias->host<uint8_t>() + core->pack * ocStart * bytes;
|
auto biasPtr = TensorUtils::getDescribe(mResource->mBias.get())->mem->chunk() + core->pack * ocStart * bytes;
|
||||||
memoryPool->beginGroup();
|
memoryPool->beginGroup();
|
||||||
auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters);
|
auto code = unit.mStracssenComputor->onEncode(e, l, h, matrixSizeE * core->pack, UP_DIV(l, lPack) * lPack * hPack, matrixSizeE * core->pack, aPtr, bPtr, cPtr, true, biasPtr, postParameters);
|
||||||
if (NO_ERROR != code) {
|
if (NO_ERROR != code) {
|
||||||
|
|
|
@ -413,7 +413,6 @@ ErrorCode DeconvolutionWithStride::onResize(const std::vector<Tensor*>& inputs,
|
||||||
if (!res) {
|
if (!res) {
|
||||||
return OUT_OF_MEMORY;
|
return OUT_OF_MEMORY;
|
||||||
}
|
}
|
||||||
::memset(mSrcBuffer->host<float>(), 0, mSrcBuffer->size());
|
|
||||||
for (auto& unit : mComputeUnits) {
|
for (auto& unit : mComputeUnits) {
|
||||||
backend()->onReleaseBuffer(unit.dstBuffer.get(), Backend::DYNAMIC);
|
backend()->onReleaseBuffer(unit.dstBuffer.get(), Backend::DYNAMIC);
|
||||||
if (unit.winogradInfo.open) {
|
if (unit.winogradInfo.open) {
|
||||||
|
@ -469,6 +468,7 @@ ErrorCode DeconvolutionWithStride::onExecute(const std::vector<Tensor*>& inputs,
|
||||||
auto srcOrigin = input->host<float>();
|
auto srcOrigin = input->host<float>();
|
||||||
auto dstOrigin = output->host<float>();
|
auto dstOrigin = output->host<float>();
|
||||||
|
|
||||||
|
::memset(mSrcBuffer->host<float>(), 0, mSrcBuffer->size());
|
||||||
::memset(dstOrigin, 0, ow * oh * ocDiv4 * 4 * batchSize * sizeof(float));
|
::memset(dstOrigin, 0, ow * oh * ocDiv4 * 4 * batchSize * sizeof(float));
|
||||||
auto threadFunction = [&](int threadId) {
|
auto threadFunction = [&](int threadId) {
|
||||||
auto srcTotal = mSrcBuffer->host<float>() + threadId * mSrcBuffer->stride(0);
|
auto srcTotal = mSrcBuffer->host<float>() + threadId * mSrcBuffer->stride(0);
|
||||||
|
|
|
@ -440,10 +440,8 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
||||||
int LRoundupC4 = UP_DIV(LRoundup, unit);
|
int LRoundupC4 = UP_DIV(LRoundup, unit);
|
||||||
auto outputChannel = output->channel();
|
auto outputChannel = output->channel();
|
||||||
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, output, mPadX, mPadY, core, nullptr);
|
ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParameters, mCommon, input, output, mPadX, mPadY, core, nullptr);
|
||||||
const float *biasPtr = nullptr;
|
|
||||||
if (inputs.size() > 2) {
|
if (inputs.size() > 2) {
|
||||||
bias = inputs[2];
|
bias = inputs[2];
|
||||||
biasPtr = bias->host<float>();
|
|
||||||
}
|
}
|
||||||
auto kernelSize = mCommon->kernelX() * mCommon->kernelY();
|
auto kernelSize = mCommon->kernelX() * mCommon->kernelY();
|
||||||
|
|
||||||
|
@ -467,7 +465,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
||||||
auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
|
auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
|
||||||
auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1;
|
auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1;
|
||||||
auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float *)));
|
auto tempPtr = bufferAlloc->alloc(kernelSize * maxLine * threadNumber * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||||
if (nullptr == tempPtr.first) {
|
if (tempPtr.invalid()) {
|
||||||
return OUT_OF_MEMORY;
|
return OUT_OF_MEMORY;
|
||||||
}
|
}
|
||||||
backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
|
backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
|
||||||
|
@ -483,10 +481,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
||||||
MNN_PRINT("dense conv: n:%d, ih:%d, iw:%d, ic:%d, oh:%d, ow:%d, oc:%d, kh:%d, kw:%d, plane:%d, threadNumberFirst:%d, tileCount:%d, ePack:%d, pack::%d, bytes:%d\n",
|
MNN_PRINT("dense conv: n:%d, ih:%d, iw:%d, ic:%d, oh:%d, ow:%d, oc:%d, kh:%d, kw:%d, plane:%d, threadNumberFirst:%d, tileCount:%d, ePack:%d, pack::%d, bytes:%d\n",
|
||||||
batch, src_height, src_width, ic, height, width, outputChannel, kernel_width, kernel_height, plane, threadNumberFirst, tileCount, eP, unit, bytes);
|
batch, src_height, src_width, ic, height, width, outputChannel, kernel_width, kernel_height, plane, threadNumberFirst, tileCount, eP, unit, bytes);
|
||||||
#endif
|
#endif
|
||||||
|
const float* biasPtr = bias ? bias->host<float>() : nullptr;
|
||||||
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * 0;
|
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * 0;
|
||||||
auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second +
|
auto srcPtr = (float const **)(tempPtr.ptr() + 0 * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||||
0 * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
|
||||||
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
|
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
|
||||||
auto weightPtr = weight->host<uint8_t>();
|
auto weightPtr = weight->host<uint8_t>();
|
||||||
|
|
||||||
|
@ -614,10 +611,9 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& inputs
|
||||||
batch, src_height, src_width, ic, height, width, outputChannel, kernel_width, kernel_height, plane, tileCount, eP, unit, bytes);
|
batch, src_height, src_width, ic, height, width, outputChannel, kernel_width, kernel_height, plane, tileCount, eP, unit, bytes);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
const float* biasPtr = bias ? bias->host<float>() : nullptr;
|
||||||
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId;
|
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId;
|
||||||
auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second +
|
auto srcPtr = (float const **)(tempPtr.ptr() + tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||||
tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
|
||||||
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
|
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
|
||||||
auto weightPtr = weight->host<float>();
|
auto weightPtr = weight->host<float>();
|
||||||
int32_t info[4];
|
int32_t info[4];
|
||||||
|
|
|
@ -91,7 +91,7 @@ ErrorCode GemmInt8Executor::onResize(const std::vector<Tensor *> &inputs, const
|
||||||
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||||
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
|
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
|
||||||
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
||||||
if (nullptr == mBlitInfo.first) {
|
if (mBlitInfo.invalid()) {
|
||||||
return OUT_OF_MEMORY;
|
return OUT_OF_MEMORY;
|
||||||
}
|
}
|
||||||
bufferAlloc->free(mBlitInfo);
|
bufferAlloc->free(mBlitInfo);
|
||||||
|
@ -147,7 +147,7 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector<Tensor *> &inputs, const
|
||||||
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
|
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
|
||||||
info[2] = DST_XUNIT;
|
info[2] = DST_XUNIT;
|
||||||
info[3] = mIm2ColParamter.strideX;
|
info[3] = mIm2ColParamter.strideX;
|
||||||
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
|
auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
|
||||||
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
||||||
|
|
||||||
for (int tIndex = tId; tIndex < mTileCnt; tIndex += mThreadNums) {
|
for (int tIndex = tId; tIndex < mTileCnt; tIndex += mThreadNums) {
|
||||||
|
|
|
@ -31,7 +31,7 @@ protected:
|
||||||
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
|
ConvolutionCommon::Im2ColParameter mIm2ColParamter;
|
||||||
CPUConvolution::MutableResourceInt8 mMutableResource;
|
CPUConvolution::MutableResourceInt8 mMutableResource;
|
||||||
decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel;
|
decltype(CoreInt8Functions::Int8GemmKernel) mGemmKernel;
|
||||||
std::pair<void*, int> mBlitInfo;
|
MemChunk mBlitInfo;
|
||||||
std::pair<size_t, size_t> mBlitInfoStride;
|
std::pair<size_t, size_t> mBlitInfoStride;
|
||||||
};
|
};
|
||||||
} // namespace MNN
|
} // namespace MNN
|
||||||
|
|
|
@ -130,7 +130,7 @@ ErrorCode IdstConvolutionInt8::onResize(const std::vector<Tensor*>& inputs, cons
|
||||||
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||||
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, number);
|
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(DST_XUNIT, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, number);
|
||||||
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
||||||
if (nullptr == mBlitInfo.first) {
|
if (mBlitInfo.invalid()) {
|
||||||
return OUT_OF_MEMORY;
|
return OUT_OF_MEMORY;
|
||||||
}
|
}
|
||||||
bufferAlloc->free(mBlitInfo);
|
bufferAlloc->free(mBlitInfo);
|
||||||
|
@ -199,7 +199,7 @@ ErrorCode IdstConvolutionInt8::onExecute(const std::vector<Tensor*>& inputs, con
|
||||||
auto outputOrigin = output->host<float>() + batchIndex * output->stride(0);
|
auto outputOrigin = output->host<float>() + batchIndex * output->stride(0);
|
||||||
auto threadFunction = [&](int tId) {
|
auto threadFunction = [&](int tId) {
|
||||||
auto colAddr = mTempBuffer.host<int8_t>() + tId * mTempBuffer.buffer().dim[0].stride;
|
auto colAddr = mTempBuffer.host<int8_t>() + tId * mTempBuffer.buffer().dim[0].stride;
|
||||||
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
|
auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
|
||||||
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
||||||
|
|
||||||
int32_t info[4];
|
int32_t info[4];
|
||||||
|
|
|
@ -40,7 +40,7 @@ private:
|
||||||
std::vector<float> mPostParameters;
|
std::vector<float> mPostParameters;
|
||||||
// mFakeBias used by GemmKernel
|
// mFakeBias used by GemmKernel
|
||||||
std::shared_ptr<Tensor> mFakeBias;
|
std::shared_ptr<Tensor> mFakeBias;
|
||||||
std::pair<void*, int> mBlitInfo;
|
MemChunk mBlitInfo;
|
||||||
std::pair<size_t, size_t> mBlitInfoStride;
|
std::pair<size_t, size_t> mBlitInfoStride;
|
||||||
};
|
};
|
||||||
} // namespace MNN
|
} // namespace MNN
|
||||||
|
|
|
@ -142,6 +142,55 @@ static void _MNNPackC4Int8ForMatMul_ASparse(int8_t* destOrigin, int8_t const** s
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params) {
|
||||||
|
#ifdef MNN_USE_SSE
|
||||||
|
uint8_t* srcPtr = (uint8_t*)src;
|
||||||
|
uint8_t* dstPtr = (uint8_t*)dst;
|
||||||
|
int offset = 128;
|
||||||
|
#else
|
||||||
|
const int8_t* srcPtr = src;
|
||||||
|
int8_t* dstPtr = dst;
|
||||||
|
int offset = 0;
|
||||||
|
#endif
|
||||||
|
int inpZero = static_cast<int>(params->inputZeroPoint[0]);
|
||||||
|
int outZero = static_cast<int>(params->outputZeroPoint[0]);
|
||||||
|
float inpScale = params->inputScale[0];
|
||||||
|
float outScale = params->outputScale[0];
|
||||||
|
float sum = 0.f;
|
||||||
|
int max_ = static_cast<int>(params->maxValue);
|
||||||
|
int min_ = static_cast<int>(params->minValue);
|
||||||
|
for (int j = 0; j < size; ++j) {
|
||||||
|
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
|
||||||
|
sum += fx;
|
||||||
|
}
|
||||||
|
float mean = sum / size;
|
||||||
|
float square_sum = 0.f;
|
||||||
|
for (int j = 0; j < size; ++j) {
|
||||||
|
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
|
||||||
|
square_sum += (fx - mean) * (fx - mean);
|
||||||
|
}
|
||||||
|
float variable = square_sum / size;
|
||||||
|
variable = 1.f / std::sqrt(variable + epsilon);
|
||||||
|
|
||||||
|
if (gamma && beta) {
|
||||||
|
for (int j = 0; j < size; ++j) {
|
||||||
|
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
|
||||||
|
float fy = (fx - mean) * variable * gamma[j] + beta[j];
|
||||||
|
int sy = fy * outScale + outZero;
|
||||||
|
sy = ALIMAX(min_, ALIMIN(sy, max_));
|
||||||
|
dstPtr[j] = sy + offset;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int j = 0; j < size; ++j) {
|
||||||
|
float fx = (srcPtr[j] - inpZero - offset) * inpScale;
|
||||||
|
float fy = (fx - mean) * variable;
|
||||||
|
int sy = roundf(fy * outScale) + outZero;
|
||||||
|
sy = ALIMAX(min_, ALIMIN(sy, max_));
|
||||||
|
dstPtr[j] = sy + offset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
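Written out, the new MNNNormInt8 reference kernel above dequantizes, layer-normalizes, and requantizes in one pass. With s_in, z_in and s_out, z_out the input/output scale and zero point from QuanPrePostParameters, N = size, and the optional per-element affine gamma_j, beta_j, it computes (rounding/truncation details and the SSE +128 unsigned offset aside):

    x_j = (q_j - z_{in})\, s_{in}
    \mu = \tfrac{1}{N}\sum_j x_j, \qquad \sigma^2 = \tfrac{1}{N}\sum_j (x_j - \mu)^2
    y_j = \mathrm{clamp}\!\big( \big((x_j - \mu)\,\gamma_j / \sqrt{\sigma^2 + \epsilon} + \beta_j\big)\, s_{out} + z_{out},\ \mathrm{min},\ \mathrm{max} \big)

When gamma and beta are not supplied, the same formula applies with gamma_j = 1 and beta_j = 0, as in the else branch of the code.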
#ifndef MNN_USE_NEON
|
#ifndef MNN_USE_NEON
|
||||||
|
|
||||||
void MNNPackedSparseQuantMatMulEpx1(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap) {
|
void MNNPackedSparseQuantMatMulEpx1(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap) {
|
||||||
|
@ -2056,6 +2105,9 @@ void MNNCoreInt8FunctionInit() {
|
||||||
// pooling
|
// pooling
|
||||||
gCoreFunc->MNNAvgPoolInt8 = MNNAvgPoolInt8;
|
gCoreFunc->MNNAvgPoolInt8 = MNNAvgPoolInt8;
|
||||||
gCoreFunc->MNNMaxPoolInt8 = MNNMaxPoolInt8;
|
gCoreFunc->MNNMaxPoolInt8 = MNNMaxPoolInt8;
|
||||||
|
|
||||||
|
// Norm
|
||||||
|
gCoreFunc->MNNNormInt8 = MNNNormInt8;
|
||||||
|
|
||||||
#if defined(__aarch64__)
|
#if defined(__aarch64__)
|
||||||
auto core = MNNGetCoreFunctions();
|
auto core = MNNGetCoreFunctions();
|
||||||
|
|
|
@ -68,6 +68,7 @@ void MNNBinarySqdInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t*
|
||||||
void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast);
|
void MNNBinaryMaxInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast);
|
||||||
void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast);
|
void MNNBinaryMinInt8(int8_t* outputRaw, const int8_t* inputRaw0, const int8_t* inputRaw1, ssize_t* inputScalesInt32, float* inputScalesFp32, const QuanPrePostParameters* params, size_t elementSize, size_t needBroadcast);
|
||||||
void MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, ssize_t minValue, ssize_t maxValue, int8_t* inputZeroPoint, int8_t* outputZeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack = 4);
|
void MNNScaleAndAddBiasInt8(int8_t* dst, const int8_t* src, const int32_t* bias, const int32_t* alpha, int32_t mShiftBits, ssize_t minValue, ssize_t maxValue, int8_t* inputZeroPoint, int8_t* outputZeroPoint, ssize_t planeNumber, ssize_t biasNumber, ssize_t pack = 4);
|
||||||
|
void MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -102,7 +103,9 @@ struct CoreInt8Functions {
|
||||||
void (*MNNMaxPoolInt8)(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx);
|
void (*MNNMaxPoolInt8)(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx);
|
||||||
|
|
||||||
void (*MNNAvgPoolInt8)(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx, ssize_t paddingx, ssize_t factor);
|
void (*MNNAvgPoolInt8)(int8_t* dst, int8_t* src, size_t outputWidth, size_t inputWidth, size_t kernelx, size_t kernely, size_t stridesx, ssize_t paddingx, ssize_t factor);
|
||||||
|
|
||||||
|
// Norm
|
||||||
|
void (*MNNNormInt8)(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);
|
||||||
};
|
};
|
||||||
void MNNCoreInt8FunctionInit();
|
void MNNCoreInt8FunctionInit();
|
||||||
CoreInt8Functions* MNNGetInt8CoreFunctions();
|
CoreInt8Functions* MNNGetInt8CoreFunctions();
|
||||||
|
|
|
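The norm kernel is wired up the same way as the int8 pooling kernels: MNNCoreInt8FunctionInit() fills the new MNNNormInt8 slot in CoreInt8Functions, and callers dispatch through the table. A hedged usage sketch (the call site is illustrative only, not taken from this commit; the getter and signature are the ones declared above, and gamma/beta may be null as handled by the reference implementation):

    // Dispatch the int8 layer-norm through the core function table.
    auto core = MNNGetInt8CoreFunctions();            // table populated by MNNCoreInt8FunctionInit()
    core->MNNNormInt8(dst, src, gamma /* may be nullptr */, beta, epsilon, size, &quanParams);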
@ -144,7 +144,7 @@ ErrorCode SparseConvInt8TiledExecutor::onResize(const std::vector<Tensor*>& inpu
|
||||||
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
auto bufferAlloc = static_cast<CPUBackend*>(backend())->getBufferAllocator();
|
||||||
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
|
auto blitInfoSize = ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParamter.ow, mIm2ColParamter.kernelX * mIm2ColParamter.kernelY, mThreadNums);
|
||||||
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
mBlitInfo = bufferAlloc->alloc(blitInfoSize.first);
|
||||||
if (nullptr == mBlitInfo.first) {
|
if (mBlitInfo.invalid()) {
|
||||||
return OUT_OF_MEMORY;
|
return OUT_OF_MEMORY;
|
||||||
}
|
}
|
||||||
bufferAlloc->free(mBlitInfo);
|
bufferAlloc->free(mBlitInfo);
|
||||||
|
@ -193,7 +193,7 @@ ErrorCode SparseConvInt8TiledExecutor::onExecute(const std::vector<Tensor*>& inp
|
||||||
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
|
info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch;
|
||||||
info[2] = (int)mSparseQuantParam.eP;
|
info[2] = (int)mSparseQuantParam.eP;
|
||||||
info[3] = mIm2ColParamter.strideX;
|
info[3] = mIm2ColParamter.strideX;
|
||||||
auto srcPtr = (int8_t const **)((uint8_t *)mBlitInfo.first + mBlitInfo.second + tId * mBlitInfoStride.first);
|
auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first);
|
||||||
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
auto el = (int32_t *)(srcPtr + mBlitInfoStride.second);
|
||||||
|
|
||||||
for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) {
|
for (int tIndex = tId; tIndex < mTileCount; tIndex += mThreadNums) {
|
||||||
|
|
|
@ -309,7 +309,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
|
||||||
auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
|
auto bufferAlloc = static_cast<CPUBackend *>(backend())->getBufferAllocator();
|
||||||
auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1;
|
auto maxLine = UP_DIV(eP, mIm2ColParameters.ow) + 1;
|
||||||
auto tempPtr = bufferAlloc->alloc(ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParameters.ow, mIm2ColParameters.kernelX * mIm2ColParameters.kernelY, threadNumber).first);
|
auto tempPtr = bufferAlloc->alloc(ConvolutionTiledExecutor::computeBlitInfoSize(eP, mIm2ColParameters.ow, mIm2ColParameters.kernelX * mIm2ColParameters.kernelY, threadNumber).first);
|
||||||
if (nullptr == tempPtr.first) {
|
if (tempPtr.invalid()) {
|
||||||
return OUT_OF_MEMORY;
|
return OUT_OF_MEMORY;
|
||||||
}
|
}
|
||||||
backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
|
backend()->onReleaseBuffer(&mTempBufferTranspose, Backend::DYNAMIC);
|
||||||
|
@ -320,8 +320,7 @@ ErrorCode SparseConvolutionTiledImpl::onResize(const std::vector<Tensor*>& input
|
||||||
|
|
||||||
mFunction.second = [=](int tId) {
|
mFunction.second = [=](int tId) {
|
||||||
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId;
|
auto gemmBuffer = mTempBufferTranspose.host<uint8_t>() + mTempBufferTranspose.stride(0) * tId;
|
||||||
auto srcPtr = (float const **)((uint8_t *)tempPtr.first + tempPtr.second +
|
auto srcPtr = (float const **)(tempPtr.ptr() + tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
||||||
tId * kernelSize * maxLine * (4 * sizeof(int32_t) + sizeof(float *)));
|
|
||||||
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
|
auto el = (int32_t *)(srcPtr + kernelSize * maxLine);
|
||||||
|
|
||||||
int32_t info[4];
|
int32_t info[4];
|
||||||
|
|
|
@ -14,6 +14,7 @@
|
||||||
#include "core/AutoStorage.h"
|
#include "core/AutoStorage.h"
|
||||||
#include "core/Macro.h"
|
#include "core/Macro.h"
|
||||||
#include "core/Concurrency.h"
|
#include "core/Concurrency.h"
|
||||||
|
#include "core/TensorUtils.hpp"
|
||||||
//#define MNN_OPEN_TIME_TRACE
|
//#define MNN_OPEN_TIME_TRACE
|
||||||
#include <MNN/AutoTime.hpp>
|
#include <MNN/AutoTime.hpp>
|
||||||
#include "math/Vec.hpp"
|
#include "math/Vec.hpp"
|
||||||
|
@ -28,15 +29,15 @@ public:
|
||||||
mAllocator = allocator;
|
mAllocator = allocator;
|
||||||
}
|
}
|
||||||
~ AutoMemory() {
|
~ AutoMemory() {
|
||||||
if (nullptr != mContent.first) {
|
if (!mContent.invalid()) {
|
||||||
mAllocator->free(mContent);
|
mAllocator->free(mContent);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const std::pair<void*, int>& get() const {
|
const MemChunk& get() const {
|
||||||
return mContent;
|
return mContent;
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
std::pair<void*, int> mContent;
|
MemChunk mContent;
|
||||||
BufferAllocator* mAllocator;
|
BufferAllocator* mAllocator;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -62,15 +63,15 @@ ErrorCode StrassenMatrixComputor::_generateTrivalMatMul(int e, int l, int h, con
|
||||||
auto bExtraStride = bStride - UP_DIV(l, lP)*lP*hP * core->bytes;
|
auto bExtraStride = bStride - UP_DIV(l, lP)*lP*hP * core->bytes;
|
||||||
MNN_ASSERT(bExtraStride >= 0);
|
MNN_ASSERT(bExtraStride >= 0);
|
||||||
auto tileBufferBasic = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(numberThread * UP_DIV(l, lP) * eP * lP * bytes);
|
auto tileBufferBasic = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(numberThread * UP_DIV(l, lP) * eP * lP * bytes);
|
||||||
if (nullptr == tileBufferBasic.first) {
|
if (tileBufferBasic.invalid()) {
|
||||||
return OUT_OF_MEMORY;
|
return OUT_OF_MEMORY;
|
||||||
}
|
}
|
||||||
auto tileHostOrigin = (uint8_t*)tileBufferBasic.first + tileBufferBasic.second;
|
|
||||||
int unitNumber = e / eP;
|
int unitNumber = e / eP;
|
||||||
int xCount = e - unitNumber * eP;
|
int xCount = e - unitNumber * eP;
|
||||||
auto eReal = aStride / core->bytes / core->pack;
|
auto eReal = aStride / core->bytes / core->pack;
|
||||||
mFunctions.emplace_back(
|
mFunctions.emplace_back(
|
||||||
std::make_pair([cStride, l, h, xCount, AT, BT, CT, COT, tileHostOrigin, unitNumber, bExtraStride, numberThread, eReal, eP, active, this](int tId) {
|
std::make_pair([cStride, l, h, xCount, AT, BT, CT, COT, tileBufferBasic, unitNumber, bExtraStride, numberThread, eReal, eP, active, this](int tId) {
|
||||||
auto core = static_cast<CPUBackend*>(backend())->functions();
|
auto core = static_cast<CPUBackend*>(backend())->functions();
|
||||||
size_t parameters[6];
|
size_t parameters[6];
|
||||||
parameters[0] = xCount * core->bytes;
|
parameters[0] = xCount * core->bytes;
|
||||||
|
@ -79,17 +80,17 @@ ErrorCode StrassenMatrixComputor::_generateTrivalMatMul(int e, int l, int h, con
|
||||||
parameters[3] = cStride;
|
parameters[3] = cStride;
|
||||||
parameters[4] = 0;
|
parameters[4] = 0;
|
||||||
parameters[5] = bExtraStride;
|
parameters[5] = bExtraStride;
|
||||||
auto tileHost = tileHostOrigin + eP * parameters[1] * tId * core->bytes;
|
auto tileHost = tileBufferBasic.ptr() + eP * parameters[1] * tId * core->bytes;
|
||||||
const float* postParametersPtr = nullptr;
|
const float* postParametersPtr = nullptr;
|
||||||
if (!active.empty()) {
|
if (!active.empty()) {
|
||||||
postParametersPtr = active.data();
|
postParametersPtr = active.data();
|
||||||
}
|
}
|
||||||
auto aHost = mStack[AT.stackIndex] + AT.offsetBytes;
|
auto aHost = mStack[AT.stackIndex].ptr() + AT.offsetBytes;
|
||||||
auto bHost = mStack[BT.stackIndex] + BT.offsetBytes;
|
auto bHost = mStack[BT.stackIndex].ptr() + BT.offsetBytes;
|
||||||
auto cHost = mStack[CT.stackIndex] + CT.offsetBytes;
|
auto cHost = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
|
||||||
const uint8_t* biasPtr = nullptr;
|
const uint8_t* biasPtr = nullptr;
|
||||||
if (-1 != COT.stackIndex) {
|
if (-1 != COT.stackIndex) {
|
||||||
biasPtr = mStack[COT.stackIndex] + COT.offsetBytes;
|
biasPtr = mStack[COT.stackIndex].ptr() + COT.offsetBytes;
|
||||||
}
|
}
|
||||||
auto packUnit = core->bytes * core->pack;
|
auto packUnit = core->bytes * core->pack;
|
||||||
int32_t info[4];
|
int32_t info[4];
|
||||||
|
@ -166,7 +167,7 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
|
||||||
CTemp.stackIndex = (int)mStack.size();
|
CTemp.stackIndex = (int)mStack.size();
|
||||||
CTemp.offsetBytes = 0;
|
CTemp.offsetBytes = 0;
|
||||||
CTemp.lineStrideBytes = e * core->bytes * core->pack;
|
CTemp.lineStrideBytes = e * core->bytes * core->pack;
|
||||||
mStack.emplace_back((uint8_t*)CAddr.get().first + CAddr.get().second);
|
mStack.emplace_back(CAddr.get());
|
||||||
|
|
||||||
MatrixInfo Empty;
|
MatrixInfo Empty;
|
||||||
Empty.stackIndex = -1;
|
Empty.stackIndex = -1;
|
||||||
|
@ -197,8 +198,8 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
|
||||||
}
|
}
|
||||||
// Add CTemp to C
|
// Add CTemp to C
|
||||||
auto f1 = [CT, CTemp, e, cHeight, numberThread, core, this](int tId) {
|
auto f1 = [CT, CTemp, e, cHeight, numberThread, core, this](int tId) {
|
||||||
auto c11Ptr = mStack[CT.stackIndex] + CT.offsetBytes;
|
auto c11Ptr = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
|
||||||
auto xAddr = mStack[CTemp.stackIndex] + CTemp.offsetBytes;
|
auto xAddr = mStack[CTemp.stackIndex].ptr() + CTemp.offsetBytes;
|
||||||
MNNMATRIX_ADD_MULTITHREAD(c11Ptr, c11Ptr, xAddr, e, CT.lineStrideBytes, CT.lineStrideBytes, CTemp.lineStrideBytes, cHeight, core);
|
MNNMATRIX_ADD_MULTITHREAD(c11Ptr, c11Ptr, xAddr, e, CT.lineStrideBytes, CT.lineStrideBytes, CTemp.lineStrideBytes, cHeight, core);
|
||||||
};
|
};
|
||||||
mFunctions.emplace_back(std::make_pair(f1, numberThread));
|
mFunctions.emplace_back(std::make_pair(f1, numberThread));
|
||||||
|
@ -206,10 +207,10 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
|
||||||
if (!postParameters.empty() && COT.stackIndex >= 0) {
|
if (!postParameters.empty() && COT.stackIndex >= 0) {
|
||||||
if (1 == numberThread) {
|
if (1 == numberThread) {
|
||||||
auto postFunction = [CT, COT, e, cHeight, numberThread, postParameters, core, this](int tId) {
|
auto postFunction = [CT, COT, e, cHeight, numberThread, postParameters, core, this](int tId) {
|
||||||
auto biasPtr = (const float*)(mStack[COT.stackIndex] + COT.offsetBytes);
|
auto biasPtr = (const float*)(mStack[COT.stackIndex].ptr() + COT.offsetBytes);
|
||||||
auto width = e;
|
auto width = e;
|
||||||
auto height = cHeight;
|
auto height = cHeight;
|
||||||
auto c11Ptr = mStack[CT.stackIndex] + CT.offsetBytes;
|
auto c11Ptr = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
|
||||||
core->MNNAxByClampBroadcastUnit((float*)c11Ptr, (float*)c11Ptr, biasPtr, width, CT.lineStrideBytes / core->bytes, CT.lineStrideBytes / core->bytes, height, postParameters.data());
|
core->MNNAxByClampBroadcastUnit((float*)c11Ptr, (float*)c11Ptr, biasPtr, width, CT.lineStrideBytes / core->bytes, CT.lineStrideBytes / core->bytes, height, postParameters.data());
|
||||||
};
|
};
|
||||||
mFunctions.emplace_back(std::make_pair(postFunction, 1));
|
mFunctions.emplace_back(std::make_pair(postFunction, 1));
|
||||||
|
@ -217,8 +218,8 @@ ErrorCode StrassenMatrixComputor::_generateBasicMatMul(int e, int l, int h, cons
|
||||||
auto postFunction = [CT, COT, e, cHeight, numberThread, postParameters, core, this](int tId) {
|
auto postFunction = [CT, COT, e, cHeight, numberThread, postParameters, core, this](int tId) {
|
||||||
auto width = e;
|
auto width = e;
|
||||||
auto height = cHeight;
|
auto height = cHeight;
|
||||||
auto c11Ptr = mStack[CT.stackIndex] + CT.offsetBytes;
|
auto c11Ptr = mStack[CT.stackIndex].ptr() + CT.offsetBytes;
|
||||||
auto biasPtr = mStack[COT.stackIndex] + COT.offsetBytes;
|
auto biasPtr = mStack[COT.stackIndex].ptr() + COT.offsetBytes;
|
||||||
for (int y = tId; y < height; y+=numberThread) {
|
for (int y = tId; y < height; y+=numberThread) {
|
||||||
core->MNNAxByClampBroadcastUnit((float*)(c11Ptr + y * CT.lineStrideBytes), (float*)(c11Ptr + y * CT.lineStrideBytes), (const float*)(biasPtr + y * core->bytes * core->pack), width, 0, 0, 1, postParameters.data());
|
core->MNNAxByClampBroadcastUnit((float*)(c11Ptr + y * CT.lineStrideBytes), (float*)(c11Ptr + y * CT.lineStrideBytes), (const float*)(biasPtr + y * core->bytes * core->pack), width, 0, 0, 1, postParameters.data());
|
||||||
}
|
}
|
||||||
|
@ -278,19 +279,19 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
||||||
auto maxlH = std::max(lSub, hSub);
|
auto maxlH = std::max(lSub, hSub);
|
||||||
AutoMemory YAddr(hSub * lSub * core->bytes, allocator);
|
AutoMemory YAddr(hSub * lSub * core->bytes, allocator);
|
||||||
AutoMemory XAddr(maxlH * eSub * core->bytes, allocator);
|
AutoMemory XAddr(maxlH * eSub * core->bytes, allocator);
|
||||||
if (nullptr == XAddr.get().first || nullptr == YAddr.get().first) {
|
if (XAddr.get().invalid() || YAddr.get().invalid()) {
|
||||||
return OUT_OF_MEMORY;
|
return OUT_OF_MEMORY;
|
||||||
}
|
}
|
||||||
MatrixInfo Y;
|
MatrixInfo Y;
|
||||||
Y.stackIndex = (int)mStack.size();
|
Y.stackIndex = (int)mStack.size();
|
||||||
mStack.emplace_back((uint8_t*)YAddr.get().first + YAddr.get().second);
|
mStack.emplace_back(YAddr.get());
|
||||||
Y.offsetBytes = 0;
|
Y.offsetBytes = 0;
|
||||||
Y.lineStrideBytes = lSub * core->bytes * hP;
|
Y.lineStrideBytes = lSub * core->bytes * hP;
|
||||||
MatrixInfo X;
|
MatrixInfo X;
|
||||||
X.stackIndex = (int)mStack.size();
|
X.stackIndex = (int)mStack.size();
|
||||||
X.offsetBytes = 0;
|
X.offsetBytes = 0;
|
||||||
X.lineStrideBytes = eSub * core->bytes * core->pack;
|
X.lineStrideBytes = eSub * core->bytes * core->pack;
|
||||||
mStack.emplace_back((uint8_t*)XAddr.get().first + XAddr.get().second);
|
mStack.emplace_back(XAddr.get());
|
||||||
|
|
||||||
MatrixInfo CX;
|
MatrixInfo CX;
|
||||||
CX.stackIndex = X.stackIndex;
|
CX.stackIndex = X.stackIndex;
|
||||||
|
@ -327,12 +328,12 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
||||||
{
|
{
|
||||||
// S3=A11-A21, T3=B22-B12, P7=S3*T3
|
// S3=A11-A21, T3=B22-B12, P7=S3*T3
|
||||||
auto f = [a11, a21, b22, b12, X, Y, eSub, lSub, hSub, numberThread, core, hP, this, bWidth, aHeight, bHeight](int tId) {
|
auto f = [a11, a21, b22, b12, X, Y, eSub, lSub, hSub, numberThread, core, hP, this, bWidth, aHeight, bHeight](int tId) {
|
||||||
auto xAddr = mStack[X.stackIndex] + X.offsetBytes;
|
auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
|
||||||
auto yAddr = mStack[Y.stackIndex] + Y.offsetBytes;
|
auto yAddr = mStack[Y.stackIndex].ptr() + Y.offsetBytes;
|
||||||
auto a11Ptr = mStack[a11.stackIndex] + a11.offsetBytes;
|
auto a11Ptr = mStack[a11.stackIndex].ptr() + a11.offsetBytes;
|
||||||
auto a21Ptr = mStack[a21.stackIndex] + a21.offsetBytes;
|
auto a21Ptr = mStack[a21.stackIndex].ptr() + a21.offsetBytes;
|
||||||
MNNMATRIX_SUB_MULTITHREAD(xAddr, a11Ptr, a21Ptr, eSub, X.lineStrideBytes, a11.lineStrideBytes, a21.lineStrideBytes, aHeight, core);
|
MNNMATRIX_SUB_MULTITHREAD(xAddr, a11Ptr, a21Ptr, eSub, X.lineStrideBytes, a11.lineStrideBytes, a21.lineStrideBytes, aHeight, core);
|
||||||
MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex] + b22.offsetBytes, mStack[b12.stackIndex] + b12.offsetBytes, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, b12.lineStrideBytes, bHeight, core);
|
MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex].ptr() + b22.offsetBytes, mStack[b12.stackIndex].ptr() + b12.offsetBytes, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, b12.lineStrideBytes, bHeight, core);
|
||||||
};
|
};
|
||||||
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
||||||
auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c21, Empty, currentDepth, {});
|
auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c21, Empty, currentDepth, {});
|
||||||
|
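This and the following _generateMatMul hunks walk the Strassen-style recursion step by step. For reference, the sub-block identities spelled out by the step comments in these hunks (A and B split into 2x2 blocks A_{11}..A_{22}, B_{11}..B_{22}), in LaTeX:

    S_1 = A_{21} + A_{22}, \quad S_2 = S_1 - A_{11}, \quad S_3 = A_{11} - A_{21}, \quad S_4 = A_{12} - S_2
    T_1 = B_{12} - B_{11}, \quad T_2 = B_{22} - T_1, \quad T_3 = B_{22} - B_{12}, \quad T_4 = T_2 - B_{21}
    P_1 = A_{11} B_{11}, \quad P_3 = S_4 B_{22}, \quad P_4 = A_{22} T_4, \quad P_5 = S_1 T_1, \quad P_6 = S_2 T_2, \quad P_7 = S_3 T_3, \quad U_5 = U_4 + P_3

Only the steps visible in these hunks are listed; the remaining products and recombinations live in parts of the file not shown here.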
@ -343,8 +344,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
||||||
{
|
{
|
||||||
// S1=A21+A22, T1=B12-B11, P5=S1T1
|
// S1=A21+A22, T1=B12-B11, P5=S1T1
|
||||||
auto f = [a22, a21, b11, b12, X, Y, eSub, lSub, hSub, numberThread, hP, core, this, bWidth, aHeight, bHeight](int tId) {
|
auto f = [a22, a21, b11, b12, X, Y, eSub, lSub, hSub, numberThread, hP, core, this, bWidth, aHeight, bHeight](int tId) {
|
||||||
MNNMATRIX_ADD_MULTITHREAD(mStack[X.stackIndex] + X.offsetBytes, mStack[a21.stackIndex] + a21.offsetBytes, mStack[a22.stackIndex] + a22.offsetBytes , eSub, X.lineStrideBytes, a21.lineStrideBytes, a22.lineStrideBytes, aHeight, core);
|
MNNMATRIX_ADD_MULTITHREAD(mStack[X.stackIndex].ptr() + X.offsetBytes, mStack[a21.stackIndex].ptr() + a21.offsetBytes, mStack[a22.stackIndex].ptr() + a22.offsetBytes , eSub, X.lineStrideBytes, a21.lineStrideBytes, a22.lineStrideBytes, aHeight, core);
|
||||||
MNNMATRIX_SUB_MULTITHREAD(mStack[Y.stackIndex] + Y.offsetBytes, mStack[b12.stackIndex] + b12.offsetBytes, mStack[b11.stackIndex] + b11.offsetBytes, bWidth, Y.lineStrideBytes, b12.lineStrideBytes, b11.lineStrideBytes, bHeight, core);
|
MNNMATRIX_SUB_MULTITHREAD(mStack[Y.stackIndex].ptr() + Y.offsetBytes, mStack[b12.stackIndex].ptr() + b12.offsetBytes, mStack[b11.stackIndex].ptr() + b11.offsetBytes, bWidth, Y.lineStrideBytes, b12.lineStrideBytes, b11.lineStrideBytes, bHeight, core);
|
||||||
};
|
};
|
||||||
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
||||||
auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c22, Empty, currentDepth, {});
|
auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c22, Empty, currentDepth, {});
|
||||||
|
@ -355,10 +356,10 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
||||||
{
|
{
|
||||||
// S2=S1-A11, T2=B22-T1, P6=S2T2
|
// S2=S1-A11, T2=B22-T1, P6=S2T2
|
||||||
auto f = [a11, b22, X, Y, eSub, lSub, hSub, numberThread, hP, core, this, bWidth, aHeight, bHeight](int tId) {
|
auto f = [a11, b22, X, Y, eSub, lSub, hSub, numberThread, hP, core, this, bWidth, aHeight, bHeight](int tId) {
|
||||||
auto xAddr = mStack[X.stackIndex] + X.offsetBytes;
|
auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
|
||||||
auto yAddr = mStack[Y.stackIndex] + Y.offsetBytes;
|
auto yAddr = mStack[Y.stackIndex].ptr() + Y.offsetBytes;
|
||||||
MNNMATRIX_SUB_MULTITHREAD(xAddr, xAddr, mStack[a11.stackIndex] + a11.offsetBytes, eSub, X.lineStrideBytes, X.lineStrideBytes, a11.lineStrideBytes, aHeight, core);
|
MNNMATRIX_SUB_MULTITHREAD(xAddr, xAddr, mStack[a11.stackIndex].ptr() + a11.offsetBytes, eSub, X.lineStrideBytes, X.lineStrideBytes, a11.lineStrideBytes, aHeight, core);
|
||||||
MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex] + b22.offsetBytes, yAddr, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, Y.lineStrideBytes, bHeight, core);
|
MNNMATRIX_SUB_MULTITHREAD(yAddr, mStack[b22.stackIndex].ptr() + b22.offsetBytes, yAddr, bWidth, Y.lineStrideBytes, b22.lineStrideBytes, Y.lineStrideBytes, bHeight, core);
|
||||||
};
|
};
|
||||||
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
||||||
auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c12, Empty, currentDepth, {});
|
auto code = _generateMatMul(eSub, lSub, hSub, X, Y, c12, Empty, currentDepth, {});
|
||||||
|
@ -369,8 +370,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
||||||
{
|
{
|
||||||
// S4=A12-S2, P3=S4*B22, P1=A11*B11
|
// S4=A12-S2, P3=S4*B22, P1=A11*B11
|
||||||
auto f = [a12, X, eSub, aHeight, numberThread, core, this](int tId) {
|
auto f = [a12, X, eSub, aHeight, numberThread, core, this](int tId) {
|
||||||
auto xAddr = mStack[X.stackIndex] + X.offsetBytes;
|
auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
|
||||||
MNNMATRIX_SUB_MULTITHREAD(xAddr, mStack[a12.stackIndex] + a12.offsetBytes, xAddr, eSub, X.lineStrideBytes, a12.lineStrideBytes, X.lineStrideBytes, aHeight, core);
|
MNNMATRIX_SUB_MULTITHREAD(xAddr, mStack[a12.stackIndex].ptr() + a12.offsetBytes, xAddr, eSub, X.lineStrideBytes, a12.lineStrideBytes, X.lineStrideBytes, aHeight, core);
|
||||||
};
|
};
|
||||||
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
||||||
auto code = _generateMatMul(eSub, lSub, hSub, X, b22, c11, Empty, currentDepth, {});
|
auto code = _generateMatMul(eSub, lSub, hSub, X, b22, c11, Empty, currentDepth, {});
|
||||||
|
@ -387,10 +388,10 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
||||||
// U5=U4+P3, T4=T2-B21, P4=A22*T4
|
// U5=U4+P3, T4=T2-B21, P4=A22*T4
|
||||||
auto f = [c11, c12, c21, c22, b21, X, Y, eSub, bWidth, cHeight, bHeight, numberThread, core, this](int tId) {
|
auto f = [c11, c12, c21, c22, b21, X, Y, eSub, bWidth, cHeight, bHeight, numberThread, core, this](int tId) {
|
||||||
for (int y = tId; y < cHeight; y+=numberThread) {
|
for (int y = tId; y < cHeight; y+=numberThread) {
|
||||||
core->MNNStrassenMergeCFunction((float*)(mStack[c11.stackIndex] + c11.offsetBytes + y * c11.lineStrideBytes), (float*)(mStack[c12.stackIndex] + c12.offsetBytes + y * c12.lineStrideBytes), (float*)(mStack[c21.stackIndex] + c21.offsetBytes + y * c21.lineStrideBytes), (float*)(mStack[c22.stackIndex] + c22.offsetBytes + y * c22.lineStrideBytes), (float*)(mStack[X.stackIndex] + X.offsetBytes + y * X.lineStrideBytes), 0, eSub, 1);
|
core->MNNStrassenMergeCFunction((float*)(mStack[c11.stackIndex].ptr() + c11.offsetBytes + y * c11.lineStrideBytes), (float*)(mStack[c12.stackIndex].ptr() + c12.offsetBytes + y * c12.lineStrideBytes), (float*)(mStack[c21.stackIndex].ptr() + c21.offsetBytes + y * c21.lineStrideBytes), (float*)(mStack[c22.stackIndex].ptr() + c22.offsetBytes + y * c22.lineStrideBytes), (float*)(mStack[X.stackIndex].ptr() + X.offsetBytes + y * X.lineStrideBytes), 0, eSub, 1);
|
||||||
}
|
}
|
||||||
auto yAddr = mStack[Y.stackIndex] + Y.offsetBytes;
|
auto yAddr = mStack[Y.stackIndex].ptr() + Y.offsetBytes;
|
||||||
MNNMATRIX_SUB_MULTITHREAD(yAddr, yAddr, mStack[b21.stackIndex] + b21.offsetBytes, bWidth, Y.lineStrideBytes, Y.lineStrideBytes, b21.lineStrideBytes, bHeight, core);
|
MNNMATRIX_SUB_MULTITHREAD(yAddr, yAddr, mStack[b21.stackIndex].ptr() + b21.offsetBytes, bWidth, Y.lineStrideBytes, Y.lineStrideBytes, b21.lineStrideBytes, bHeight, core);
|
||||||
};
|
};
|
||||||
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
mFunctions.emplace_back(std::make_pair(f, numberThread));
|
||||||
auto code = _generateMatMul(eSub, lSub, hSub, a22, Y, c11, Empty, currentDepth, {});
|
auto code = _generateMatMul(eSub, lSub, hSub, a22, Y, c11, Empty, currentDepth, {});
|
||||||
|
@ -402,8 +403,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
||||||
// U6=U3-P4, P2=A12*B21, U1=P1+P2
|
// U6=U3-P4, P2=A12*B21, U1=P1+P2
|
||||||
auto f0 = [c11, c21, eSub, cHeight, numberThread, core, this](int tId) {
|
auto f0 = [c11, c21, eSub, cHeight, numberThread, core, this](int tId) {
|
||||||
auto cw = eSub;
|
auto cw = eSub;
|
||||||
auto c21Addr = mStack[c21.stackIndex] + c21.offsetBytes;
|
auto c21Addr = mStack[c21.stackIndex].ptr() + c21.offsetBytes;
|
||||||
MNNMATRIX_SUB_MULTITHREAD(c21Addr, c21Addr, mStack[c11.stackIndex] + c11.offsetBytes, cw, c21.lineStrideBytes, c21.lineStrideBytes, c11.lineStrideBytes, cHeight, core);
|
MNNMATRIX_SUB_MULTITHREAD(c21Addr, c21Addr, mStack[c11.stackIndex].ptr() + c11.offsetBytes, cw, c21.lineStrideBytes, c21.lineStrideBytes, c11.lineStrideBytes, cHeight, core);
|
||||||
};
|
};
|
||||||
mFunctions.emplace_back(std::make_pair(f0, numberThread));
|
mFunctions.emplace_back(std::make_pair(f0, numberThread));
|
||||||
auto code = _generateMatMul(eSub, lSub, hSub, a12, b21, c11, Empty, currentDepth, {});
|
auto code = _generateMatMul(eSub, lSub, hSub, a12, b21, c11, Empty, currentDepth, {});
|
||||||
|
@ -412,18 +413,18 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
||||||
}
|
}
|
||||||
auto f1 = [c11, X, eSub, cHeight, numberThread, core, this](int tId) {
|
auto f1 = [c11, X, eSub, cHeight, numberThread, core, this](int tId) {
|
||||||
auto cw = eSub;
|
auto cw = eSub;
|
||||||
auto c11Ptr = mStack[c11.stackIndex] + c11.offsetBytes;
|
auto c11Ptr = mStack[c11.stackIndex].ptr() + c11.offsetBytes;
|
||||||
auto xAddr = mStack[X.stackIndex] + X.offsetBytes;
|
auto xAddr = mStack[X.stackIndex].ptr() + X.offsetBytes;
|
||||||
MNNMATRIX_ADD_MULTITHREAD(c11Ptr, c11Ptr, xAddr, cw, c11.lineStrideBytes, c11.lineStrideBytes, X.lineStrideBytes, cHeight, core);
|
MNNMATRIX_ADD_MULTITHREAD(c11Ptr, c11Ptr, xAddr, cw, c11.lineStrideBytes, c11.lineStrideBytes, X.lineStrideBytes, cHeight, core);
|
||||||
};
|
};
|
||||||
mFunctions.emplace_back(std::make_pair(f1, numberThread));
|
mFunctions.emplace_back(std::make_pair(f1, numberThread));
|
||||||
if (!postParameters.empty() && COT.stackIndex >= 0) {
|
if (!postParameters.empty() && COT.stackIndex >= 0) {
|
||||||
if (1 == numberThread) {
|
if (1 == numberThread) {
|
||||||
auto postFunction = [c11, COT, eSub, cHeight, numberThread, postParameters, core, this](int tId) {
|
auto postFunction = [c11, COT, eSub, cHeight, numberThread, postParameters, core, this](int tId) {
|
||||||
auto biasPtr = (const float*)(mStack[COT.stackIndex] + COT.offsetBytes);
|
auto biasPtr = (const float*)(mStack[COT.stackIndex].ptr() + COT.offsetBytes);
|
||||||
auto width = eSub * 2;
|
auto width = eSub * 2;
|
||||||
auto height = cHeight * 2;
|
auto height = cHeight * 2;
|
||||||
auto c11Ptr = mStack[c11.stackIndex] + c11.offsetBytes;
|
auto c11Ptr = mStack[c11.stackIndex].ptr() + c11.offsetBytes;
|
||||||
core->MNNAxByClampBroadcastUnit((float*)c11Ptr, (float*)c11Ptr, biasPtr, width, c11.lineStrideBytes / core->bytes, c11.lineStrideBytes / core->bytes, height, postParameters.data());
|
core->MNNAxByClampBroadcastUnit((float*)c11Ptr, (float*)c11Ptr, biasPtr, width, c11.lineStrideBytes / core->bytes, c11.lineStrideBytes / core->bytes, height, postParameters.data());
|
||||||
};
|
};
|
||||||
mFunctions.emplace_back(std::make_pair(postFunction, numberThread));
|
mFunctions.emplace_back(std::make_pair(postFunction, numberThread));
|
||||||
|
@ -431,8 +432,8 @@ ErrorCode StrassenMatrixComputor::_generateMatMul(int e, int l, int h, const Mat
|
||||||
auto postFunction = [c11, COT, eSub, cHeight, numberThread, postParameters, core, this](int tId) {
|
auto postFunction = [c11, COT, eSub, cHeight, numberThread, postParameters, core, this](int tId) {
|
||||||
auto width = eSub * 2;
|
auto width = eSub * 2;
|
||||||
auto height = cHeight * 2;
|
auto height = cHeight * 2;
|
||||||
auto c11Ptr = mStack[c11.stackIndex] + c11.offsetBytes;
|
auto c11Ptr = mStack[c11.stackIndex].ptr() + c11.offsetBytes;
|
||||||
auto biasPtr = mStack[COT.stackIndex] + COT.offsetBytes;
|
auto biasPtr = mStack[COT.stackIndex].ptr() + COT.offsetBytes;
|
||||||
for (int y = tId; y < height; y+=numberThread) {
|
for (int y = tId; y < height; y+=numberThread) {
|
||||||
core->MNNAxByClampBroadcastUnit((float*)(c11Ptr + y * c11.lineStrideBytes), (float*)(c11Ptr + y * c11.lineStrideBytes), (const float*)(biasPtr + y * core->bytes * core->pack), width, 0, 0, 1, postParameters.data());
|
core->MNNAxByClampBroadcastUnit((float*)(c11Ptr + y * c11.lineStrideBytes), (float*)(c11Ptr + y * c11.lineStrideBytes), (const float*)(biasPtr + y * core->bytes * core->pack), width, 0, 0, 1, postParameters.data());
|
||||||
}
|
}
|
||||||
|
@ -496,25 +497,25 @@ ErrorCode StrassenMatrixComputor::onEncode(const std::vector<Tensor*>& inputs, c
|
||||||
core->MNNGetMatMulPackMode(&eP, &lP, &hP);
|
core->MNNGetMatMulPackMode(&eP, &lP, &hP);
|
||||||
int bs = UP_DIV(l, lP) * lP * hP;
|
int bs = UP_DIV(l, lP) * lP * hP;
|
||||||
int cs = C->stride(0);
|
int cs = C->stride(0);
|
||||||
uint8_t* bias = nullptr;
|
MemChunk bias;
|
||||||
bool useBias = false;
|
bool useBias = false;
|
||||||
if (inputs.size() > 2) {
|
if (inputs.size() > 2) {
|
||||||
bias = inputs[2]->host<uint8_t>();
|
bias = TensorUtils::getDescribe(inputs[2])->mem->chunk();
|
||||||
useBias = true;
|
useBias = true;
|
||||||
}
|
}
|
||||||
return onEncode(e, l, h, as, bs, cs, A->host<uint8_t>(), B->host<uint8_t>(), C->host<uint8_t>(), useBias, bias, postParameters);
|
return onEncode(e, l, h, as, bs, cs, TensorUtils::getDescribe(A)->mem->chunk(), TensorUtils::getDescribe(B)->mem->chunk(), TensorUtils::getDescribe(C)->mem->chunk(), useBias, bias, postParameters);
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorCode StrassenMatrixComputor::onEncode(int e, int l, int h, int as, int bs, int cs, const uint8_t* AT, const uint8_t* BT, uint8_t* CT, bool useBias, const uint8_t* Bias, const std::vector<float>& postParameters) {
|
ErrorCode StrassenMatrixComputor::onEncode(int e, int l, int h, int as, int bs, int cs, const MemChunk AT, const MemChunk BT, MemChunk CT, bool useBias, const MemChunk Bias, const std::vector<float>& postParameters) {
|
||||||
auto core = static_cast<CPUBackend*>(backend())->functions();
|
auto core = static_cast<CPUBackend*>(backend())->functions();
|
||||||
MatrixInfo a,b,c,bias;
|
MatrixInfo a,b,c,bias;
|
||||||
bias.stackIndex = -1;
|
bias.stackIndex = -1;
|
||||||
mFunctions.clear();
|
mFunctions.clear();
|
||||||
mStack = {(uint8_t*)AT, (uint8_t*)BT, CT};
|
mStack = {AT, BT, CT};
|
||||||
if (useBias) {
|
if (useBias) {
|
||||||
bias.stackIndex = 3;
|
bias.stackIndex = 3;
|
||||||
bias.offsetBytes = 0;
|
bias.offsetBytes = 0;
|
||||||
mStack.emplace_back((uint8_t*)Bias);
|
mStack.emplace_back(Bias);
|
||||||
}
|
}
|
||||||
a.stackIndex = 0;
|
a.stackIndex = 0;
|
||||||
a.lineStrideBytes = as * core->bytes;
|
a.lineStrideBytes = as * core->bytes;
|
||||||
|
|
|
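Throughout this hunk the Strassen encoder stops passing raw uint8_t* bases around and instead keeps MemChunk handles in mStack, resolving an address with .ptr() only inside the deferred lambdas. A minimal sketch of what such a chunk type needs to provide, inferred from the .first/.second/.ptr() usage in these hunks; the real MemChunk lives in core/BufferAllocator.hpp and is richer than this:

// Hypothetical, simplified stand-in for MNN's MemChunk, for illustration only.
#include <cstddef>
#include <cstdint>

struct MemChunkSketch {
    void*  first  = nullptr;  // base address handed out by the allocator
    size_t second = 0;        // byte offset inside that allocation

    MemChunkSketch() = default;
    MemChunkSketch(void* base, size_t offset) : first(base), second(offset) {}

    // ptr() folds the offset in, so callers write chunk.ptr() + extraBytes
    // instead of (uint8_t*)chunk.first + chunk.second + extraBytes.
    uint8_t* ptr() const {
        return static_cast<uint8_t*>(first) + second;
    }
};

Keeping the chunk rather than a resolved pointer in mStack is presumably what allows an allocator to fix the final base address after encoding; the deferral mechanics themselves are not part of this diff.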
@ -10,6 +10,7 @@
#define StrassenMatmulComputor_hpp

#include <functional>
#include "core/BufferAllocator.hpp"
#include "core/Backend.hpp"
namespace MNN {
/**
@ -53,8 +54,9 @@ public:
*/
ErrorCode onEncode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const std::vector<float>& postParameters = {}, int l = 0, int h = 0);

ErrorCode onEncode(int e, int l, int h, int as, int bs, int cs, const uint8_t* AT, const uint8_t* BT, uint8_t* CT, bool useBias, const uint8_t* Bias = nullptr, const std::vector<float>& postParameters = {});
ErrorCode onEncode(int e, int l, int h, int as, int bs, int cs, const MemChunk AT, const MemChunk BT, MemChunk CT, bool useBias, const MemChunk Bias = MemChunk(), const std::vector<float>& postParameters = {});
// ErrorCode onEncode(int e, int l, int h, int as, int bs, int cs, const uint8_t* AT, const uint8_t* BT, uint8_t* CT, bool useBias, const uint8_t* Bias = nullptr, const std::vector<float>& postParameters = {});

void onExecute(const uint8_t* AT = nullptr, const uint8_t* BT = nullptr, const uint8_t* COT = nullptr, uint8_t* CT = nullptr);

void onReset();
@ -79,7 +81,7 @@ private:

Backend* mBackend;

std::vector<uint8_t*> mStack;
std::vector<MemChunk> mStack;
};
} // namespace MNN

@ -124,6 +124,7 @@ void MNNInt8FunctionInit() {
auto core = MNN::MNNGetInt8CoreFunctions();
core->MNNAvgPoolInt8 = MNNAvgPoolUint8;
core->MNNMaxPoolInt8 = MNNMaxPoolInt8_;
core->MNNNormInt8 = _SSE_MNNNormInt8;
if (cpuFlags & libyuv::kCpuHasSSE41) {
core->MNNFloat2Int8 = _SSE_MNNFloat2Int8;
core->MNNInt8ScaleToFloat = _SSE_MNNInt8ScaleToFloat;

@ -75,6 +75,7 @@ void _AVX_WinogradInit(void* functions);

void _AVX_MNNGelu(float *dst, const float *src, size_t size, float* parameters);
void _AVX_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size);
void _AVX_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);

void _AVX_MNNGetSparseMatMulPackMode(int* eP, int *lP, int* hP);
void _AVX_MNNPackedSparseMatMulEpx1EFMA(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, unsigned int* NNZMap, int* dataOffsetMap);

@ -754,4 +754,7 @@ void _AVX_MNNInt8FunctionInit(void* functions) {

// conv depthwise
gAVX2CoreInt8Functions->ConvDepthwiseLineInt8 = _AVX_MNNLineDepthWiseInt8AddBiasScaleUnit;

// Norm
gAVX2CoreInt8Functions->MNNNormInt8 = _AVX_MNNNormInt8;
}

@ -202,7 +202,7 @@ void _AVX_MNNSoftmax(float* dest, const float* source, size_t size) {

void _AVX_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size) {
float tmpfloat8[8];
int count = size / 8;
int count = static_cast<int32_t>(size / 8);
int remain = count * 8;
// step 1: get sum
float sum = 0.f;
@ -263,4 +263,79 @@ void _AVX_MNNNorm(float *dst, const float *src, const float *gamma, const float
dst[i] = (src[i] - mean) * variable;
}
}
}

void _AVX_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params) {
float tmpfloat8[8];
int count = static_cast<int32_t>(size / 8);
int remain = count * 8;
std::vector<float> inpf(size);
std::vector<float> outf(size);
std::vector<float> inpScale(4, params->inputScale[0]);
std::vector<float> outScale(4, params->outputScale[0]);
float* srcf = inpf.data();
float* dstf = outf.data();
// step 0: Int8 -> Float
_AVX_MNNInt8ScaleToFloat(inpf.data(), src, inpScale.data(), size / 4, params->inputZeroPoint[0]);
// step 1: get sum
float sum = 0.f;
if (count > 0) {
auto sumVal = _mm256_set1_ps(0.f);
for (int i = 0; i < count; i++) {
sumVal = _mm256_add_ps(sumVal, _mm256_loadu_ps(srcf + i * 8));
}
_mm256_storeu_ps(tmpfloat8, sumVal);
for (int i = 0; i < 8; i++) {
sum += tmpfloat8[i];
}
}
for (int i = remain; i < size; i++) {
sum += srcf[i];
}
// step 2: get square_sum
float mean = sum / size;
float square_sum = 0.f;
auto meanVal = _mm256_set1_ps(mean);
if (count > 0) {
auto sumVal = _mm256_set1_ps(0.f);
for (int i = 0; i < count; i++) {
auto x = _mm256_sub_ps(_mm256_loadu_ps(srcf + i * 8), meanVal);
sumVal = _mm256_add_ps(sumVal, _mm256_mul_ps(x, x));
}
_mm256_storeu_ps(tmpfloat8, sumVal);
for (int i = 0; i < 8; i++) {
square_sum += tmpfloat8[i];
}
}
for (int i = remain; i < size; i++) {
float x = (srcf[i] - mean);
square_sum += x * x;
}
// step 3: get result
float variable = square_sum / size;
variable = 1.f / sqrt(variable + epsilon);
auto variableVal = _mm256_set1_ps(variable);
if (gamma && beta) {
for (int i = 0; i < count; i++) {
auto x = _mm256_sub_ps(_mm256_loadu_ps(srcf + i * 8), meanVal);
auto g = _mm256_loadu_ps(gamma + i * 8);
auto b = _mm256_loadu_ps(beta + i * 8);
auto y = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(x, g), variableVal), b);
_mm256_storeu_ps(dstf + i * 8, y);
}
for (int i = remain; i < size; i++) {
dstf[i] = (srcf[i] - mean) * gamma[i] * variable + beta[i] ;
}
} else {
for (int i = 0; i < count; i++) {
auto x = _mm256_sub_ps(_mm256_loadu_ps(srcf + i * 8), meanVal);
auto y = _mm256_mul_ps(x, variableVal);
_mm256_storeu_ps(dstf + i * 8, y);
}
for (int i = remain; i < size; i++) {
dstf[i] = (srcf[i] - mean) * variable;
}
}
// step 4: Float -> Int8
_AVX_MNNFloat2Int8(dstf, dst, size / 4, outScale.data(), params->minValue, params->maxValue, params->outputZeroPoint[0]);
}
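The added _AVX_MNNNormInt8 is a four-step pipeline: dequantize the int8 input, compute mean and variance, normalize (optionally with gamma/beta), then requantize with clamping. A scalar reference of the same flow can serve as a mental model or test oracle; it is illustrative only, and the exact scale/zero-point conventions of MNN's quantization helpers are assumed rather than taken from this diff:

// Scalar reference for the int8 layer-norm flow above (illustrative only).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

void NormInt8Reference(int8_t* dst, const int8_t* src, const float* gamma, const float* beta,
                       float epsilon, size_t size,
                       float inputScale, float outputScale,
                       int inputZeroPoint, int outputZeroPoint,
                       float minValue, float maxValue) {
    std::vector<float> x(size);
    // step 0: Int8 -> Float (asymmetric dequantization is an assumption here)
    for (size_t i = 0; i < size; ++i) {
        x[i] = (static_cast<float>(src[i]) - inputZeroPoint) * inputScale;
    }
    // step 1/2: mean and variance over the whole vector
    float mean = 0.f, var = 0.f;
    for (float v : x) mean += v;
    mean /= size;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= size;
    float inv = 1.f / std::sqrt(var + epsilon);
    // step 3: normalize, optionally scale and shift
    for (size_t i = 0; i < size; ++i) {
        float y = (x[i] - mean) * inv;
        if (gamma && beta) {
            y = y * gamma[i] + beta[i];
        }
        // step 4: Float -> Int8 with clamping; treating outputScale as the quant step is an assumption
        float q = std::round(y / outputScale) + outputZeroPoint;
        q = std::min(std::max(q, minValue), maxValue);
        dst[i] = static_cast<int8_t>(q);
    }
}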
@ -79,6 +79,7 @@ void _SSE_MNNSoftmax(float* dest, const float* source, size_t size);
void _SSE_ExtraInit(void* functions);
void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size);
void _SSE_ImageProcessInit(void* functions, int cpuFlags);
void _SSE_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params);

/* Image process functions */
void _SSE_MNNRGBAToBGRA(const unsigned char* source, unsigned char* dest, size_t count);

@ -58,7 +58,7 @@ void _SSE_MNNExpC8(float* dest, const float* source, const float* offset, const

void _SSE_MNNSoftmax(float* dest, const float* source, size_t size) {
float tmpfloat4[4];
int count = size / 4;
int count = static_cast<int32_t>(size / 4);
int remain = count * 4;
// step 1: get maxValue
float maxValue = source[0];
@ -212,7 +212,7 @@ void _SSE_MNNHardSwish(float* dst, const float* src, size_t size) {

void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size) {
float tmpfloat4[4];
int count = size / 4;
int count = static_cast<int32_t>(size / 4);
int remain = count * 4;
// step 1: get sum
float sum = 0.f;
@ -270,3 +270,74 @@ void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float
}
}
}

void _SSE_MNNNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta, float epsilon, size_t size, QuanPrePostParameters* params) {
float tmpfloat4[4];
int count = static_cast<int32_t>(size / 4);
int remain = count * 4;
float sum = 0.f;
std::vector<float> inpf(size);
std::vector<float> outf(size);
std::vector<float> inpScale(4, params->inputScale[0]);
std::vector<float> outScale(4, params->outputScale[0]);
float* srcf = inpf.data();
float* dstf = outf.data();
// step 0: Int8 -> Float
_SSE_MNNInt8ScaleToFloat(inpf.data(), src, inpScale.data(), size / 4, params->inputZeroPoint[0]);
// step 1: get sum
if (count > 0) {
auto sumVal = _mm_set1_ps(0.f);
for (int i = 0; i < count; i++) {
sumVal = _mm_add_ps(sumVal, _mm_loadu_ps(srcf + i * 4));
}
_mm_storeu_ps(tmpfloat4, sumVal);
sum += (tmpfloat4[0] + tmpfloat4[1] + tmpfloat4[2] + tmpfloat4[3]);
}
for (int i = remain; i < size; i++) {
sum += srcf[i];
}
// step 2: get square_sum
float mean = sum / size;
float square_sum = 0.f;
auto meanVal = _mm_set1_ps(mean);
if (count > 0) {
auto sumVal = _mm_set1_ps(0.f);
for (int i = 0; i < count; i++) {
auto x = _mm_sub_ps(_mm_loadu_ps(srcf + i * 4), meanVal);
sumVal = _mm_add_ps(sumVal, _mm_mul_ps(x, x));
}
_mm_storeu_ps(tmpfloat4, sumVal);
square_sum += (tmpfloat4[0] + tmpfloat4[1] + tmpfloat4[2] + tmpfloat4[3]);
}
for (int i = remain; i < size; i++) {
float x = (srcf[i] - mean);
square_sum += x * x;
}
// step 3: get result
float variable = square_sum / size;
variable = 1.f / sqrt(variable + epsilon);
auto variableVal = _mm_set1_ps(variable);
if (gamma && beta) {
for (int i = 0; i < count; i++) {
auto x = _mm_sub_ps(_mm_loadu_ps(srcf + i * 4), meanVal);
auto g = _mm_loadu_ps(gamma + i * 4);
auto b = _mm_loadu_ps(beta + i * 4);
auto y = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(x, g), variableVal), b);
_mm_storeu_ps(dstf + i * 4, y);
}
for (int i = remain; i < size; i++) {
dstf[i] = (src[i] - mean) * gamma[i] * variable + beta[i] ;
}
} else {
for (int i = 0; i < count; i++) {
auto x = _mm_sub_ps(_mm_loadu_ps(srcf + i * 4), meanVal);
auto y = _mm_mul_ps(x, variableVal);
_mm_storeu_ps(dstf + i * 4, y);
}
for (int i = remain; i < size; i++) {
dstf[i] = (srcf[i] - mean) * variable;
}
}
// step 4: Float -> Int8
_SSE_MNNFloat2Int8(dstf, dst, size / 4, outScale.data(), params->minValue, params->maxValue, params->outputZeroPoint[0]);
}
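Both the SSE and AVX2 variants are published through the int8 core-function table rather than called by name, as the registration lines in the earlier hunks show. A hedged usage sketch; the wrapper function, its epsilon value and the pre-filled params are illustrative, only MNNGetInt8CoreFunctions, MNNNormInt8 and QuanPrePostParameters come from the diff:

// Illustrative only: reaching the routine through the registered function table.
#include <cstddef>
#include <cstdint>

void runLayerNormInt8(int8_t* dst, const int8_t* src, const float* gamma, const float* beta,
                      size_t size, QuanPrePostParameters* params) {
    // Table filled in by MNNInt8FunctionInit / _AVX_MNNInt8FunctionInit above.
    auto core = MNN::MNNGetInt8CoreFunctions();
    core->MNNNormInt8(dst, src, gamma, beta, 1e-6f /* epsilon chosen by the caller */, size, params);
}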
@ -37,10 +37,10 @@ public:
// Do nothing
}
virtual ~ CUDARuntimeAllocator() = default;
virtual std::pair<void*, size_t> onAlloc(size_t size, size_t align) override {
virtual MemChunk onAlloc(size_t size, size_t align) override {
return std::make_pair(mRuntime->alloc(size), 0);
return MemChunk(mRuntime->alloc(size), 0);
}
virtual void onRelease(std::pair<void*, size_t> ptr) override {
virtual void onRelease(MemChunk ptr) override {
mRuntime->free(ptr.first);
}
private:
@ -58,7 +58,7 @@ CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, B
return;
}
std::shared_ptr<BufferAllocator::Allocator> allocator(new CUDARuntimeAllocator(mCUDARuntime.get()));
mBufferPool.reset(new BufferAllocator(allocator));
mBufferPool.reset(new EagerBufferAllocator(allocator));
}
mDefaultPrecision = precision;
}
@ -103,7 +103,7 @@ CUDABackend::CUDABackend(std::shared_ptr<BufferAllocator> st,
#ifdef LOG_VERBOSE
MNN_PRINT("cuda backend create\n");
#endif
mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(st.get())));
mBufferPool.reset(new EagerBufferAllocator(BufferAllocator::Allocator::createRecurse(st.get())));
mStaticBufferPool = st;
mCUDARuntime = rt;
mUseFp16AsFp32 = (precision == 2);
@ -139,16 +139,19 @@ int CUDABackend::getPrecision() const {

class CUDAMemObj : public Backend::MemObj {
public:
CUDAMemObj(BufferAllocator* allocator, std::pair<void*, int> points) {
CUDAMemObj(BufferAllocator* allocator, MemChunk points) {
mPoint = std::move(points);
mAllocator = allocator;
}
virtual ~ CUDAMemObj() {
mAllocator->free(mPoint);
}
MemChunk chunk() override {
return mPoint;
}
private:
BufferAllocator* mAllocator;
std::pair<void*, int> mPoint;
MemChunk mPoint;
};
int CUDABackend::getBytes(const Tensor* tensor) const {
auto bytes = tensor->getType().bytes();
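With this change the CUDA backend traffics in MemChunk end to end: the Allocator subclass returns chunks, and CUDAMemObj stores one and exposes it via chunk(). For orientation, a hedged sketch of another backend allocator written against the same interface; the class name and the bare cudaMalloc/cudaFree calls without error handling are illustrative, not MNN code:

// Illustrative only: a device allocator written against the MemChunk-based
// BufferAllocator::Allocator interface shown above.
class DummyDeviceAllocator : public BufferAllocator::Allocator {
public:
    virtual MemChunk onAlloc(size_t size, size_t align) override {
        void* devicePtr = nullptr;
        cudaMalloc(&devicePtr, size);      // real code should check the returned cudaError_t
        return MemChunk(devicePtr, 0);     // offset 0: the chunk covers the whole allocation
    }
    virtual void onRelease(MemChunk chunk) override {
        cudaFree(chunk.first);             // the base pointer travels in .first, as in the diff
    }
};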
@ -176,7 +179,7 @@ Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType
auto bytes = getBytes(nativeTensor);
size_t mallocSize = realSize(nativeTensor) * bytes;

std::pair<void*, int> buffer;
MemChunk buffer;
if (storageType == DYNAMIC_SEPERATE) {
buffer = mBufferPool->alloc(mallocSize, true);
allocator = mBufferPool.get();
@ -191,7 +194,7 @@ Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType
if(nullptr == buffer.first) {
return nullptr;
};
auto host = (uint8_t*)buffer.first + buffer.second;
auto host = buffer.ptr();
((Tensor*)nativeTensor)->buffer().device = (uint64_t)host;
auto des = TensorUtils::getDescribe(nativeTensor);
des->extra.offset = buffer.second;
@ -380,7 +383,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
auto dstDevice = (dstTensor->deviceId() != 0 && dstTensor->deviceId() != 1);
MNN_ASSERT(srcDevice || dstDevice);
uint8_t* srcPtr = nullptr;
std::pair<void*, int> tempSrcStorage;
MemChunk tempSrcStorage;
auto bytes = getBytes(srcTensor);
auto type = srcTensor->getType();

@ -434,18 +437,18 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
if (!srcDevice) {
auto cpuSize = srcTensor->size();
tempSrcStorage = mStaticBufferPool->alloc(cpuSize);
srcPtr = (uint8_t*)tempSrcStorage.first + tempSrcStorage.second;
srcPtr = tempSrcStorage.ptr();
mCUDARuntime->memcpy(srcPtr, srcTensor->host<void>(), cpuSize, MNNMemcpyHostToDevice,
true);
} else {
srcPtr = (uint8_t*)srcTensor->deviceId();
}
uint8_t* dstPtr = nullptr;
std::pair<void*, int> tempDstStorage;
MemChunk tempDstStorage;
if (!dstDevice) {
auto cpuSize = dstTensor->size();
tempDstStorage = mStaticBufferPool->alloc(cpuSize);
dstPtr = (uint8_t*)tempDstStorage.first + tempDstStorage.second;
dstPtr = tempDstStorage.ptr();
} else {
dstPtr = (uint8_t*)dstTensor->deviceId();
}
@ -462,7 +465,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
// MNN_PRINT("oncopybuffer dateType:%d->%d format:%d->%d\n", getDataType(srcTensor), getDataType(dstTensor), srcDimensionFormat, dstDimensionFormat);

std::unique_ptr<Tensor> wrapTensor;
std::pair<void*, int> wrapSrcStorage;
MemChunk wrapSrcStorage;
if (getDataType(srcTensor) != getDataType(dstTensor)) {
auto dimType = Tensor::CAFFE;
switch (TensorUtils::getDescribe(srcTensor)->dimensionFormat) {
@ -486,7 +489,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
wrapTensor.reset(Tensor::createDevice(srcTensor->shape(), dstTensor->getType(), dimType));
wrapSrcStorage = mStaticBufferPool->alloc(realSize(wrapTensor.get()) * getBytes(dstTensor));
// MNN_PRINT("warp:%d %d %d %d\n", realSize(wrapTensor.get()), getBytes(dstTensor), dstTensor->getType(), srcTensor->getDimensionType());
wrapTensor.get()->buffer().device = (uint64_t)((uint8_t*)wrapSrcStorage.first + wrapSrcStorage.second);
wrapTensor.get()->buffer().device = (uint64_t)(wrapSrcStorage.ptr());

auto dstType = getDataType(dstTensor);
if (dstType != DataType_DT_FLOAT) {

@ -41,7 +41,7 @@ public:
virtual float onGetMemoryInMB() override;

private:
std::shared_ptr<BufferAllocator> mBufferPool;
std::shared_ptr<EagerBufferAllocator> mBufferPool;
std::shared_ptr<CUDARuntime> mCUDARuntime;
bool mIsCreateError{false};
BackendConfig::PrecisionMode mDefaultPrecision;
@ -118,9 +118,9 @@ ErrorCode ArgMaxExecution::onResize(const std::vector<Tensor *> &inputs, const s
if(mSplitKernel) {
mSecondArgLen = (mDim + ARG_REDUCE_NUM - 1) / ARG_REDUCE_NUM;
auto buffer_data = pool->alloc(mOutside * mInside * mSecondArgLen * bytes);
mTempDataBuffer = (void*)((uint8_t*)buffer_data.first + buffer_data.second);
mTempDataBuffer = (void*)(buffer_data.ptr());
auto buffer_index = pool->alloc(mOutside * mInside * mSecondArgLen * sizeof(int32_t));
mTempIndexBuffer = (void*)((uint8_t*)buffer_index.first + buffer_index.second);
mTempIndexBuffer = (void*)(buffer_index.ptr());
pool->free(buffer_data);
pool->free(buffer_index);
}

@ -45,7 +45,7 @@ public:
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

protected:
std::pair<void*, int> mConstBuffer;
MemChunk mConstBuffer;
const Op *mOp;
int mTotalCount;
constBuffer parameters;

@ -155,7 +155,7 @@ ErrorCode DeconvSingleInputExecution::onResize(const std::vector<Tensor*> &input

// Alloc temp cuda memory
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
std::pair<void*, size_t> buffer_input, buffer_im2col;
MemChunk buffer_input, buffer_im2col;
if(mFp16Fp32MixInfer) {
buffer_input = pool->alloc(sizeof(__half) * mGemmInfo.elhPad[1] * mGemmInfo.elh[2]);
mInputBuffer = (void*)((uint8_t*)buffer_input.first + buffer_input.second);
@ -31,12 +31,23 @@ public:
// Do nothing
}
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
mMaxFuseBufferSize = 0;
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(outputs[0]);
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
if (1 == mLoop->commands()->size()) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
auto op = cmd->op();
if (OpType_MatMul == op->type() && mLoop->parallel() && mLoop->loopNumber() > 1) {
auto step = cmd->steps()->data();
if (inputs.size() <= 3) {
if (cmd->fuse() >= 0) {
// Make Temp output buffer
auto size = cmd->size()->data();
mMaxFuseBufferSize = bytes * size[0] * size[2];
auto buffer = pool->alloc(mMaxFuseBufferSize);
mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);
pool->free(buffer);
}
auto& unit = mExecutions[0];
int as = 1, bs = 1, cs = 1;
if (step[1] == 0) {
@ -77,11 +88,28 @@ public:
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
auto op = cmd->op();
if (OpType_UnaryOp == op->type() && nullptr == op->main()) {
if (cmd->fuse() >= 0) {
// Make Temp output buffer
auto size = cmd->size()->data();
mMaxFuseBufferSize = mLoop->loopNumber() * bytes * size[0] * size[1] * size[2];
auto buffer = pool->alloc(mMaxFuseBufferSize);
mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);
pool->free(buffer);
}
return NO_ERROR;
}
}
for (int i=0; i<mLoop->commands()->size(); ++i) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(i);
if (cmd->fuse() >= 0) {
// Make Temp output buffer
auto size = cmd->size()->data();
if (cmd->op()->type() == OpType_MatMul) {
mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[2]);
} else {
mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[1] * size[2]);
}
}
auto op = cmd->op();
auto& unit = mExecutions[i];
// Find indice and copy to cpu
@ -141,6 +169,11 @@ public:
continue;
}
}
if(mMaxFuseBufferSize > 0) {
auto buffer = pool->alloc(mMaxFuseBufferSize);
mFuseBuffer = (void*)((uint8_t*)buffer.first + buffer.second);
pool->free(buffer);
}
return NO_ERROR;
}

@ -161,9 +194,7 @@ public:
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
auto op = cmd->op();

if (OpType_UnaryOp == op->type() && nullptr == op->main() && cmd->fuse() < 0) {
if (OpType_UnaryOp == op->type() && nullptr == op->main()) {
Tensor::InsideDescribe::Region reg;
auto srcView = cmd->view()->GetAs<View>(1);
auto dstView = cmd->view()->GetAs<View>(0);
@ -187,14 +218,36 @@ public:
if (index1 >= 0) {
srcIndice = (int32_t*)originInputs[index1]->deviceId();
}
auto src = (uint8_t*)(input->deviceId()) + srcView->offset() * bytes;
auto dstOrigin = (output->deviceId()) + dstView->offset() * bytes;
auto dst = dstOrigin;
if(cmd->fuse() >= 0) {
dst = (uint64_t)mFuseBuffer;
}
BlitWithIndice(
(uint8_t*)(output->deviceId()) + dstView->offset() * bytes,
(uint8_t*)dst,
(uint8_t*)(input->deviceId()) + srcView->offset() * bytes,
(uint8_t*)src,
dstIndice, srcIndice, index0, index1,
loopNumber, step0, step1, input->elementSize(),
reg, bytes, runtime);

if(cmd->fuse() >= 0) {
auto opType = cmd->fuse();
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
auto srcStride0 = dstStride;
auto srcStride1 = dstStride;
int32_t tmpSize[3];
::memcpy(tmpSize, cmd->size()->data(), 3 * sizeof(int32_t));
tmpSize[0] *= loopNumber;
auto type = halide_type_of<float>();
if (static_cast<CUDABackend*>(backend())->useFp16()) {
type.bits = 16;
}
// MNN_PRINT("Binary Loop in optype:%d\n", opType);
BinaryBlit((uint8_t*)dstOrigin, (uint8_t*)dstOrigin, (const uint8_t*)dst,
tmpSize, srcStride0, srcStride1, dstStride, type, runtime, opType);
}
return NO_ERROR;
}
}
@ -220,12 +273,28 @@ public:
offset = offset * cmd->steps()->data()[v] + view->offset();
mStackPtr[tensorIndex] = tensor->deviceId() + offset * static_cast<CUDABackend*>(backend())->getBytes(tensor);
}

auto dstOrigin = mStackPtr[cmd->indexes()->data()[0]];
auto dst = dstOrigin;
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();

int fuseOutputStride[3];
if(cmd->fuse() >= 0) {
dst = (uint64_t)mFuseBuffer;

dstStride = fuseOutputStride;
auto cmdSize = cmd->size()->data();
fuseOutputStride[0] = cmdSize[1] * cmdSize[2];
fuseOutputStride[1] = cmdSize[2];
fuseOutputStride[2] = 1;
}

if (OpType_UnaryOp == op->type()) {

auto src = (float*)mStackPtr[cmd->indexes()->data()[1]];
auto dst = (float*)mStackPtr[cmd->indexes()->data()[0]];
int unaryType = op->main_as_UnaryOp()->opType();

auto srcStride = cmd->view()->GetAs<View>(1)->stride()->data();
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
UnaryBlit((uint8_t*)dst, (const uint8_t*)src, cmd->size()->data(), srcStride, dstStride, bytes, runtime, unaryType);
continue;
}
@ -234,13 +303,13 @@ public:
if (3 == size) {
unit.inputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[1]];
unit.inputs[1]->buffer().device = mStackPtr[cmd->indexes()->data()[2]];
unit.outputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[0]];
unit.outputs[0]->buffer().device = dst;
} else {
MNN_ASSERT(4 == size);
unit.inputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[1]];
unit.inputs[1]->buffer().device = mStackPtr[cmd->indexes()->data()[2]];
unit.inputs[2]->buffer().device = mStackPtr[cmd->indexes()->data()[3]];
unit.outputs[0]->buffer().device = mStackPtr[cmd->indexes()->data()[0]];
unit.outputs[0]->buffer().device = dst;
}
unit.exe->onExecute(unit.inputs, unit.outputs);
continue;
@ -252,16 +321,33 @@ public:
}
auto src0 = mStackPtr[cmd->indexes()->data()[1]];
auto src1 = mStackPtr[cmd->indexes()->data()[2]];
auto dst = mStackPtr[cmd->indexes()->data()[0]];
auto opType = op->main_as_BinaryOp()->opType();
auto srcStride0 = cmd->view()->GetAs<View>(1)->stride()->data();
auto srcStride1 = cmd->view()->GetAs<View>(2)->stride()->data();
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
// MNN_PRINT("Binary Loop in optype:%d\n", opType);
BinaryBlit((uint8_t*)dst, (const uint8_t*)src0, (const uint8_t*)src1,
cmd->size()->data(), srcStride0, srcStride1, dstStride, type, runtime, opType);

}

if(cmd->fuse() >= 0) {
auto opType = cmd->fuse();
auto dstOriginStride = cmd->view()->GetAs<View>(0)->stride()->data();
auto type = halide_type_of<float>();
if (static_cast<CUDABackend*>(backend())->useFp16()) {
type.bits = 16;
}
// MNN_PRINT("Binary Loop in optype:%d\n", opType);
int32_t cmdSize[3];
::memcpy(cmdSize, cmd->size()->data(), 3*sizeof(int32_t));
if(OpType_MatMul == op->type()) {
cmdSize[1] = 1;
dstStride = dstOriginStride;
}
BinaryBlit((uint8_t*)dstOrigin, (uint8_t*)dstOrigin, (const uint8_t*)dst,
cmdSize, dstOriginStride, dstStride, dstOriginStride, type, runtime, opType);
}
}
}
return NO_ERROR;
@ -274,6 +360,8 @@ private:
std::vector<uint64_t> mStackPtr;
std::map<Tensor*, Tensor*> mIndiceCopy;
bool mSingleMatMul = false;
int mMaxFuseBufferSize;
void* mFuseBuffer;
};

class LoopCreator : public CUDABackend::Creator {
@ -283,6 +371,13 @@ public:
if (op->main_type() != OpParameter_LoopParam) {
return nullptr;
}
auto mLoop = op->main_as_LoopParam();
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);

if(cmd->fuse() >= 0) {
// TODO: support afterwards
return nullptr;//
}
return new CUDALoop(backend, op->main_as_LoopParam());
}
};
@ -290,4 +385,4 @@ public:
static CUDACreatorRegister<LoopCreator> __init(OpType_While);

};
};
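The Loop changes above add an output-fusion path: when a region command carries a fuse op code, its raw result is first produced into the scratch mFuseBuffer sized in onResize, and then folded into the real destination with a BinaryBlit whose op is cmd->fuse(). A condensed, hedged sketch of that control flow, with stand-in callables instead of the real UnaryBlit/BlitWithIndice/BinaryBlit signatures:

// Illustrative outline of the fuse path; scratch, runCommand and applyBinary are
// stand-ins for mFuseBuffer and the real blit calls, not MNN API.
#include <cstdint>
#include <functional>

void runFusedCommand(bool hasFuse, int fuseOp, uint8_t* dstOrigin, uint8_t* scratch,
                     const std::function<void(uint8_t*)>& runCommand,
                     const std::function<void(uint8_t*, const uint8_t*, int)>& applyBinary) {
    uint8_t* dst = dstOrigin;
    if (hasFuse) {
        dst = scratch;                      // redirect the command's output to the temp buffer
    }
    runCommand(dst);                        // the loop body writes its raw result into dst
    if (hasFuse) {
        // dstOrigin = dstOrigin <fuseOp> scratch, mirroring the trailing BinaryBlit above
        applyBinary(dstOrigin, dst, fuseOp);
    }
}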
@ -848,21 +848,21 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
// MNN_PRINT("trAtrB:%d-%d, tmpAB:%d-%d inps:%d, bwlh:%d-%d-%d-%d\n", mTransposeA, mTransposeB, mNeedATempBuffer, mNeedBTempBuffer, inputs.size(), mBatch, mGemmInfo.elh[0], mGemmInfo.elh[1], mGemmInfo.elh[2]);

auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
std::pair<void*, size_t> bufferAData, bufferBData;
MemChunk bufferAData, bufferBData;
size_t convertBytes = 2;
if(mFp32Infer) {
convertBytes = 4;
}
if((mNeedConvertMatAB && mFp16Fp32MixInfer) || mNeedATempBuffer) {
bufferAData = pool->alloc(convertBytes * mBatch * mAs * mGemmInfo.elh[0] * mGemmInfo.elhPad[1]);
mTempMatA = (void*)((uint8_t*)bufferAData.first + bufferAData.second);
mTempMatA = (void*)bufferAData.ptr();
} else {
mTempMatA = (void *)A->deviceId();
}

if((mNeedConvertMatAB && mFp16Fp32MixInfer) || mNeedBTempBuffer) {
bufferBData = pool->alloc(convertBytes * mBatch * mBs * mGemmInfo.elh[2] * mGemmInfo.elhPad[1]);
mTempMatB = (void*)((uint8_t*)bufferBData.first + bufferBData.second);
mTempMatB = (void*)bufferBData.ptr();
} else {
mTempMatB = (void *)B->deviceId();
}
@ -102,10 +102,10 @@ ErrorCode MultiInputConvDepthWiseExecution::onResize(const std::vector<Tensor *>
|
||||||
// prepare mParams.mFilter and mParams.mBias
|
// prepare mParams.mFilter and mParams.mBias
|
||||||
auto pool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
|
auto pool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
|
||||||
|
|
||||||
std::pair<void*, int> bufferFilter = pool->alloc(mParams.numWeightPackTotal * sizeof(half));
|
auto bufferFilter = pool->alloc(mParams.numWeightPackTotal * sizeof(half));
|
||||||
mParams.mFilter = (void*)((uint8_t*)bufferFilter.first + bufferFilter.second);
|
mParams.mFilter = (void*)((uint8_t*)bufferFilter.first + bufferFilter.second);
|
||||||
|
|
||||||
std::pair<void*, int> bufferBias = pool->alloc(mParams.numBiasPackTotal * sizeof(half));
|
auto bufferBias = pool->alloc(mParams.numBiasPackTotal * sizeof(half));
|
||||||
mParams.mBias = (void*)((uint8_t*)bufferBias.first + bufferBias.second);
|
mParams.mBias = (void*)((uint8_t*)bufferBias.first + bufferBias.second);
|
||||||
|
|
||||||
pool->free(bufferFilter);
|
pool->free(bufferFilter);
|
||||||
|
|
|
@ -82,19 +82,19 @@ ErrorCode MultiInputConvExecution::onResize(const std::vector<Tensor*> &inputs,
|
||||||
elementBytes = 4;
|
elementBytes = 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::pair<void*, int> bufferFilter;
|
MemChunk bufferFilter;
|
||||||
if(mNeedWeightFill) {
|
if(mNeedWeightFill) {
|
||||||
bufferFilter = pool->alloc(elementBytes * (size_t)mGemmInfo.elhPad[1] * (size_t)mGemmInfo.elhPad[2]);
|
bufferFilter = pool->alloc(elementBytes * (size_t)mGemmInfo.elhPad[1] * (size_t)mGemmInfo.elhPad[2]);
|
||||||
mFilterAddr = (void*)((uint8_t*)bufferFilter.first + bufferFilter.second);
|
mFilterAddr = (void*)(bufferFilter.ptr());
|
||||||
} else {
|
} else {
|
||||||
mFilterAddr = (void*)inputs[1]->deviceId();
|
mFilterAddr = (void*)inputs[1]->deviceId();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copy Bias
|
// Copy Bias
|
||||||
std::pair<void*, int> bufferBias;
|
MemChunk bufferBias;
|
||||||
if(mNeedBiasFill) {
|
if(mNeedBiasFill) {
|
||||||
bufferBias = pool->alloc(elementBytes * (size_t)mGemmInfo.elhPad[2]);
|
bufferBias = pool->alloc(elementBytes * (size_t)mGemmInfo.elhPad[2]);
|
||||||
mBiasAddr = (void*)((uint8_t*)bufferBias.first + bufferBias.second);
|
mBiasAddr = (void*)(bufferBias.ptr());
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
mBiasAddr = (void*)inputs[2]->deviceId();
|
mBiasAddr = (void*)inputs[2]->deviceId();
|
||||||
|
@ -107,10 +107,10 @@ ErrorCode MultiInputConvExecution::onResize(const std::vector<Tensor*> &inputs,
|
||||||
mIm2ColParamter.padX == 0 && mIm2ColParamter.padY == 0);
|
mIm2ColParamter.padX == 0 && mIm2ColParamter.padY == 0);
|
||||||
mNeedIm2Col = !(mIsConv1x1S1D1P0 && (mFp16Infer || mFp32Infer));
|
mNeedIm2Col = !(mIsConv1x1S1D1P0 && (mFp16Infer || mFp32Infer));
|
||||||
|
|
||||||
std::pair<void*, int> bufferIm2Col;
|
MemChunk bufferIm2Col;
|
||||||
if(mNeedIm2Col) {
|
if(mNeedIm2Col) {
|
||||||
bufferIm2Col = pool->alloc(elementBytes * (size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elhPad[1]);
|
bufferIm2Col = pool->alloc(elementBytes * (size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elhPad[1]);
|
||||||
mIm2ColBuffer = (void*)((uint8_t*)bufferIm2Col.first + bufferIm2Col.second);
|
mIm2ColBuffer = (void*)(bufferIm2Col.ptr());
|
||||||
}
|
}
|
||||||
|
|
||||||
// free for Reuse
|
// free for Reuse
|
||||||
|
|
|
@ -84,21 +84,21 @@ ErrorCode MultiInputDeconvExecution::onResize(const std::vector<Tensor*> &inputs
|
||||||
|
|
||||||
// Alloc temp cuda memory
|
// Alloc temp cuda memory
|
||||||
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
|
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
|
||||||
std::pair<void*, size_t> buffer_input, buffer_im2col;
|
MemChunk buffer_input, buffer_im2col;
|
||||||
if(mFp16Fp32MixInfer) {
|
if(mFp16Fp32MixInfer) {
|
||||||
buffer_input = pool->alloc(sizeof(__half) * mGemmInfo.elhPad[1] * mGemmInfo.elh[2]);
|
buffer_input = pool->alloc(sizeof(__half) * mGemmInfo.elhPad[1] * mGemmInfo.elh[2]);
|
||||||
mInputBuffer = (void*)((uint8_t*)buffer_input.first + buffer_input.second);
|
mInputBuffer = (void*)buffer_input.ptr();
|
||||||
} else {
|
} else {
|
||||||
mInputBuffer = (void*)input->deviceId();
|
mInputBuffer = (void*)input->deviceId();
|
||||||
}
|
}
|
||||||
buffer_im2col = pool->alloc(bytes * mGemmInfo.elh[0] * mGemmInfo.elhPad[2]);
|
buffer_im2col = pool->alloc(bytes * mGemmInfo.elh[0] * mGemmInfo.elhPad[2]);
|
||||||
mIm2ColBuffer = (void*)((uint8_t*)buffer_im2col.first + buffer_im2col.second);
|
mIm2ColBuffer = (void*)buffer_im2col.ptr();
|
||||||
|
|
||||||
mNeedWeightFill = (mGemmInfo.elh[1] != mGemmInfo.elhPad[1]);
|
mNeedWeightFill = (mGemmInfo.elh[1] != mGemmInfo.elhPad[1]);
|
||||||
std::pair<void*, int> buffer_filter;
|
MemChunk buffer_filter;
|
||||||
if(mNeedWeightFill) {
|
if(mNeedWeightFill) {
|
||||||
buffer_filter = pool->alloc(bytes * (size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elhPad[1]);
|
buffer_filter = pool->alloc(bytes * (size_t)mGemmInfo.elh[0] * (size_t)mGemmInfo.elhPad[1]);
|
||||||
mFilterAddr = (void*)((uint8_t*)buffer_filter.first + buffer_filter.second);
|
mFilterAddr = (void*)buffer_filter.ptr();
|
||||||
} else {
|
} else {
|
||||||
mFilterAddr = (void*)inputs[1]->deviceId();
|
mFilterAddr = (void*)inputs[1]->deviceId();
|
||||||
}
|
}
|
||||||
|
|
|
@@ -31,7 +31,7 @@ private:
     int mCount;
     int mChannel;
     int mArea;
-    std::pair<void*, int> mPreluStorage;
+    MemChunk mPreluStorage;
     bool mIsChannelShared = false;
 };

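The hunks above and below all make the same substitution: the raw std::pair handles returned by the CUDA buffer pools become MemChunk values, and pointer arithmetic such as (uint8_t*)pair.first + pair.second collapses into chunk.ptr(). The sketch below shows the minimum such a handle has to provide. The members first/second/ptr() are taken from this diff, but the real MemChunk in core/BufferAllocator.hpp may carry more state, so treat this as an illustration rather than MNN's definition.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Illustration only: a pool handle pairing a base allocation with an offset,
    // so callers can write chunk.ptr() instead of (uint8_t*)pair.first + pair.second.
    struct MemChunkSketch {
        void*  first  = nullptr;   // base pointer of the pooled allocation
        size_t second = 0;         // byte offset inside that allocation
        uint8_t* ptr() const { return static_cast<uint8_t*>(first) + second; }
    };

    int main() {
        static uint8_t pool[256];
        MemChunkSketch chunk{pool, 64};   // pretend the pool handed out offset 64
        std::printf("chunk resolves to %p\n", static_cast<void*>(chunk.ptr()));
        return 0;
    }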
@@ -203,12 +203,14 @@ UNARY_FUNC(GELU_STANDARD, (erf(x*0.7071067932881648f)+1.f)*x*0.5);
 void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime) {
     int count = size[0] * size[1] * size[2];

-    // MNN_PRINT("blit info size:%d-%d-%d, srcStride:%d-%d-%d, dstStride:%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
+    // MNN_PRINT("blit info size:%d-%d-%d, srcStride:%d-%d-%d, dstStride:%d-%d-%d, ptr:%p %p\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2], input, output);
     bool isThirdSizeVector = (size[2] % 2 == 0 && srcStride[2] == 1 && dstStride[2] == 1);
     bool isSecondSizeVector = (size[1] % 2 == 0 && srcStride[1] == 1 && dstStride[1] == 1) && (size[2] == 1 && srcStride[2] == 1 && dstStride[2] == 1);
     bool isFirstSizeVector = (size[0] % 2 == 0 && srcStride[0] == 1 && dstStride[0] == 1) && (size[1] == 1 && srcStride[1] == 1 && dstStride[1] == 1) && (size[2] == 1 && srcStride[2] == 1 && dstStride[2] == 1);
+    bool isStrideVector = (srcStride[0] % 2 == 0 || srcStride[0] == 1) && (srcStride[1] % 2 == 0 || srcStride[1] == 1) && (srcStride[2] % 2 == 0 || srcStride[2] == 1) && \
+        (dstStride[0] % 2 == 0 || dstStride[0] == 1) && (dstStride[1] % 2 == 0 || dstStride[1] == 1) && (dstStride[2] % 2 == 0 || dstStride[2] == 1);
     bool isSizeVector = isThirdSizeVector || isSecondSizeVector || isFirstSizeVector;
-    if(count > 16384 && isSizeVector) {
+    if(count > 16384 && isSizeVector && isStrideVector) {
         int32_t newSize[3], newSrcStride[3], newDstStride[3];
         newSize[0] = size[0];
         newSize[1] = size[1];
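The new isStrideVector guard admits the vectorized blit only when every stride is 1 or even, on top of the existing even-size checks, so reading and writing two elements at a time never straddles a misaligned stride. The standalone helper below restates that condition for the third-size case only; the function name and simplified signature are invented for illustration and are not part of the CUDA backend.

    #include <cstdint>
    #include <cstdio>

    // Returns true when a 3-D blit described by sizes/strides can be re-expressed
    // with 2-element vectors: the innermost size must be even with unit strides,
    // and every stride must remain valid after halving the element count.
    static bool canVectorizeBlit(const int32_t size[3], const int32_t srcStride[3], const int32_t dstStride[3]) {
        bool sizeOk = (size[2] % 2 == 0 && srcStride[2] == 1 && dstStride[2] == 1);
        bool strideOk = true;
        for (int i = 0; i < 3; ++i) {
            strideOk = strideOk && (srcStride[i] == 1 || srcStride[i] % 2 == 0)
                                && (dstStride[i] == 1 || dstStride[i] % 2 == 0);
        }
        return sizeOk && strideOk;
    }

    int main() {
        int32_t size[3] = {8, 4, 16}, src[3] = {64, 16, 1}, dst[3] = {64, 16, 1};
        std::printf("vectorizable: %d\n", canVectorizeBlit(size, src, dst));
        return 0;
    }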
@@ -32,7 +32,7 @@ private:
     int mCount;
     int mChannel;
     int mArea;
-    std::pair<void*, int> mScaleBiasStorage;
+    MemChunk mScaleBiasStorage;
 };

 } // namespace CUDA
@@ -31,7 +31,7 @@ private:
     Tensor mStorage;
     bool mNeedUnpackC4;
     ReduceParam mCpuParam;
-    std::pair<void*, int> mParam;
+    MemChunk mParam;
 };

 } // namespace CUDA
@@ -235,23 +235,23 @@ ErrorCode TopKV2Execution::onResize(const std::vector<Tensor *> &inputs, const s
     auto pool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();

     if (inputTensor->getType().code == halide_type_int && inputTensor->getType().bits == 32) {
-        std::pair<void*, int> bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
+        auto bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
         mParams.mBufferIndices = (void*)((uint8_t*)bufferIndices.first + bufferIndices.second);
-        std::pair<void*, int> bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
+        auto bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
         mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second);
         pool->free(bufferIndices);
         pool->free(bufferValues);
     } else if (static_cast<CUDABackend*>(backend())->useFp16()) {
-        std::pair<void*, int> bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
+        auto bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
         mParams.mBufferIndices = (void*)((uint8_t*)bufferIndices.first + bufferIndices.second);
-        std::pair<void*, int> bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(half));
+        auto bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(half));
         mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second);
         pool->free(bufferIndices);
         pool->free(bufferValues);
     } else {
-        std::pair<void*, int> bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
+        auto bufferIndices = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(int));
         mParams.mBufferIndices = (void*)((uint8_t*)bufferIndices.first + bufferIndices.second);
-        std::pair<void*, int> bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(float));
+        auto bufferValues = pool->alloc(mParams.mNumBlockTotal * mParams.mNumK * sizeof(float));
         mParams.mBufferValues = (void*)((uint8_t*)bufferValues.first + bufferValues.second);
         pool->free(bufferIndices);
         pool->free(bufferValues);
@@ -41,13 +41,13 @@ protected:
     const Op* mOp = nullptr;

     ConvolutionCommon::Im2ColParameter mIm2ColParamter;
-    std::pair<void*, int> mGpuIm2ColParam;
+    MemChunk mGpuIm2ColParam;

     void* mIm2ColBuffer;

     bool mIsConv1x1S1D1P0 = false;
     bool mNeedIm2Col = true;
-    std::pair<void*, int> mGpuKernelParam;
+    MemChunk mGpuKernelParam;
     bool mIsBlock = false;
     int mBlockNum = 1;

@@ -71,13 +71,13 @@ private:
     CutlassGemmInfo mGemmInfo;

     ConvolutionCommon::Im2ColParameter mIm2ColParamter;
-    std::pair<void*, int> mGpuIm2ColParam;
+    MemChunk mGpuIm2ColParam;

     void* mIm2ColBuffer;

     bool mIsConv1x1S1D1P0 = false;
     bool mNeedIm2Col = true;
-    std::pair<void*, int> mGpuKernelParam;
+    MemChunk mGpuKernelParam;
     bool mIsBlock = false;
     int mBlockNum = 1;

@@ -38,7 +38,7 @@ private:
     int mChannel;
     int mCount;
     int mArea;
-    std::pair<void*, int> mScaleStorage;
+    MemChunk mScaleStorage;
 };

 } // namespace CUDA
@@ -35,7 +35,7 @@ private:
     int mChannel;
     int mCount;
     int mArea;
-    std::pair<void*, int> mScaleStorage;
+    MemChunk mScaleStorage;
 };

 } // namespace CUDA
@@ -64,7 +64,7 @@ public:
 private:
     MetalRuntime(void* context);
     void* mContext = nullptr;
-    std::shared_ptr<BufferAllocator> mStatic;
+    std::shared_ptr<EagerBufferAllocator> mStatic;
     MetalTuneLevel mTuneLevel = Wide;
     std::map<std::pair<std::string, std::vector<uint32_t>>, std::tuple<std::vector<uint32_t>, std::vector<uint32_t>, uint32_t>> mTunedThreadGroup;

@@ -76,7 +76,7 @@ private:
 };


-class MetalRuntimeAllocator : public BufferAllocator::Allocator {
+class MetalRuntimeAllocator : public EagerBufferAllocator::Allocator {
 public:
     class MetalBufferAlloc {
     public:
@@ -95,8 +95,8 @@ public:
         // Do nothing
     }
     virtual ~ MetalRuntimeAllocator() = default;
-    virtual std::pair<void*, size_t> onAlloc(size_t size, size_t align) override;
-    virtual void onRelease(std::pair<void*, size_t> ptr) override;
+    virtual MemChunk onAlloc(size_t size, size_t align) override;
+    virtual void onRelease(MemChunk ptr) override;

 private:
     id<MTLDevice> mDevice;
@@ -127,7 +127,7 @@ public:
     id<MTLBuffer> getHostBuffer(size_t size) const;
     id<MTLBuffer> getConstBuffer(size_t size) const;
 public:
-    MetalBackend(std::shared_ptr<BufferAllocator> staticMem, const MetalRuntime* runtime);
+    MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime);
     virtual ~MetalBackend();
     const MetalRuntime* runtime() const {
         return mRuntime;
@@ -169,10 +169,10 @@ public:
     bool isCommandEncoderSet();
     void setOpEncoder() const;

-    BufferAllocator *getBufferPool() const {
+    EagerBufferAllocator *getBufferPool() const {
         return mBufferPool.get();
     }
-    BufferAllocator *getStaticBufferPool() const {
+    EagerBufferAllocator *getStaticBufferPool() const {
         return mStaticBufferPool.get();
     }

@@ -190,8 +190,8 @@ private:

     std::vector<std::function<void(void)>> mOpEncoders;
     mutable id<MTLComputeCommandEncoder> mComputeEncoder = nil;
-    std::shared_ptr<BufferAllocator> mBufferPool;
-    std::shared_ptr<BufferAllocator> mStaticBufferPool;
+    std::shared_ptr<EagerBufferAllocator> mBufferPool;
+    std::shared_ptr<EagerBufferAllocator> mStaticBufferPool;

 private:
     mutable id<MTLBuffer> mHostBuffer = nullptr;
@@ -50,9 +50,9 @@ void MetalBackend::addCreator(OpType t, Creator *c) {
     map->insert(std::make_pair(t, c));
 }

-MetalBackend::MetalBackend(std::shared_ptr<BufferAllocator> staticMem, const MetalRuntime* runtime) : Backend(MNN_FORWARD_METAL) {
+MetalBackend::MetalBackend(std::shared_ptr<EagerBufferAllocator> staticMem, const MetalRuntime* runtime) : Backend(MNN_FORWARD_METAL) {
     mRuntime = runtime;
-    mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(staticMem.get()), 1024));
+    mBufferPool.reset(new EagerBufferAllocator(EagerBufferAllocator::Allocator::createRecurse(staticMem.get()), 1024));
     mStaticBufferPool = staticMem;
     mShapeH2D = getConstBuffer(4 * sizeof(int));
     mShapeD2H = getConstBuffer(4 * sizeof(int));
@@ -67,16 +67,19 @@ void *MetalBackend::context() const {

 class MetalMemRelease : public Backend::MemObj {
 public:
-    MetalMemRelease(std::pair<void*, int> buffer, BufferAllocator* allocator) {
+    MetalMemRelease(MemChunk buffer, EagerBufferAllocator* allocator) {
         mBuffer = buffer;
         mAllocator = allocator;
     }
     virtual ~ MetalMemRelease() {
         mAllocator->free(mBuffer);
     }
+    MemChunk chunk() override {
+        return mBuffer;
+    }
 private:
-    std::pair<void*, int> mBuffer;
-    BufferAllocator* mAllocator;
+    MemChunk mBuffer;
+    EagerBufferAllocator* mAllocator;
 };
 Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType storageType) {
     auto tensor = const_cast<Tensor *>(_tensor);
@@ -115,8 +118,8 @@ Backend::MemObj* MetalBackend::onAcquire(const Tensor *_tensor, StorageType stor
     }

     // reuse if possible
-    std::pair<void*, int> buffer;
-    BufferAllocator* allocator = nullptr;
+    MemChunk buffer;
+    EagerBufferAllocator* allocator = nullptr;
     switch (storageType) {
         case Backend::STATIC: {
             buffer = mStaticBufferPool->alloc(size, false);
@@ -656,8 +659,8 @@ MetalRuntime* MetalRuntime::create(const Backend::Info& info, id<MTLDevice> devi
 MetalRuntime::MetalRuntime(void* context) {
     mContext = context;
     auto ctx = (__bridge MNNMetalContext *)mContext;
-    std::shared_ptr<BufferAllocator::Allocator> allocator(new MetalRuntimeAllocator([ctx device]));
-    mStatic.reset(new BufferAllocator(allocator));
+    std::shared_ptr<EagerBufferAllocator::Allocator> allocator(new MetalRuntimeAllocator([ctx device]));
+    mStatic.reset(new EagerBufferAllocator(allocator));
     mTunedInfo = new TunedInfo;
 }

@@ -859,12 +862,12 @@ bool MetalRuntime::onSetCache(const void* buffer, size_t size) {//set Cache
     return setCache(std::make_pair(buffer, size));
 }

-std::pair<void*, size_t> MetalRuntimeAllocator::onAlloc(size_t size, size_t align) {
+MemChunk MetalRuntimeAllocator::onAlloc(size_t size, size_t align) {
     auto buffer = [mDevice newBufferWithLength:size options:MTLCPUCacheModeDefaultCache];
     auto mMetalBufferAlloc = new MetalBufferAlloc(buffer);
-    return std::make_pair((void *)mMetalBufferAlloc, 0);
+    return MemChunk((void *)mMetalBufferAlloc, 0);
 }
-void MetalRuntimeAllocator::onRelease(std::pair<void*, size_t> ptr) {
+void MetalRuntimeAllocator::onRelease(MemChunk ptr) {
     delete (MetalBufferAlloc *)ptr.first;
 }

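MetalRuntimeAllocator::onAlloc now returns a MemChunk whose first field is the MetalBufferAlloc wrapper at offset zero, and onRelease deletes that wrapper. The sketch below reproduces that ownership pattern with plain C++ placeholders (FakeBufferAlloc, ChunkSketch) so it can compile without Metal; it is an assumption-laden illustration, not the MNN implementation.

    #include <cstddef>
    #include <cstdio>

    // Placeholder for the opaque wrapper the allocator owns (MetalBufferAlloc in the diff).
    struct FakeBufferAlloc { int id; };

    // Placeholder for the returned handle: wrapper pointer in first, offset 0 in second.
    struct ChunkSketch { void* first = nullptr; size_t second = 0; };

    ChunkSketch onAllocSketch(int id) {
        return ChunkSketch{ new FakeBufferAlloc{id}, 0 };
    }

    void onReleaseSketch(ChunkSketch chunk) {
        delete static_cast<FakeBufferAlloc*>(chunk.first);   // release the wrapper, as onRelease does
    }

    int main() {
        ChunkSketch c = onAllocSketch(7);
        std::printf("allocated wrapper %d\n", static_cast<FakeBufferAlloc*>(c.first)->id);
        onReleaseSketch(c);
        return 0;
    }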
@@ -9,6 +9,7 @@
 #include "backend/opencl/core/OpenCLBackend.hpp"
 #include "MNN_generated.h"

+#include "core/BufferAllocator.hpp"
 #include "core/TensorUtils.hpp"
 #include "shape/SizeComputer.hpp"
 #include <map>
@@ -907,25 +908,14 @@ void OpenCLBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTenso
 #ifdef LOG_VERBOSE
     MNN_PRINT("Start onCopyBuffer !\n");
 #endif
-    //int8
-    if(srcTensor->getType().code == halide_type_int && srcTensor->getType().bits == 8){
-        if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) {
-            copyToDeviceInt8(srcTensor, dstTensor);
-        }else if(srcTensor->deviceId() != 0 && dstTensor->deviceId() == 0){
-            copyFromDeviceInt8(srcTensor, dstTensor);
-        }else{
-            MNN_PRINT("onCopyBuffer int8 error !!! \n");
-        }
-    }else{
-        if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) {
-            copyToDevice(srcTensor, dstTensor);
-        }else if(srcTensor->deviceId() != 0 && dstTensor->deviceId() == 0){
-            copyFromDevice(srcTensor, dstTensor);
-        }else if(srcTensor->deviceId() != 0 && dstTensor->deviceId() != 0){
-            mCLRuntime->copyBetweenDevice(srcTensor, dstTensor);
-        }else{
-            MNN_PRINT("onCopyBuffer float error !!! \n");
-        }
+    if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) {
+        copyToDevice(srcTensor, dstTensor);
+    }else if(srcTensor->deviceId() != 0 && dstTensor->deviceId() == 0){
+        copyFromDevice(srcTensor, dstTensor);
+    }else if(srcTensor->deviceId() != 0 && dstTensor->deviceId() != 0){
+        mCLRuntime->copyBetweenDevice(srcTensor, dstTensor);
+    }else{
+        MNN_PRINT("onCopyBuffer float error !!! \n");
     }

 #ifdef LOG_VERBOSE
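After this change OpenCLBackend::onCopyBuffer no longer special-cases int8 tensors: the copy direction is decided purely from whether each tensor has a non-zero deviceId(). A small model of that dispatch is below; the enum and helper function are invented for illustration and are not MNN API.

    #include <cstdint>
    #include <cstdio>

    // Direction of a tensor copy, derived only from where each side lives.
    enum class CopyKind { HostToDevice, DeviceToHost, DeviceToDevice, Invalid };

    CopyKind classifyCopy(uint64_t srcDeviceId, uint64_t dstDeviceId) {
        if (srcDeviceId == 0 && dstDeviceId != 0) return CopyKind::HostToDevice;
        if (srcDeviceId != 0 && dstDeviceId == 0) return CopyKind::DeviceToHost;
        if (srcDeviceId != 0 && dstDeviceId != 0) return CopyKind::DeviceToDevice;
        return CopyKind::Invalid;   // host-to-host is reported as an error in the backend
    }

    int main() {
        std::printf("host->device: %d\n", classifyCopy(0, 42) == CopyKind::HostToDevice);
        return 0;
    }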
@ -0,0 +1,150 @@
|
||||||
|
//
|
||||||
|
// ArgMaxBufExecution.cpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2023/08/11.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||||
|
#include "backend/opencl/execution/buffer/ArgMaxBufExecution.hpp"
|
||||||
|
#include "core/Macro.h"
|
||||||
|
#include "core/TensorUtils.hpp"
|
||||||
|
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||||
|
|
||||||
|
namespace MNN {
|
||||||
|
namespace OpenCL {
|
||||||
|
|
||||||
|
ArgMaxBufExecution::ArgMaxBufExecution(const std::string &compute, Backend* backend, const int axis) : Execution(backend) {
|
||||||
|
mBuildOptions.emplace(compute);
|
||||||
|
mAxis = axis;
|
||||||
|
// Do nothing
|
||||||
|
}
|
||||||
|
ErrorCode ArgMaxBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||||
|
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||||
|
auto runtime = openCLBackend->getOpenCLRuntime();
|
||||||
|
auto input = inputs[0];
|
||||||
|
auto output = outputs[0];
|
||||||
|
if(mAxis < 0){
|
||||||
|
mAxis = input->dimensions() + mAxis;
|
||||||
|
}
|
||||||
|
int inside = 1;
|
||||||
|
int outside = 1;
|
||||||
|
for(int i = 0; i < mAxis; ++i){
|
||||||
|
outside *= input->length(i);
|
||||||
|
}
|
||||||
|
for(int i = mAxis + 1; i < input->dimensions(); ++i){
|
||||||
|
inside *= input->length(i);
|
||||||
|
}
|
||||||
|
int dim = input->length(mAxis);
|
||||||
|
|
||||||
|
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||||
|
std::vector<int> outputShape = tensorShapeFormat(output);
|
||||||
|
|
||||||
|
int batch = inputShape.at(0);
|
||||||
|
int inputHeight = inputShape.at(1);
|
||||||
|
int inputWidth = inputShape.at(2);
|
||||||
|
int inputChannels = inputShape.at(3);
|
||||||
|
int inputChannelBlocks = (inputChannels + 3) / 4;
|
||||||
|
int outputBatch = outputShape.at(0);
|
||||||
|
int outputHeight = outputShape.at(1);
|
||||||
|
int outputWidth = outputShape.at(2);
|
||||||
|
int outputChannels = outputShape.at(3);
|
||||||
|
int outputChannelBlocks = (outputChannels + 3) / 4;
|
||||||
|
mGlobalWorkSize = {
|
||||||
|
static_cast<uint32_t>(outputWidth),
|
||||||
|
static_cast<uint32_t>(outputHeight),
|
||||||
|
static_cast<uint32_t>(outputBatch * outputChannelBlocks)
|
||||||
|
};
|
||||||
|
|
||||||
|
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
|
||||||
|
mKernel = runtime->buildKernel("argmax_buf", "argmax_width_buf", mBuildOptions);
|
||||||
|
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
|
||||||
|
mKernel = runtime->buildKernel("argmax_buf", "argmax_height_buf", mBuildOptions);
|
||||||
|
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
|
||||||
|
if(output->buffer().dimensions == 1){
|
||||||
|
mKernel = runtime->buildKernel("argmax_buf", "argmax_channel_dim1_buf", mBuildOptions);
|
||||||
|
}else{
|
||||||
|
mKernel = runtime->buildKernel("argmax_buf", "argmax_channel_buf", mBuildOptions);
|
||||||
|
}
|
||||||
|
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
|
||||||
|
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
|
||||||
|
mKernel = runtime->buildKernel("argmax_buf", "argmax_batch_buf", mBuildOptions);
|
||||||
|
}
|
||||||
|
mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
|
||||||
|
|
||||||
|
uint32_t idx = 0;
|
||||||
|
cl_int ret = CL_SUCCESS;
|
||||||
|
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||||
|
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||||
|
ret |= mKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||||
|
ret |= mKernel.setArg(idx++, openCLBuffer(input));
|
||||||
|
ret |= mKernel.setArg(idx++, openCLBuffer(output));
|
||||||
|
ret |= mKernel.setArg(idx++, inputWidth);
|
||||||
|
ret |= mKernel.setArg(idx++, inputHeight);
|
||||||
|
ret |= mKernel.setArg(idx++, inputChannels);
|
||||||
|
ret |= mKernel.setArg(idx++, batch);
|
||||||
|
ret |= mKernel.setArg(idx++, inputChannelBlocks);
|
||||||
|
ret |= mKernel.setArg(idx++, outputWidth);
|
||||||
|
ret |= mKernel.setArg(idx++, outputHeight);
|
||||||
|
ret |= mKernel.setArg(idx++, outputChannels);
|
||||||
|
ret |= mKernel.setArg(idx++, outputChannelBlocks);
|
||||||
|
MNN_CHECK_CL_SUCCESS(ret, "setArg ArgMaxBufExecution");
|
||||||
|
|
||||||
|
std::string kernelName = "gargmax_buf";
|
||||||
|
mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
|
||||||
|
return NO_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
ErrorCode ArgMaxBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||||
|
#ifdef LOG_VERBOSE
|
||||||
|
MNN_PRINT("start ArgMaxBufExecution onExecute...");
|
||||||
|
#endif
|
||||||
|
auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||||
|
|
||||||
|
#ifdef ENABLE_OPENCL_TIME_PROFILER
|
||||||
|
cl::Event event;
|
||||||
|
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||||
|
mOpenCLBackend->getOpenCLRuntime(), &event);
|
||||||
|
|
||||||
|
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
|
||||||
|
MNN_PRINT("kernel cost:%d us ArgMax\n",costTime);
|
||||||
|
#else
|
||||||
|
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||||
|
mOpenCLBackend->getOpenCLRuntime());
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef LOG_VERBOSE
|
||||||
|
MNN_PRINT("end ArgMaxBufExecution onExecute...");
|
||||||
|
#endif
|
||||||
|
return NO_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
class ArgMaxBufCreator : public OpenCLBackend::Creator {
|
||||||
|
public:
|
||||||
|
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||||
|
const MNN::Op* op, Backend* backend) const override {
|
||||||
|
for (int i = 0; i < inputs.size(); ++i) {
|
||||||
|
TensorUtils::setTensorSupportPack(inputs[i], false);
|
||||||
|
}
|
||||||
|
for (int i = 0; i < outputs.size(); ++i) {
|
||||||
|
TensorUtils::setTensorSupportPack(outputs[i], false);
|
||||||
|
}
|
||||||
|
auto inputDimensionFromat = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
|
||||||
|
if(inputDimensionFromat == MNN_DATA_FORMAT_NC4HW4){
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
int axis = op->main_as_ArgMax()->axis();
|
||||||
|
if (op->type() == OpType_ArgMax) {
|
||||||
|
return new ArgMaxBufExecution("-DARGMAX", backend, axis);
|
||||||
|
}else{
|
||||||
|
return new ArgMaxBufExecution("", backend, axis);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
OpenCLCreatorRegister<ArgMaxBufCreator> __ArgMaxBuf__(OpType_ArgMax, BUFFER);
|
||||||
|
OpenCLCreatorRegister<ArgMaxBufCreator> __ArgMinBuf__(OpType_ArgMin, BUFFER);
|
||||||
|
} // namespace OpenCL
|
||||||
|
} // namespace MNN
|
||||||
|
#endif /* MNN_OPENCL_BUFFER_CLOSED */
|
|
@ -0,0 +1,43 @@
|
||||||
|
//
|
||||||
|
// ArgMaxBufExecution.hpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2023/08/11.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||||
|
#ifndef ArgMaxBufExecution_hpp
|
||||||
|
#define ArgMaxBufExecution_hpp
|
||||||
|
|
||||||
|
#include "core/Execution.hpp"
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include "MNN_generated.h"
|
||||||
|
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||||
|
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
|
||||||
|
|
||||||
|
namespace MNN {
|
||||||
|
namespace OpenCL {
|
||||||
|
|
||||||
|
class ArgMaxBufExecution : public Execution {
|
||||||
|
public:
|
||||||
|
ArgMaxBufExecution(const std::string &compute, Backend *backend, const int axis);
|
||||||
|
virtual ~ArgMaxBufExecution() = default;
|
||||||
|
|
||||||
|
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||||
|
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
cl::Kernel mKernel;
|
||||||
|
uint32_t mMaxWorkGroupSize;
|
||||||
|
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
|
||||||
|
std::vector<uint32_t> mLocalSize = {1, 1, 1};
|
||||||
|
std::set<std::string> mBuildOptions;
|
||||||
|
int mAxis;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace OpenCL
|
||||||
|
} // namespace MNN
|
||||||
|
#endif /* ArgMaxBufExecution_hpp */
|
||||||
|
#endif/* MNN_OPENCL_BUFFER_CLOSED */
|
|
@ -0,0 +1,161 @@
|
||||||
|
//
|
||||||
|
// CastBufExecution.cpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2023/08/11.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||||
|
#include "backend/opencl/execution/buffer/CastBufExecution.hpp"
|
||||||
|
#include "core/Macro.h"
|
||||||
|
#include "core/TensorUtils.hpp"
|
||||||
|
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||||
|
|
||||||
|
namespace MNN {
|
||||||
|
namespace OpenCL {
|
||||||
|
|
||||||
|
CastBufExecution::CastBufExecution(const std::string& compute, Backend* backend) : Execution(backend) {
|
||||||
|
mBuildOptions.emplace(compute);
|
||||||
|
}
|
||||||
|
ErrorCode CastBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||||
|
Tensor* input = inputs[0];
|
||||||
|
Tensor* output = outputs[0];
|
||||||
|
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||||
|
auto runtime = openCLBackend->getOpenCLRuntime();
|
||||||
|
#ifdef MNN_SUPPORT_INTEL_SUBGROUP
|
||||||
|
if (runtime->isSupportedIntelSubgroup()) {
|
||||||
|
return SubgrouponResize(inputs, outputs);
|
||||||
|
}
|
||||||
|
#endif /* MNN_SUPPORT_INTEL_SUBGROUP */
|
||||||
|
mKernel = runtime->buildKernel("cast_buf", "cast_buf", mBuildOptions);
|
||||||
|
mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
|
||||||
|
|
||||||
|
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||||
|
std::vector<int> outputShape = tensorShapeFormat(output);
|
||||||
|
|
||||||
|
int batch = outputShape.at(0);
|
||||||
|
int outputHeight = outputShape.at(1);
|
||||||
|
int outputWidth = outputShape.at(2);
|
||||||
|
int channels = outputShape.at(3);
|
||||||
|
|
||||||
|
int channelBlocks = (channels + 3) / 4;
|
||||||
|
|
||||||
|
mGlobalWorkSize = {
|
||||||
|
static_cast<uint32_t>(outputWidth),
|
||||||
|
static_cast<uint32_t>(outputHeight),
|
||||||
|
static_cast<uint32_t>(batch * channelBlocks),
|
||||||
|
};
|
||||||
|
|
||||||
|
uint32_t idx = 0;
|
||||||
|
cl_int ret = CL_SUCCESS;
|
||||||
|
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||||
|
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||||
|
ret |= mKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||||
|
ret |= mKernel.setArg(idx++, openCLBuffer(input));
|
||||||
|
ret |= mKernel.setArg(idx++, openCLBuffer(output));
|
||||||
|
ret |= mKernel.setArg(idx++, outputWidth);
|
||||||
|
ret |= mKernel.setArg(idx++, outputHeight);
|
||||||
|
ret |= mKernel.setArg(idx++, channelBlocks);
|
||||||
|
MNN_CHECK_CL_SUCCESS(ret, "setArg CastBufExecution");
|
||||||
|
|
||||||
|
std::string kernelName = "cast_buf";
|
||||||
|
mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
|
||||||
|
return NO_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
ErrorCode CastBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||||
|
#ifdef LOG_VERBOSE
|
||||||
|
MNN_PRINT("start CastBufExecution onExecute...");
|
||||||
|
#endif
|
||||||
|
auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||||
|
|
||||||
|
#ifdef ENABLE_OPENCL_TIME_PROFILER
|
||||||
|
cl::Event event;
|
||||||
|
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||||
|
mOpenCLBackend->getOpenCLRuntime(), &event);
|
||||||
|
|
||||||
|
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
|
||||||
|
MNN_PRINT("kernel cost:%d us Cast\n",costTime);
|
||||||
|
#else
|
||||||
|
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||||
|
mOpenCLBackend->getOpenCLRuntime());
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef LOG_VERBOSE
|
||||||
|
MNN_PRINT("end CastBufExecution onExecute...");
|
||||||
|
#endif
|
||||||
|
return NO_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
static DataType _mapDataType(DataType src) {
|
||||||
|
if (DataType_DT_BOOL == src) {
|
||||||
|
return DataType_DT_INT32;
|
||||||
|
}
|
||||||
|
if (DataType_DT_INT64 == src) {
|
||||||
|
return DataType_DT_INT32;
|
||||||
|
}
|
||||||
|
if (DataType_DT_DOUBLE == src) {
|
||||||
|
return DataType_DT_FLOAT;
|
||||||
|
}
|
||||||
|
return src;
|
||||||
|
}
|
||||||
|
|
||||||
|
class CastBufCreator : public OpenCLBackend::Creator {
|
||||||
|
public:
|
||||||
|
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||||
|
const MNN::Op* op, Backend* backend) const override {
|
||||||
|
for (int i = 0; i < inputs.size(); ++i) {
|
||||||
|
TensorUtils::setTensorSupportPack(inputs[i], false);
|
||||||
|
}
|
||||||
|
for (int i = 0; i < outputs.size(); ++i) {
|
||||||
|
TensorUtils::setTensorSupportPack(outputs[i], false);
|
||||||
|
}
|
||||||
|
auto cast = op->main_as_CastParam();
|
||||||
|
// cast param srcT is invalid
|
||||||
|
// auto srcT = _mapDataType(cast->srcT());
|
||||||
|
auto dstT = _mapDataType(cast->dstT());
|
||||||
|
|
||||||
|
const auto &inputDataType = inputs[0]->getType();
|
||||||
|
if (inputDataType.bytes() == 4 && cast->dstT() == MNN::DataType_DT_BOOL) {
|
||||||
|
return new CastBufExecution("-DTO_BOOL", backend);
|
||||||
|
}
|
||||||
|
if (inputs[0]->buffer().type == outputs[0]->buffer().type) {
|
||||||
|
return new CastBufExecution("", backend);
|
||||||
|
}
|
||||||
|
if (dstT == MNN::DataType_DT_INT32 && halide_type_of<float>() == inputDataType) {
|
||||||
|
return new CastBufExecution("", backend);
|
||||||
|
}
|
||||||
|
if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int32_t>() == inputDataType) {
|
||||||
|
return new CastBufExecution("", backend);
|
||||||
|
}
|
||||||
|
if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<uint8_t>() == inputDataType) {
|
||||||
|
return new CastBufExecution("", backend);
|
||||||
|
}
|
||||||
|
if (dstT == MNN::DataType_DT_FLOAT && halide_type_of<int8_t>() == inputDataType) {
|
||||||
|
return new CastBufExecution("", backend);
|
||||||
|
}
|
||||||
|
if (dstT == MNN::DataType_DT_INT8 && halide_type_of<float>() == inputDataType) {
|
||||||
|
return new CastBufExecution("", backend);
|
||||||
|
}
|
||||||
|
if (dstT == MNN::DataType_DT_UINT8 && halide_type_of<float>() == inputDataType) {
|
||||||
|
return new CastBufExecution("", backend);
|
||||||
|
}
|
||||||
|
if (dstT == MNN::DataType_DT_UINT8 && halide_type_of<int32_t>() == inputDataType) {
|
||||||
|
return new CastBufExecution("", backend);
|
||||||
|
}
|
||||||
|
if (dstT == MNN::DataType_DT_INT32 && halide_type_of<uint8_t>() == inputDataType) {
|
||||||
|
return new CastBufExecution("", backend);
|
||||||
|
}
|
||||||
|
if (dstT == MNN::DataType_DT_INT32 && halide_type_of<int8_t>() == inputDataType) {
|
||||||
|
return new CastBufExecution("", backend);
|
||||||
|
}
|
||||||
|
MNN_PRINT("Don't support cast form %d, %d to %d\n", inputDataType.code, inputDataType.bits, cast->dstT());
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
OpenCLCreatorRegister<CastBufCreator> __CastBuf__(OpType_Cast, BUFFER);
|
||||||
|
} // namespace OpenCL
|
||||||
|
} // namespace MNN
|
||||||
|
#endif /* MNN_OPENCL_BUFFER_CLOSED */
|
|
@ -0,0 +1,42 @@
|
||||||
|
//
|
||||||
|
// CastBufExecution.hpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2023/08/11.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||||
|
#ifndef CastBufExecution_hpp
|
||||||
|
#define CastBufExecution_hpp
|
||||||
|
|
||||||
|
#include "core/Execution.hpp"
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include "MNN_generated.h"
|
||||||
|
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||||
|
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
|
||||||
|
|
||||||
|
namespace MNN {
|
||||||
|
namespace OpenCL {
|
||||||
|
|
||||||
|
class CastBufExecution : public Execution {
|
||||||
|
public:
|
||||||
|
CastBufExecution(const std::string &compute, Backend *backend);
|
||||||
|
virtual ~CastBufExecution() = default;
|
||||||
|
|
||||||
|
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||||
|
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
cl::Kernel mKernel;
|
||||||
|
uint32_t mMaxWorkGroupSize;
|
||||||
|
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
|
||||||
|
std::vector<uint32_t> mLocalSize = {1, 1, 1};
|
||||||
|
std::set<std::string> mBuildOptions;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace OpenCL
|
||||||
|
} // namespace MNN
|
||||||
|
#endif /* CastBufExecution_hpp */
|
||||||
|
#endif/* MNN_OPENCL_BUFFER_CLOSED */
|
|
@ -0,0 +1,110 @@
|
||||||
|
//
|
||||||
|
// RangeBufExecution.cpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2023/08/11.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||||
|
#include "backend/opencl/execution/buffer/RangeBufExecution.hpp"
|
||||||
|
#include "core/Macro.h"
|
||||||
|
#include "core/TensorUtils.hpp"
|
||||||
|
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||||
|
|
||||||
|
namespace MNN {
|
||||||
|
namespace OpenCL {
|
||||||
|
|
||||||
|
RangeBufExecution::RangeBufExecution(const std::string &compute, Backend* backend) : Execution(backend) {
|
||||||
|
mBuildOptions.emplace(compute);
|
||||||
|
// Do nothing
|
||||||
|
}
|
||||||
|
ErrorCode RangeBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||||
|
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||||
|
auto runtime = openCLBackend->getOpenCLRuntime();
|
||||||
|
mKernel = runtime->buildKernel("range_buf", "range_buf", mBuildOptions);
|
||||||
|
mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));
|
||||||
|
|
||||||
|
std::vector<int> outputShape = tensorShapeFormat(outputs[0]);
|
||||||
|
|
||||||
|
int batch = outputShape.at(0);
|
||||||
|
int outputHeight = outputShape.at(1);
|
||||||
|
int outputWidth = outputShape.at(2);
|
||||||
|
int channels = outputShape.at(3);
|
||||||
|
int channelBlocks = (channels + 3) / 4;
|
||||||
|
|
||||||
|
mGlobalWorkSize = {
|
||||||
|
static_cast<uint32_t>(outputWidth),
|
||||||
|
static_cast<uint32_t>(outputHeight),
|
||||||
|
static_cast<uint32_t>(batch * channelBlocks)
|
||||||
|
};
|
||||||
|
|
||||||
|
uint32_t idx = 0;
|
||||||
|
cl_int ret = CL_SUCCESS;
|
||||||
|
ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||||
|
ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||||
|
ret |= mKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||||
|
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[0]));
|
||||||
|
ret |= mKernel.setArg(idx++, openCLBuffer(inputs[2]));
|
||||||
|
ret |= mKernel.setArg(idx++, openCLBuffer(outputs[0]));
|
||||||
|
ret |= mKernel.setArg(idx++, outputWidth);
|
||||||
|
ret |= mKernel.setArg(idx++, outputHeight);
|
||||||
|
ret |= mKernel.setArg(idx++, channels);
|
||||||
|
ret |= mKernel.setArg(idx++, channelBlocks);
|
||||||
|
MNN_CHECK_CL_SUCCESS(ret, "setArg RangeBufExecution");
|
||||||
|
|
||||||
|
std::string kernelName = "range_buf";
|
||||||
|
mLocalSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
|
||||||
|
return NO_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
ErrorCode RangeBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||||
|
#ifdef LOG_VERBOSE
|
||||||
|
MNN_PRINT("start RangeBufExecution onExecute...");
|
||||||
|
#endif
|
||||||
|
auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||||
|
|
||||||
|
#ifdef ENABLE_OPENCL_TIME_PROFILER
|
||||||
|
cl::Event event;
|
||||||
|
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||||
|
mOpenCLBackend->getOpenCLRuntime(), &event);
|
||||||
|
|
||||||
|
int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
|
||||||
|
MNN_PRINT("kernel cost:%d us Range\n",costTime);
|
||||||
|
#else
|
||||||
|
run3DKernelDefault(mKernel, mGlobalWorkSize, mLocalSize,
|
||||||
|
mOpenCLBackend->getOpenCLRuntime());
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef LOG_VERBOSE
|
||||||
|
MNN_PRINT("end RangeBufExecution onExecute...");
|
||||||
|
#endif
|
||||||
|
return NO_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
class RangeBufCreator : public OpenCLBackend::Creator {
|
||||||
|
public:
|
||||||
|
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||||
|
const MNN::Op* op, Backend* backend) const override {
|
||||||
|
for (int i = 0; i < inputs.size(); ++i) {
|
||||||
|
TensorUtils::setTensorSupportPack(inputs[i], false);
|
||||||
|
}
|
||||||
|
for (int i = 0; i < outputs.size(); ++i) {
|
||||||
|
TensorUtils::setTensorSupportPack(outputs[i], false);
|
||||||
|
}
|
||||||
|
auto code = inputs[0]->getType().code;
|
||||||
|
switch (code) {
|
||||||
|
case halide_type_int:
|
||||||
|
return new RangeBufExecution("-DUSE_INT", backend);
|
||||||
|
case halide_type_float:
|
||||||
|
return new RangeBufExecution("-DUSE_FLOAT", backend);
|
||||||
|
default:
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
OpenCLCreatorRegister<RangeBufCreator> __RangeBuf__(OpType_Range, BUFFER);
|
||||||
|
} // namespace OpenCL
|
||||||
|
} // namespace MNN
|
||||||
|
#endif /* MNN_OPENCL_BUFFER_CLOSED */
|
|
@ -0,0 +1,42 @@
|
||||||
|
//
|
||||||
|
// RangeBufExecution.hpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2023/08/11.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#ifndef MNN_OPENCL_BUFFER_CLOSED
|
||||||
|
#ifndef RangeBufExecution_hpp
|
||||||
|
#define RangeBufExecution_hpp
|
||||||
|
|
||||||
|
#include "core/Execution.hpp"
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include "MNN_generated.h"
|
||||||
|
#include "backend/opencl/core/OpenCLBackend.hpp"
|
||||||
|
#include "backend/opencl/core/OpenCLRunningUtils.hpp"
|
||||||
|
|
||||||
|
namespace MNN {
|
||||||
|
namespace OpenCL {
|
||||||
|
|
||||||
|
class RangeBufExecution : public Execution {
|
||||||
|
public:
|
||||||
|
RangeBufExecution(const std::string &compute, Backend *backend);
|
||||||
|
virtual ~RangeBufExecution() = default;
|
||||||
|
|
||||||
|
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||||
|
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
cl::Kernel mKernel;
|
||||||
|
uint32_t mMaxWorkGroupSize;
|
||||||
|
std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
|
||||||
|
std::vector<uint32_t> mLocalSize = {1, 1, 1};
|
||||||
|
std::set<std::string> mBuildOptions;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace OpenCL
|
||||||
|
} // namespace MNN
|
||||||
|
#endif /* RangeBufExecution_hpp */
|
||||||
|
#endif/* MNN_OPENCL_BUFFER_CLOSED */
|
|
@@ -20,12 +20,7 @@ ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend
     MNN_PRINT("start ReductionBufExecution init !\n");
 #endif
     mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
-    auto reduct = op->main_as_ReductionParam();
-    if (nullptr != reduct->dim()) {
-        for (int i = 0; i < reduct->dim()->size(); ++i) {
-            mAxis.push_back(reduct->dim()->data()[i]);
-        }
-    }
+    mAxis = op->main_as_ReductionParam()->dim()->data()[0];
     switch (op->main_as_ReductionParam()->operation()) {
         case ReductionType_MEAN:
             mReductType = 0;
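The reduction execution now stores a single axis in the constructor and, at resize time, folds the input shape into outside * dim * inside around that axis in order to pick a specialized kernel (width, height, channel, or batch reduction). The helper below reproduces that decomposition for an arbitrary shape; it is a sketch of the arithmetic only, and the struct and function names are not MNN's.

    #include <cstdio>
    #include <vector>

    // Split a shape into the product of lengths before the axis (outside),
    // the reduced length itself (dim), and the product after it (inside).
    struct ReduceShape { int outside = 1, dim = 1, inside = 1; };

    ReduceShape splitAroundAxis(const std::vector<int>& lengths, int axis) {
        if (axis < 0) axis += static_cast<int>(lengths.size());   // mirror the negative-axis fixup
        ReduceShape s;
        for (int i = 0; i < axis; ++i) s.outside *= lengths[i];
        s.dim = lengths[axis];
        for (int i = axis + 1; i < static_cast<int>(lengths.size()); ++i) s.inside *= lengths[i];
        return s;
    }

    int main() {
        ReduceShape s = splitAroundAxis({2, 8, 32, 32}, 1);   // reduce over axis 1
        std::printf("outside=%d dim=%d inside=%d\n", s.outside, s.dim, s.inside);
        return 0;
    }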
@ -51,44 +46,129 @@ ReductionBufExecution::ReductionBufExecution(const MNN::Op* op, Backend* backend
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ReductionBufExecution::getLocalSize(int size, int maxGroupSize){
|
||||||
|
int local_size = 1;
|
||||||
|
while(local_size * 2 <= maxGroupSize && local_size * 2 <= size){
|
||||||
|
local_size *= 2;
|
||||||
|
}
|
||||||
|
return local_size;
|
||||||
|
}
|
||||||
|
|
||||||
ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||||
|
|
||||||
MNN_ASSERT(mAxis.size() == 1);
|
|
||||||
MNN_ASSERT(mAxis[0] == 1);
|
|
||||||
|
|
||||||
auto runtime = mOpenCLBackend->getOpenCLRuntime();
|
auto openCLBackend = static_cast<OpenCLBackend*>(backend());
|
||||||
|
auto runtime = openCLBackend->getOpenCLRuntime();
|
||||||
auto input = inputs[0];
|
auto input = inputs[0];
|
||||||
auto output = outputs[0];
|
auto output = outputs[0];
|
||||||
std::vector<int> inputShape = tensorShapeFormat(input);
|
if(mAxis < 0){
|
||||||
//N=outside H=axis W=inside C=1
|
mAxis = input->dimensions() + mAxis;
|
||||||
|
}
|
||||||
mGlobalWorkSize = {static_cast<uint32_t>(inputShape[0]), static_cast<uint32_t>(inputShape[2])};
|
int inside = 1;
|
||||||
mLocalWorkSize = {1, 1, 1};
|
int outside = 1;
|
||||||
|
for(int i = 0; i < mAxis; ++i){
|
||||||
|
outside *= input->length(i);
|
||||||
|
}
|
||||||
|
for(int i = mAxis + 1; i < input->dimensions(); ++i){
|
||||||
|
inside *= input->length(i);
|
||||||
|
}
|
||||||
|
int dim = input->length(mAxis);
|
||||||
|
int local_size = 0;
|
||||||
|
auto MaxWorkItems = runtime->getMaxWorkItemSizes();
|
||||||
|
|
||||||
|
if(dim >= 16){
|
||||||
|
mUseLocal = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<int> inputShape = tensorShapeFormat(input);
|
||||||
|
std::vector<int> outputShape = tensorShapeFormat(output);
|
||||||
|
|
||||||
|
int batch = inputShape.at(0);
|
||||||
|
int inputHeight = inputShape.at(1);
|
||||||
|
int inputWidth = inputShape.at(2);
|
||||||
|
int inputChannels = inputShape.at(3);
|
||||||
|
int inputChannelBlocks = (inputChannels + 3) / 4;
|
||||||
|
int outputBatch = outputShape.at(0);
|
||||||
|
int outputHeight = outputShape.at(1);
|
||||||
|
int outputWidth = outputShape.at(2);
|
||||||
|
int outputChannels = outputShape.at(3);
|
||||||
|
int outputChannelBlocks = (outputChannels + 3) / 4;
|
||||||
|
|
||||||
std::set<std::string> buildOption;
|
std::set<std::string> buildOption;
|
||||||
switch (mReductType) {
|
switch (mReductType) {
|
||||||
case 0:
|
case 0:
|
||||||
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
|
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
|
||||||
buildOption.emplace("-DGET_AVG");
|
buildOption.emplace("-DGET_AVG");
|
||||||
|
buildOption.emplace("-DVALUE=0");
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
buildOption.emplace("-DOPERATE(a,b)=max(a,b)");
|
buildOption.emplace("-DOPERATE(a,b)=max(a,b)");
|
||||||
|
buildOption.emplace("-DVALUE=-FLT_MAX");
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
buildOption.emplace("-DOPERATE(a,b)=min(a,b)");
|
buildOption.emplace("-DOPERATE(a,b)=min(a,b)");
|
||||||
|
buildOption.emplace("-DVALUE=FLT_MAX");
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 3:
|
||||||
buildOption.emplace("-DOPERATE(a,b)=(a*b)");
|
buildOption.emplace("-DOPERATE(a,b)=(a*b)");
|
||||||
|
buildOption.emplace("-DVALUE=1");
|
||||||
break;
|
break;
|
||||||
case 4:
|
case 4:
|
||||||
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
|
buildOption.emplace("-DOPERATE(a,b)=(a+b)");
|
||||||
|
buildOption.emplace("-DVALUE=0");
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
MNN_ASSERT(false);
|
MNN_ASSERT(false);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_buf", buildOption);
|
|
||||||
|
mGlobalWorkSize = {
|
||||||
|
static_cast<uint32_t>(outputWidth),
|
||||||
|
static_cast<uint32_t>(outputHeight),
|
||||||
|
static_cast<uint32_t>(outputBatch * outputChannelBlocks)
|
||||||
|
};
|
||||||
|
|
||||||
|
if(mUseLocal){
|
||||||
|
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
|
||||||
|
local_size = getLocalSize(inputWidth, MaxWorkItems[0]);
|
||||||
|
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||||
|
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_width_buf", buildOption);
|
||||||
|
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
|
||||||
|
local_size = getLocalSize(inputHeight, MaxWorkItems[0]);
|
||||||
|
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||||
|
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_height_buf", buildOption);
|
||||||
|
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
|
||||||
|
local_size = getLocalSize(inputChannelBlocks - 1, MaxWorkItems[0]);
|
||||||
|
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||||
|
if(output->buffer().dimensions == 1){
|
||||||
|
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_dim1_buf", buildOption);
|
||||||
|
}else{
|
||||||
|
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_buf", buildOption);
|
||||||
|
}
|
||||||
|
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
|
||||||
|
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
|
||||||
|
local_size = getLocalSize(batch, MaxWorkItems[0]);
|
||||||
|
buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
|
||||||
|
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_batch_buf", buildOption);
|
||||||
|
}
|
||||||
|
mGlobalWorkSize[0] *= local_size;
|
||||||
|
}else{
|
||||||
|
buildOption.emplace("-DLOCAL_SIZE=0");
|
||||||
|
if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
|
||||||
|
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_width_buf", buildOption);
|
||||||
|
}else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
|
||||||
|
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_height_buf", buildOption);
|
||||||
|
}else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
|
||||||
|
if(output->buffer().dimensions == 1){
|
||||||
|
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_dim1_buf", buildOption);
|
||||||
|
}else{
|
||||||
|
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_channel_buf", buildOption);
|
||||||
|
}
|
||||||
|
mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
|
||||||
|
}else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
|
||||||
|
mReduct1DKernel = runtime->buildKernel("reduction_buf", "reduct_batch_buf", buildOption);
|
||||||
|
}
|
||||||
|
}
|
||||||
//printf("reduce axis:%d , %d %d %d %d, useLocal:%d\n", mAxis[0], inputShape[0], inputShape[1], inputShape[2], inputShape[3], mUseLocal);
|
//printf("reduce axis:%d , %d %d %d %d, useLocal:%d\n", mAxis[0], inputShape[0], inputShape[1], inputShape[2], inputShape[3], mUseLocal);
|
||||||
|
|
||||||
mUnits.resize(1);
|
mUnits.resize(1);
|
||||||
|
@ -96,14 +176,27 @@ ErrorCode ReductionBufExecution::onResize(const std::vector<Tensor *> &inputs, c
|
||||||
cl_int ret = CL_SUCCESS;
|
cl_int ret = CL_SUCCESS;
|
||||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[0]);
|
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[0]);
|
||||||
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[1]);
|
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[1]);
|
||||||
|
ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[2]);
|
||||||
ret |= mReduct1DKernel.setArg(idx++, openCLBuffer(input));
|
ret |= mReduct1DKernel.setArg(idx++, openCLBuffer(input));
|
||||||
ret |= mReduct1DKernel.setArg(idx++, openCLBuffer(output));
|
ret |= mReduct1DKernel.setArg(idx++, openCLBuffer(output));
|
||||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[0]));
|
ret |= mReduct1DKernel.setArg(idx++, inputWidth);
|
||||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[1]));
|
ret |= mReduct1DKernel.setArg(idx++, inputHeight);
|
||||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[2]));
|
ret |= mReduct1DKernel.setArg(idx++, inputChannels);
|
||||||
ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[3]));
|
ret |= mReduct1DKernel.setArg(idx++, batch);
|
||||||
|
ret |= mReduct1DKernel.setArg(idx++, inputChannelBlocks);
|
||||||
|
ret |= mReduct1DKernel.setArg(idx++, outputWidth);
|
||||||
|
ret |= mReduct1DKernel.setArg(idx++, outputHeight);
|
||||||
|
ret |= mReduct1DKernel.setArg(idx++, outputChannels);
|
||||||
|
ret |= mReduct1DKernel.setArg(idx++, outputChannelBlocks);
|
||||||
MNN_CHECK_CL_SUCCESS(ret, "setArg ReductionBufExecution");
|
MNN_CHECK_CL_SUCCESS(ret, "setArg ReductionBufExecution");
|
||||||
|
|
||||||
|
if(mUseLocal){
|
||||||
|
mLocalWorkSize = {static_cast<uint32_t>(local_size), 1, 1};
|
||||||
|
}else{
|
||||||
|
auto MaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mReduct1DKernel));
|
||||||
|
std::string kernelName = "reduct_buf";
|
||||||
|
mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, MaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mReduct1DKernel).first;
|
||||||
|
}
|
||||||
return NO_ERROR;
|
return NO_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -114,12 +207,12 @@ ErrorCode ReductionBufExecution::onExecute(const std::vector<Tensor *> &inputs,

 #ifdef ENABLE_OPENCL_TIME_PROFILER
     cl::Event event;
-    runKernel2D(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
+    run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
                 mOpenCLBackend->getOpenCLRuntime(), &event);
     int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
     MNN_PRINT("kernel cost:%d us Reduct1D\n",costTime);
 #else
-    runKernel2D(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
+    run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
                 mOpenCLBackend->getOpenCLRuntime());
 #endif

@ -140,33 +233,31 @@ public:
|
||||||
for (int i = 0; i < outputs.size(); ++i) {
|
for (int i = 0; i < outputs.size(); ++i) {
|
||||||
TensorUtils::setTensorSupportPack(outputs[i], false);
|
TensorUtils::setTensorSupportPack(outputs[i], false);
|
||||||
}
|
}
|
||||||
if (inputs[0]->getDimensionType() == Tensor::TENSORFLOW) {
|
|
||||||
auto openCLBackend = static_cast<OpenCLBackend *>(backend);
|
auto openCLBackend = static_cast<OpenCLBackend *>(backend);
|
||||||
auto reduct = op->main_as_ReductionParam();
|
auto reduct = op->main_as_ReductionParam();
|
||||||
if (nullptr == reduct->dim()) {
|
if (nullptr == reduct->dim()) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
|
||||||
if(reduct->dim()->size() != 1) {
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
switch (op->main_as_ReductionParam()->operation()) {
|
|
||||||
case ReductionType_MEAN:
|
|
||||||
break;
|
|
||||||
case ReductionType_MAXIMUM:
|
|
||||||
break;
|
|
||||||
case ReductionType_MINIMUM:
|
|
||||||
break;
|
|
||||||
case ReductionType_PROD:
|
|
||||||
break;
|
|
||||||
case ReductionType_SUM:
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
return NULL;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return new ReductionBufExecution(op, backend);
|
|
||||||
}
|
}
|
||||||
return NULL;
|
if(reduct->dim()->size() != 1) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
switch (op->main_as_ReductionParam()->operation()) {
|
||||||
|
case ReductionType_MEAN:
|
||||||
|
break;
|
||||||
|
case ReductionType_MAXIMUM:
|
||||||
|
break;
|
||||||
|
case ReductionType_MINIMUM:
|
||||||
|
break;
|
||||||
|
case ReductionType_PROD:
|
||||||
|
break;
|
||||||
|
case ReductionType_SUM:
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return NULL;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return new ReductionBufExecution(op, backend);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@@ -30,12 +30,13 @@ public:
     virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
     virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
 private:
+    int getLocalSize(int size, int maxGroupSize);
     cl::Kernel mReduct1DKernel;
     std::string mKernelName;
     OpenCLBackend *mOpenCLBackend;
     MNN::DataType mdataType;
     int mReductType;
-    std::vector<int> mAxis;
+    int mAxis;
     std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
     std::vector<uint32_t> mLocalWorkSize{1, 1, 1};
     bool mUseLocal = false;
@@ -0,0 +1,103 @@
//
//  SelectBufExecution.cpp
//  MNN
//
//  Created by MNN on 2023/08/11.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef MNN_OPENCL_BUFFER_CLOSED
#include "backend/opencl/execution/buffer/SelectBufExecution.hpp"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "backend/opencl/core/OpenCLBackend.hpp"

namespace MNN {
namespace OpenCL {

SelectBufExecution::SelectBufExecution(Backend* backend) : Execution(backend) {
    // Do nothing
}
ErrorCode SelectBufExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto inSize1 = inputs[1]->elementSize();
    auto inSize2 = inputs[2]->elementSize();
    auto openCLBackend = static_cast<OpenCLBackend*>(backend());
    auto runtime = openCLBackend->getOpenCLRuntime();
    if(inSize1 == 1)
        mBuildOptions.emplace("-DINSIZE1_EUQAL_1");
    if(inSize2 == 1)
        mBuildOptions.emplace("-DINSIZE2_EUQAL_1");
    mKernel = runtime->buildKernel("select_buf", "select_buf", mBuildOptions);
    mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mKernel));

    std::vector<int> outputShape = tensorShapeFormat(outputs[0]);

    int batch         = outputShape.at(0);
    int outputHeight  = outputShape.at(1);
    int outputWidth   = outputShape.at(2);
    int channels      = outputShape.at(3);
    int channelBlocks = (channels + 3) / 4;
    int outSize = batch * channelBlocks * outputWidth * outputHeight * 4;

    mGlobalWorkSize = {
        static_cast<uint32_t>(outSize),
        1
    };

    uint32_t idx = 0;
    cl_int ret = CL_SUCCESS;
    ret |= mKernel.setArg(idx++, mGlobalWorkSize[0]);
    ret |= mKernel.setArg(idx++, mGlobalWorkSize[1]);
    ret |= mKernel.setArg(idx++, openCLBuffer(inputs[0]));
    ret |= mKernel.setArg(idx++, openCLBuffer(inputs[1]));
    ret |= mKernel.setArg(idx++, openCLBuffer(inputs[2]));
    ret |= mKernel.setArg(idx++, openCLBuffer(outputs[0]));
    MNN_CHECK_CL_SUCCESS(ret, "setArg SelectBufExecution");

    std::string kernelName = "select_buf";
    mLocalSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), kernelName, mKernel).first;
    return NO_ERROR;
}

ErrorCode SelectBufExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
#ifdef LOG_VERBOSE
    MNN_PRINT("start SelectBufExecution onExecute...");
#endif
    auto mOpenCLBackend = static_cast<OpenCLBackend*>(backend());

#ifdef ENABLE_OPENCL_TIME_PROFILER
    cl::Event event;
    runKernel2D(mKernel, mGlobalWorkSize, mLocalSize,
                mOpenCLBackend->getOpenCLRuntime(), &event);

    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
    MNN_PRINT("kernel cost:%d us Select\n",costTime);
#else
    runKernel2D(mKernel, mGlobalWorkSize, mLocalSize,
                mOpenCLBackend->getOpenCLRuntime());
#endif

#ifdef LOG_VERBOSE
    MNN_PRINT("end SelectBufExecution onExecute...");
#endif
    return NO_ERROR;
}

class SelectBufCreator : public OpenCLBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        for (int i = 0; i < inputs.size(); ++i) {
            TensorUtils::setTensorSupportPack(inputs[i], false);
        }
        for (int i = 0; i < outputs.size(); ++i) {
            TensorUtils::setTensorSupportPack(outputs[i], false);
        }
        return new SelectBufExecution(backend);
    }
};

OpenCLCreatorRegister<SelectBufCreator> __SelectBuf__(OpType_Select, BUFFER);
} // namespace OpenCL
} // namespace MNN
#endif /* MNN_OPENCL_BUFFER_CLOSED */

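Note: the select_buf kernel source is not part of this hunk. As a rough, hypothetical reference only (not the actual MNN kernel), the computation it is set up to perform above is an element-wise ternary select, with the INSIZE1_EUQAL_1 / INSIZE2_EUQAL_1 build options standing in for a broadcast of a single-element input:

    // Hypothetical scalar reference for the select operation dispatched above.
    // inSize1/inSize2 == 1 means the corresponding input is broadcast to every element.
    #include <vector>
    #include <cstddef>

    static void selectReference(const std::vector<int>& cond,
                                const std::vector<float>& in1, const std::vector<float>& in2,
                                std::vector<float>& out) {
        const bool broadcast1 = (in1.size() == 1);   // -DINSIZE1_EUQAL_1
        const bool broadcast2 = (in2.size() == 1);   // -DINSIZE2_EUQAL_1
        for (size_t i = 0; i < out.size(); ++i) {
            const float a = broadcast1 ? in1[0] : in1[i];
            const float b = broadcast2 ? in2[0] : in2[i];
            out[i] = cond[i] ? a : b;
        }
    }
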
@@ -0,0 +1,42 @@
//
//  SelectBufExecution.hpp
//  MNN
//
//  Created by MNN on 2023/08/11.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef MNN_OPENCL_BUFFER_CLOSED
#ifndef SelectBufExecution_hpp
#define SelectBufExecution_hpp

#include "core/Execution.hpp"

#include <vector>
#include "MNN_generated.h"
#include "backend/opencl/core/OpenCLBackend.hpp"
#include "backend/opencl/core/OpenCLRunningUtils.hpp"

namespace MNN {
namespace OpenCL {

class SelectBufExecution : public Execution {
public:
    SelectBufExecution(Backend *backend);
    virtual ~SelectBufExecution() = default;

    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    cl::Kernel mKernel;
    uint32_t mMaxWorkGroupSize;
    std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
    std::vector<uint32_t> mLocalSize = {1, 1, 1};
    std::set<std::string> mBuildOptions;
};

} // namespace OpenCL
} // namespace MNN
#endif /* SelectBufExecution_hpp */
#endif/* MNN_OPENCL_BUFFER_CLOSED */

@@ -19,7 +19,6 @@ SoftmaxBufExecution::SoftmaxBufExecution(const std::vector<Tensor *> &inputs, in
    : Execution(backend) {
    mAxis          = axis;
    mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
-    buildSoftmaxKernel();
}

bool SoftmaxBufExecution::buildSoftmaxKernel() {

@@ -43,10 +42,27 @@ bool SoftmaxBufExecution::buildSoftmaxKernel() {
ErrorCode SoftmaxBufExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    Tensor *input  = inputs[0];
    Tensor *output = outputs[0];

+    const auto dims = input->buffer().dimensions;
+    int inside  = 1;
+    int outside = 1;
+    int channel = 1;
+    for (int i = 0; i < mAxis; ++i) {
+        outside *= input->length(i);
+    }
+    channel = input->length(mAxis);
+    for (int i = mAxis + 1; i < dims; ++i) {
+        inside *= input->length(i);
+    }

    std::vector<int> inputShape  = tensorShapeFormat(input);
    std::vector<int> outputShape = tensorShapeFormat(output);

+    const int inputBatch    = inputShape.at(0);
+    const int inputHeight   = inputShape.at(1);
+    const int inputWidth    = inputShape.at(2);
+    const int inputChannels = inputShape.at(3);

    const int outputBatch  = outputShape.at(0);
    const int outputHeight = outputShape.at(1);
    const int outputWidth  = outputShape.at(2);

@@ -54,9 +70,18 @@ ErrorCode SoftmaxBufExecution::onResize(const std::vector<Tensor *> &inputs, con
    const int channelBlocks  = UP_DIV(outputChannels, 4);
    const int remainChannels = channelBlocks * 4 - outputChannels;
+    if(inputBatch == outside && channel == inputChannels && inside == inputWidth * inputHeight){
+        mAxis = 1;
+    }else if(inputBatch * inputChannels == outside && channel == inputHeight && inside == inputHeight){
+        mAxis = 2;
+    }else if(inputBatch * inputChannels * inputHeight == outside && channel == inputWidth && inside == 1){
+        mAxis = 3;
+    }
+    buildSoftmaxKernel();

    if (mAxis == 1) {
-        mGlobalWorkSize = {static_cast<uint32_t>(channelBlocks), static_cast<uint32_t>(outputWidth),
-                           static_cast<uint32_t>(outputHeight * outputBatch)};
+        mGlobalWorkSize = {static_cast<uint32_t>(outputWidth),
+                           static_cast<uint32_t>(outputHeight * outputBatch), 1};
        int shape[] = {outputBatch, channelBlocks, outputHeight, outputWidth};

        uint32_t idx = 0;

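For reference, a minimal sketch (illustrative only, not MNN code) of how the outside/channel/inside decomposition above maps a general softmax axis onto the branches checked before buildSoftmaxKernel(): for an assumed NCHW input of shape {2, 8, 4, 4} with mAxis = 1, outside = 2, channel = 8 and inside = 16, which satisfies the first branch (batch == outside, channel == inputChannels, inside == H*W), so the kernel treats it as a channel softmax.

    // Hypothetical helper mirroring the decomposition in onResize above.
    #include <vector>
    #include <cstdio>

    int main() {
        std::vector<int> shape = {2, 8, 4, 4};  // N, C, H, W (assumed example shape)
        int axis = 1;
        int outside = 1, inside = 1;
        for (int i = 0; i < axis; ++i)                     outside *= shape[i];
        int channel = shape[axis];
        for (int i = axis + 1; i < (int)shape.size(); ++i) inside  *= shape[i];
        // outside = 2, channel = 8, inside = 16 -> softmax over the channel axis
        std::printf("outside=%d channel=%d inside=%d\n", outside, channel, inside);
        return 0;
    }
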
@@ -132,10 +157,6 @@ class SoftmaxBufCreator : public OpenCLBackend::Creator {
public:
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const override {
-        if(inputs[0]->dimensions() == 3 || outputs[0]->dimensions() == 3){
-            MNN_PRINT("softmax not support dimensions == 3 \n");
-            return nullptr;
-        }
        for (int i = 0; i < inputs.size(); ++i) {
            TensorUtils::setTensorSupportPack(inputs[i], false);
        }

@@ -0,0 +1,254 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define GLOBAL_SIZE_3_DIMS \
    __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,

#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
    if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
        return; \
    }

__kernel void argmax_width_buf(GLOBAL_SIZE_3_DIMS
                            __global const FLOAT* input,
                            __global FLOAT* output,
                            __private const int inputWidth,
                            __private const int inputHeight,
                            __private const int inputChannel,
                            __private const int inputBatch,
                            __private const int inputChannelBlock,
                            __private const int oututWidth,
                            __private const int outputHeight,
                            __private const int outputChannel,
                            __private const int outputChannelBlock
                            ) {
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_channel_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);

    const int batch_idx = batch_channel_idx / outputChannelBlock;
    const int channel_idx = batch_channel_idx % outputChannelBlock;

    const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + 0)*4;
    const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + 0)*4;
    int4 index = 0;
    FLOAT4 maxValue = vload4(0, input + offset);
    for(int i = 1; i < inputWidth; ++i){
        FLOAT4 value = vload4(i, input + offset);
#ifdef ARGMAX
        index = maxValue < value ? (int4)i : index;
        maxValue = fmax(maxValue, value);
#else
        index = maxValue > value ? (int4)i : index;
        maxValue = fmin(maxValue, value);
#endif
    }
    vstore4(CONVERT_FLOAT4(index), 0, output + outputOffset);
}


__kernel void argmax_height_buf(GLOBAL_SIZE_3_DIMS
                            __global const FLOAT* input,
                            __global FLOAT* output,
                            __private const int inputWidth,
                            __private const int inputHeight,
                            __private const int inputChannel,
                            __private const int inputBatch,
                            __private const int inputChannelBlock,
                            __private const int oututWidth,
                            __private const int outputHeight,
                            __private const int outputChannel,
                            __private const int outputChannelBlock
                            ) {
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_channel_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);

    const int batch_idx = batch_channel_idx / outputChannelBlock;
    const int channel_idx = batch_channel_idx % outputChannelBlock;

    const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4;
    const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * oututWidth + width_idx)*4;
    int4 index = 0;
    FLOAT4 maxValue = vload4(0, input + offset);
    for(int i = 1; i < inputHeight; ++i){
        FLOAT4 value = vload4(i * inputWidth, input + offset);
#ifdef ARGMAX
        index = maxValue < value ? (int4)i : index;
        maxValue = fmax(maxValue, value);
#else
        index = maxValue > value ? (int4)i : index;
        maxValue = fmin(maxValue, value);
#endif
    }
    vstore4(CONVERT_FLOAT4(index), 0, output + outputOffset);
}

__kernel void argmax_channel_buf(GLOBAL_SIZE_3_DIMS
                            __global const FLOAT* input,
                            __global FLOAT* output,
                            __private const int inputWidth,
                            __private const int inputHeight,
                            __private const int inputChannel,
                            __private const int inputBatch,
                            __private const int inputChannelBlock,
                            __private const int oututWidth,
                            __private const int outputHeight,
                            __private const int outputChannel,
                            __private const int outputChannelBlock
                            ) {
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);

    const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
    const int outputOffset = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * oututWidth + width_idx)*4;
    int index = 0;
    int remain = inputChannel - (inputChannelBlock - 1) * 4;
#ifdef ARGMAX
    FLOAT maxValue = (FLOAT)-FLT_MAX;
#else
    FLOAT maxValue = (FLOAT)FLT_MAX;
#endif
    FLOAT4 value;
    FLOAT *valuePtr = (FLOAT*)&value;
    for(int i = 0; i < inputChannelBlock - 1; ++i){
        value = vload4(i * inputWidth * inputHeight, input + offset);
        for(int j = 0; j < 4; ++j){
#ifdef ARGMAX
            if(maxValue < valuePtr[j]){
                index = i * 4 + j;
                maxValue = valuePtr[j];
            }
#else
            if(maxValue > valuePtr[j]){
                index = i * 4 + j;
                maxValue = valuePtr[j];
            }
#endif
        }
    }
    value = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
    for(int j = 0; j < remain; ++j){
#ifdef ARGMAX
        if(maxValue < valuePtr[j]){
            index = (inputChannelBlock - 1) * 4 + j;
            maxValue = valuePtr[j];
        }
#else
        if(maxValue > valuePtr[j]){
            index = (inputChannelBlock - 1) * 4 + j;
            maxValue = valuePtr[j];
        }
#endif
    }
    output[outputOffset] = (FLOAT)index;
}

__kernel void argmax_channel_dim1_buf(GLOBAL_SIZE_3_DIMS
                            __global const FLOAT* input,
                            __global FLOAT* output,
                            __private const int inputWidth,
                            __private const int inputHeight,
                            __private const int inputChannel,
                            __private const int inputBatch,
                            __private const int inputChannelBlock,
                            __private const int oututWidth,
                            __private const int outputHeight,
                            __private const int outputChannel,
                            __private const int outputChannelBlock
                            ) {
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);

    const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
    const int outputOffset = ((batch_idx * outputHeight + height_idx) * oututWidth + width_idx);
    int index = 0;
    int remain = inputChannel - (inputChannelBlock - 1) * 4;
#ifdef ARGMAX
    FLOAT maxValue = (FLOAT)-FLT_MAX;
#else
    FLOAT maxValue = (FLOAT)FLT_MAX;
#endif
    FLOAT4 value;
    FLOAT *valuePtr = (FLOAT*)&value;
    for(int i = 0; i < inputChannelBlock - 1; ++i){
        value = vload4(i * inputWidth * inputHeight, input + offset);
        for(int j = 0; j < 4; ++j){
#ifdef ARGMAX
            if(maxValue < valuePtr[j]){
                index = i * 4 + j;
                maxValue = valuePtr[j];
            }
#else
            if(maxValue > valuePtr[j]){
                index = i * 4 + j;
                maxValue = valuePtr[j];
            }
#endif
        }
    }
    value = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
    for(int j = 0; j < remain; ++j){
#ifdef ARGMAX
        if(maxValue < valuePtr[j]){
            index = (inputChannelBlock - 1) * 4 + j;
            maxValue = valuePtr[j];
        }
#else
        if(maxValue > valuePtr[j]){
            index = (inputChannelBlock - 1) * 4 + j;
            maxValue = valuePtr[j];
        }
#endif
    }
    output[outputOffset] = (FLOAT)index;
}


__kernel void argmax_batch_buf(GLOBAL_SIZE_3_DIMS
                            __global const FLOAT* input,
                            __global FLOAT* output,
                            __private const int inputWidth,
                            __private const int inputHeight,
                            __private const int inputChannel,
                            __private const int inputBatch,
                            __private const int inputChannelBlock,
                            __private const int oututWidth,
                            __private const int outputHeight,
                            __private const int outputChannel,
                            __private const int outputChannelBlock
                            ) {
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int channel_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, channel_idx);

    const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4;
    const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + width_idx)*4;
    int4 index = 0;
    int batchOffset = inputChannelBlock * inputHeight * inputWidth;
    FLOAT4 maxValue = vload4(0, input + offset);
    for(int i = 1; i < inputBatch; ++i){
        FLOAT4 value = vload4(i * batchOffset, input + offset);
#ifdef ARGMAX
        index = maxValue < value ? (int4)i : index;
        maxValue = fmax(maxValue, value);
#else
        index = maxValue > value ? (int4)i : index;
        maxValue = fmin(maxValue, value);
#endif
    }
    vstore4(CONVERT_FLOAT4(index), 0, output + outputOffset);
}

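The vector compare in the kernels above keeps one running extremum and one running index per FLOAT4 lane, and ties keep the first index seen. A scalar sketch of the same update rule, for one lane only (illustrative, not the kernel code):

    #include <cstddef>

    // One lane of the argmax_*_buf update loop, written out in plain C++ (ARGMAX case;
    // the argmin build flips the comparison, exactly as the #ifdef in the kernel does).
    static int argmaxLane(const float* data, size_t n) {
        int   index    = 0;
        float maxValue = data[0];
        for (size_t i = 1; i < n; ++i) {
            if (maxValue < data[i]) {   // strict '<' keeps the first index on ties
                maxValue = data[i];
                index    = static_cast<int>(i);
            }
        }
        return index;
    }
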
@@ -0,0 +1,38 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define GLOBAL_SIZE_3_DIMS \
    __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,

#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
    if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
        return; \
    }

__kernel void cast_buf(GLOBAL_SIZE_3_DIMS
                            __global FLOAT* input,
                            __global FLOAT* output,
                            __private const int width,
                            __private const int height,
                            __private const int channelBlock
                            ) {
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_channel_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);

    const int batch_idx = batch_channel_idx / channelBlock;
    const int channel_idx = batch_channel_idx % channelBlock;

    const int inp_offset = ((((batch_idx * channelBlock) + channel_idx) * height + height_idx) * width + width_idx)*4;
#ifdef TO_BOOL
    int4 value = convert_int4(vload4(0, input + inp_offset));
    value = value == (int4)0 ? (int4)0 : (int4)1;
    vstore4(CONVERT_FLOAT4(value), 0, output + inp_offset);
#else
    FLOAT4 value = vload4(0, input + inp_offset);
    vstore4(value, 0, output + inp_offset);
#endif
}

File diff suppressed because one or more lines are too long
@@ -0,0 +1,40 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define GLOBAL_SIZE_3_DIMS \
    __private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,

#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
    if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
        return; \
    }

__kernel void range_buf(GLOBAL_SIZE_3_DIMS
                        __global const FLOAT* input0,
                        __global const FLOAT* input2,
                        __global FLOAT* output,
                        __private const int width,
                        __private const int height,
                        __private const int channel,
                        __private const int channelBlock
                        ) {
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_channel_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);

    const int batch_idx = batch_channel_idx / channelBlock;
    const int channel_idx = batch_channel_idx % channelBlock;

    const int offset = ((((batch_idx * channelBlock) + channel_idx) * height + height_idx) * width + width_idx)*4;
    const int channel4 = channel_idx << 2;
    int index = (((batch_idx * channel) + channel4) * height + height_idx) * width + width_idx;
    int size = height * width;
    int4 index4 = (int4)(index, index + size, index + size * 2, index + size * 3);
    FLOAT start = input0[0];
    FLOAT step = input2[0];
    FLOAT4 value = (FLOAT4)start + CONVERT_FLOAT4(index4) * (FLOAT4)step;
    vstore4(value, 0, output + offset);
}

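The index4 arithmetic above converts an NC4HW4 element position back to its flat NCHW index before applying start + index * step. A small worked sketch with assumed example dimensions (illustrative only, not MNN code):

    #include <cstdio>

    // Recomputes the flat NCHW indices that range_buf assigns to one FLOAT4 store.
    int main() {
        // Assumed example dimensions (not from the diff): W=3, H=2, C=6 -> channelBlock=2.
        const int width = 3, height = 2, channel = 6;
        const int batch_idx = 0, channel_idx = 1, height_idx = 1, width_idx = 2;
        const int channel4 = channel_idx << 2;   // first real channel covered by this block
        int index = (((batch_idx * channel) + channel4) * height + height_idx) * width + width_idx;
        const int size = height * width;
        int index4[4] = {index, index + size, index + size * 2, index + size * 3};
        for (int v : index4) std::printf("%d ", v);   // prints: 29 35 41 47
        std::printf("\n");
        return 0;
    }
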
@@ -11,308 +11,285 @@
#define GLOBAL_SIZE_2_DIMS \
__private const int global_size_dim0, __private const int global_size_dim1,

#define GLOBAL_SIZE_3_DIMS \
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,

#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
    if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
        return; \
    }

__constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

[removed in this hunk: the old per-operation image kernels reduct_general_mean / _sum / _max / _min / _mul and their *_local variants (GLOBAL_SIZE_2_DIMS, parameters batch/height/width/channel). Each looped over height, folded the FLOAT4 lanes across channel into a single scalar, and wrote that scalar per (width, batch) position; the *_local variants did the same through a fixed 256-entry __local buffer with a halving reduction. They are replaced by the generic OPERATE/VALUE kernels below.]

__kernel void reduct_width(GLOBAL_SIZE_3_DIMS
                            __read_only image2d_t input,
                            __write_only image2d_t output,
                            __private const int inputWidth,
                            __private const int inputHeight,
                            __private const int inputChannel,
                            __private const int inputBatch,
                            __private const int inputChannelBlock,
                            __private const int oututWidth,
                            __private const int outputHeight,
                            __private const int outputChannel,
                            __private const int outputChannelBlock
                            ) {
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_channel_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);

    const int batch_idx = batch_channel_idx / outputChannelBlock;
    const int channel_idx = batch_channel_idx % outputChannelBlock;
    const int bh = batch_idx*inputHeight+height_idx;
    const int wc = channel_idx*inputWidth;
    FLOAT4 out = (FLOAT4)VALUE;

#if LOCAL_SIZE > 0
    const int lid = get_local_id(0);
    FLOAT4 local sum[LOCAL_SIZE];
    for(int i = lid; i < inputWidth; i+=LOCAL_SIZE){
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc+i, bh));
        out = OPERATE(out, in);
    }
    sum[lid] = out;
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
        if (lid < i)
            sum[lid] = OPERATE(sum[lid], sum[lid + i]);
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    out = sum[0];
#else
    for(int i = 0; i < inputWidth; ++i){
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc+i, bh));
        out = OPERATE(out, in);
    }
#endif

#ifdef GET_AVG
    out = out / inputWidth;
#endif
    WI_F(output, (int2)(channel_idx, bh), out);
}


__kernel void reduct_height(GLOBAL_SIZE_3_DIMS
                            __read_only image2d_t input,
                            __write_only image2d_t output,
                            __private const int inputWidth,
                            __private const int inputHeight,
                            __private const int inputChannel,
                            __private const int inputBatch,
                            __private const int inputChannelBlock,
                            __private const int oututWidth,
                            __private const int outputHeight,
                            __private const int outputChannel,
                            __private const int outputChannelBlock
                            ) {
#if LOCAL_SIZE > 0
    const int width_local_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_channel_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_channel_idx);

    const int width_idx = get_group_id(0);
    const int batch_idx = batch_channel_idx / outputChannelBlock;
    const int channel_idx = batch_channel_idx % outputChannelBlock;

    const int bh = batch_idx*inputHeight;
    const int wc = channel_idx*inputWidth+width_idx;
    const int lid = get_local_id(0);
    FLOAT4 local sum[LOCAL_SIZE];
    FLOAT4 out = (FLOAT4)VALUE;
    for(int i = lid; i < inputHeight; i+=LOCAL_SIZE){
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, bh+i));
        out = OPERATE(out, in);
    }
    sum[lid] = out;
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
        if (lid < i)
            sum[lid] = OPERATE(sum[lid], sum[lid + i]);
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    out = sum[0];
#else

    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_channel_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);

    const int batch_idx = batch_channel_idx / outputChannelBlock;
    const int channel_idx = batch_channel_idx % outputChannelBlock;

    const int bh = batch_idx*inputHeight;
    const int wc = channel_idx*inputWidth+width_idx;
    FLOAT4 out = (FLOAT4)VALUE;
    for(int i = 0; i < inputHeight; ++i){
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, bh+i));
        out = OPERATE(out, in);
    }
#endif

#ifdef GET_AVG
    out = out / inputHeight;
#endif
    WI_F(output, (int2)(wc, batch_idx), out);
}

__kernel void reduct_channel(GLOBAL_SIZE_3_DIMS
                            __read_only image2d_t input,
                            __write_only image2d_t output,
                            __private const int inputWidth,
                            __private const int inputHeight,
                            __private const int inputChannel,
                            __private const int inputBatch,
                            __private const int inputChannelBlock,
                            __private const int oututWidth,
                            __private const int outputHeight,
                            __private const int outputChannel,
                            __private const int outputChannelBlock
                            ) {
#if LOCAL_SIZE > 0
    const int width_local_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_idx);
    const int width_idx = get_group_id(0);

    const int bh = batch_idx*inputHeight+height_idx;
    const int wc = width_idx;
    int remain = inputChannel - (inputChannelBlock - 1) * 4;
    const int lid = get_local_id(0);
    FLOAT local sum[LOCAL_SIZE];
    FLOAT4 out = (FLOAT4)VALUE;
    FLOAT4 in;
    FLOAT *inPtr = (FLOAT*)&in;
    for(int i = lid; i < inputChannelBlock - 1; i += LOCAL_SIZE){
        in = RI_F(input, SAMPLER, (int2)(i*inputWidth+wc, bh));
        out = OPERATE(out, in);
    }
    out.x = OPERATE(out.x, out.y);
    out.x = OPERATE(out.x, out.z);
    out.x = OPERATE(out.x, out.w);
    sum[lid] = out.x;
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
        if (lid < i)
            sum[lid] = OPERATE(sum[lid], sum[lid + i]);
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    out.x = sum[0];
    in = RI_F(input, SAMPLER, (int2)((inputChannelBlock - 1)*inputWidth+wc, bh));
    for(int j = 0; j < remain; ++j){
        out.x = OPERATE(out.x, inPtr[j]);
    }
#ifdef GET_AVG
    out.x = out.x / inputChannel;
#endif
    WI_F(output, (int2)(wc, bh), (FLOAT4)(out.x, 0, 0, 0));

#else
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);

    const int bh = batch_idx*inputHeight+height_idx;
    const int wc = width_idx;
    int remain = inputChannel - (inputChannelBlock - 1) * 4;

    FLOAT out = (FLOAT)VALUE;
    FLOAT4 in;
    FLOAT *inPtr = (FLOAT*)&in;

    for(int i = 0; i < inputChannelBlock - 1; ++i){
        in = RI_F(input, SAMPLER, (int2)(i*inputWidth+wc, bh));
        for(int j = 0; j < 4; ++j){
            out = OPERATE(out, inPtr[j]);
        }
    }
    in = RI_F(input, SAMPLER, (int2)((inputChannelBlock - 1)*inputWidth+wc, bh));
    for(int j = 0; j < remain; ++j){
        out = OPERATE(out, inPtr[j]);
    }
#ifdef GET_AVG
    out = out / inputChannel;
#endif
    WI_F(output, (int2)(wc, bh), (FLOAT4)(out, 0, 0, 0));
#endif
}

__kernel void reduct_batch(GLOBAL_SIZE_3_DIMS
                            __read_only image2d_t input,
                            __write_only image2d_t output,
                            __private const int inputWidth,
                            __private const int inputHeight,
                            __private const int inputChannel,
                            __private const int inputBatch,
                            __private const int inputChannelBlock,
                            __private const int oututWidth,
                            __private const int outputHeight,
                            __private const int outputChannel,
                            __private const int outputChannelBlock
                            ) {
#if LOCAL_SIZE > 0
    const int width_local_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int channel_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, channel_idx);
    const int width_idx = get_group_id(0);

    const int bh = height_idx;
    const int wc = channel_idx*inputWidth+width_idx;
    int batchOffset = inputChannelBlock * inputHeight * inputWidth;
    const int lid = get_local_id(0);
    FLOAT4 local sum[LOCAL_SIZE];
    FLOAT4 out = (FLOAT4)VALUE;
    for(int i = lid; i < inputBatch; i+=LOCAL_SIZE){
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, i*inputHeight+bh));
        out = OPERATE(out, in);
    }
    sum[lid] = out;
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
        if (lid < i)
            sum[lid] = OPERATE(sum[lid], sum[lid + i]);
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    out = sum[0];
#ifdef GET_AVG
    out = out / inputBatch;
#endif
    WI_F(output, (int2)(wc, bh), out);
#else
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int channel_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, channel_idx);

    const int bh = height_idx;
    const int wc = channel_idx*inputWidth+width_idx;
    int batchOffset = inputChannelBlock * inputHeight * inputWidth;
    FLOAT4 out = (FLOAT4)VALUE;
    for(int i = 0; i < inputBatch; ++i){
        FLOAT4 in = RI_F(input, SAMPLER, (int2)(wc, i*inputHeight+bh));
        out = OPERATE(out, in);
    }
#ifdef GET_AVG
    out = out / inputBatch;
#endif
    WI_F(output, (int2)(wc, bh), out);
#endif
}

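All of the LOCAL_SIZE variants above (and in the buffer kernels below) use the same work-group tree reduction: each work-item accumulates a strided partial result, then the partials are halved log2(LOCAL_SIZE) times, which is why LOCAL_SIZE is expected to be a power of two. A plain C++ sketch of that second phase, as a single-threaded stand-in for the barrier-separated loop (illustrative only):

    #include <vector>

    // Single-threaded stand-in for the halving loop in the kernels above.
    // op plays the role of the OPERATE macro (addition for SUM/MEAN, max for MAXIMUM, ...).
    template <typename Op>
    float treeReduce(std::vector<float> partial, Op op) {
        // partial.size() plays the role of LOCAL_SIZE and must be a power of two.
        for (size_t stride = partial.size() / 2; stride > 0; stride /= 2) {
            for (size_t lid = 0; lid < stride; ++lid) {      // all lids run in parallel on the GPU
                partial[lid] = op(partial[lid], partial[lid + stride]);
            }
            // barrier(CLK_LOCAL_MEM_FENCE) separates the rounds in the OpenCL version
        }
        return partial[0];
    }
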
@ -9,31 +9,363 @@
|
||||||
#define GLOBAL_SIZE_2_DIMS \
|
#define GLOBAL_SIZE_2_DIMS \
|
||||||
__private const int global_size_dim0, __private const int global_size_dim1,
|
__private const int global_size_dim0, __private const int global_size_dim1,
|
||||||
|
|
||||||
__kernel void reduct_buf(GLOBAL_SIZE_2_DIMS
|
#define GLOBAL_SIZE_3_DIMS \
|
||||||
|
__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2,
|
||||||
|
|
||||||
|
#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) \
|
||||||
|
if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { \
|
||||||
|
return; \
|
||||||
|
}
|
||||||
|
|
||||||
|
__kernel void reduct_width_buf(GLOBAL_SIZE_3_DIMS
|
||||||
__global const FLOAT* input,
|
__global const FLOAT* input,
|
||||||
__global FLOAT* output,
|
__global FLOAT* output,
|
||||||
__private const int batch,
|
__private const int inputWidth,
|
||||||
__private const int height,
|
__private const int inputHeight,
|
||||||
__private const int width,
|
__private const int inputChannel,
|
||||||
__private const int channel
|
__private const int inputBatch,
|
||||||
|
__private const int inputChannelBlock,
|
||||||
|
__private const int oututWidth,
|
||||||
|
__private const int outputHeight,
|
||||||
|
__private const int outputChannel,
|
||||||
|
__private const int outputChannelBlock
|
||||||
) {
|
) {
|
||||||
const int batch_idx = get_global_id(0);
|
const int width_idx = get_global_id(0);
|
||||||
const int width_idx = get_global_id(1);
|
const int height_idx = get_global_id(1);
|
||||||
|
const int batch_channel_idx = get_global_id(2);
|
||||||
|
|
||||||
const int inp_offset = ((batch_idx * height + 0) * width + width_idx)*4;
|
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||||
FLOAT4 out = vload4(0, input + inp_offset);
|
|
||||||
for (int h = 1; h < height; h++) {
|
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||||
FLOAT4 in = vload4(0, input + inp_offset + h*width*4);
|
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||||
|
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + 0)*4;
|
||||||
|
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + 0)*4;
|
||||||
|
FLOAT4 out = (FLOAT4)VALUE;
|
||||||
|
|
||||||
|
#if LOCAL_SIZE > 0
|
||||||
|
const int lid = get_local_id(0);
|
||||||
|
FLOAT4 local sum[LOCAL_SIZE];
|
||||||
|
for(int i = lid; i < inputWidth; i+=LOCAL_SIZE){
|
||||||
|
FLOAT4 in = vload4(i, input + offset);
|
||||||
out = OPERATE(out, in);
|
out = OPERATE(out, in);
|
||||||
}
|
}
|
||||||
FLOAT* out_ptr = (FLOAT*)&out;
|
sum[lid] = out;
|
||||||
for(int c = 1; c < channel; ++c){
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
out.x = OPERATE(out.x, out_ptr[c]);
|
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||||
|
if (lid < i)
|
||||||
|
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
}
|
}
|
||||||
|
out = sum[0];
|
||||||
#ifdef GET_AVG
|
#else
|
||||||
out.x = out.x / (height * channel);
|
for(int i = 0; i < inputWidth; ++i){
|
||||||
#endif
|
FLOAT4 in = vload4(i, input + offset);
|
||||||
const int out_offset = batch_idx * width + width_idx;
|
out = OPERATE(out, in);
|
||||||
vstore4((FLOAT4)(out.x, 0.0, 0.0, 0.0), out_offset, output);
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GET_AVG
|
||||||
|
out = out / inputWidth;
|
||||||
|
#endif
|
||||||
|
vstore4(out, 0, output + outputOffset);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
__kernel void reduct_height_buf(GLOBAL_SIZE_3_DIMS
|
||||||
|
__global const FLOAT* input,
|
||||||
|
__global FLOAT* output,
|
||||||
|
__private const int inputWidth,
|
||||||
|
__private const int inputHeight,
|
||||||
|
__private const int inputChannel,
|
||||||
|
__private const int inputBatch,
|
||||||
|
__private const int inputChannelBlock,
|
||||||
|
__private const int oututWidth,
|
||||||
|
__private const int outputHeight,
|
||||||
|
__private const int outputChannel,
|
||||||
|
__private const int outputChannelBlock
|
||||||
|
) {
|
||||||
|
#if LOCAL_SIZE > 0
|
||||||
|
const int width_local_idx = get_global_id(0);
|
||||||
|
const int height_idx = get_global_id(1);
|
||||||
|
const int batch_channel_idx = get_global_id(2);
|
||||||
|
|
||||||
|
DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_channel_idx);
|
||||||
|
|
||||||
|
const int width_idx = get_group_id(0);
|
||||||
|
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||||
|
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||||
|
|
||||||
|
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4;
|
||||||
|
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * oututWidth + width_idx)*4;
|
||||||
|
const int lid = get_local_id(0);
|
||||||
|
FLOAT4 local sum[LOCAL_SIZE];
|
||||||
|
FLOAT4 out = (FLOAT4)VALUE;
|
||||||
|
for(int i = lid; i < inputHeight; i+=LOCAL_SIZE){
|
||||||
|
FLOAT4 in = vload4(i * inputWidth, input + offset);
|
||||||
|
out = OPERATE(out, in);
|
||||||
|
}
|
||||||
|
sum[lid] = out;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
|
||||||
|
if (lid < i)
|
||||||
|
sum[lid] = OPERATE(sum[lid], sum[lid + i]);
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
out = sum[0];
|
||||||
|
#else
|
||||||
|
|
||||||
|
const int width_idx = get_global_id(0);
|
||||||
|
const int height_idx = get_global_id(1);
|
||||||
|
const int batch_channel_idx = get_global_id(2);
|
||||||
|
|
||||||
|
DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_channel_idx);
|
||||||
|
|
||||||
|
const int batch_idx = batch_channel_idx / outputChannelBlock;
|
||||||
|
const int channel_idx = batch_channel_idx % outputChannelBlock;
|
||||||
|
|
||||||
|
const int offset = ((((batch_idx * inputChannelBlock) + channel_idx) * inputHeight + 0) * inputWidth + width_idx)*4;
|
||||||
|
const int outputOffset = ((((batch_idx * outputChannelBlock) + channel_idx) * outputHeight + 0) * oututWidth + width_idx)*4;
|
||||||
|
FLOAT4 out = (FLOAT4)VALUE;
|
||||||
|
for(int i = 0; i < inputHeight; ++i){
|
||||||
|
FLOAT4 in = vload4(i * inputWidth, input + offset);
|
||||||
|
out = OPERATE(out, in);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GET_AVG
|
||||||
|
out = out / inputHeight;
|
||||||
|
#endif
|
||||||
|
vstore4(out, 0, output + outputOffset);
|
||||||
|
}
|
||||||
|
|
||||||
|
__kernel void reduct_channel_buf(GLOBAL_SIZE_3_DIMS
                            __global const FLOAT* input,
                            __global FLOAT* output,
                            __private const int inputWidth,
                            __private const int inputHeight,
                            __private const int inputChannel,
                            __private const int inputBatch,
                            __private const int inputChannelBlock,
                            __private const int oututWidth,
                            __private const int outputHeight,
                            __private const int outputChannel,
                            __private const int outputChannelBlock
                            ) {
#if LOCAL_SIZE > 0
    const int width_local_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_idx);
    const int width_idx = get_group_id(0);

    const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
    const int outputOffset = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * oututWidth + width_idx)*4;
    int remain = inputChannel - (inputChannelBlock - 1) * 4;
    const int lid = get_local_id(0);
    FLOAT local sum[LOCAL_SIZE];
    FLOAT4 out = (FLOAT4)VALUE;
    FLOAT4 in;
    FLOAT *inPtr = (FLOAT*)&in;
    for(int i = lid; i < inputChannelBlock - 1; i += LOCAL_SIZE){
        in = vload4(i * inputWidth * inputHeight, input + offset);
        out = OPERATE(out, in);
    }
    out.x = OPERATE(out.x, out.y);
    out.x = OPERATE(out.x, out.z);
    out.x = OPERATE(out.x, out.w);
    sum[lid] = out.x;
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
        if (lid < i)
            sum[lid] = OPERATE(sum[lid], sum[lid + i]);
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    out.x = sum[0];
    in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
    for(int j = 0; j < remain; ++j){
        out.x = OPERATE(out.x, inPtr[j]);
    }
#ifdef GET_AVG
    out.x = out.x / inputChannel;
#endif
    output[outputOffset] = out.x;

#else
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);

    const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
    const int outputOffset = ((((batch_idx * outputChannelBlock) + 0) * outputHeight + height_idx) * oututWidth + width_idx)*4;
    int remain = inputChannel - (inputChannelBlock - 1) * 4;

    FLOAT out = (FLOAT)VALUE;
    FLOAT4 in;
    FLOAT *inPtr = (FLOAT*)&in;
    for(int i = 0; i < inputChannelBlock - 1; ++i){
        in = vload4(i * inputWidth * inputHeight, input + offset);
        for(int j = 0; j < 4; ++j){
            out = OPERATE(out, inPtr[j]);
        }
    }
    in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
    for(int j = 0; j < remain; ++j){
        out = OPERATE(out, inPtr[j]);
    }
#ifdef GET_AVG
    out = out / inputChannel;
#endif
    output[outputOffset] = out;
#endif
}

__kernel void reduct_channel_dim1_buf(GLOBAL_SIZE_3_DIMS
                            __global const FLOAT* input,
                            __global FLOAT* output,
                            __private const int inputWidth,
                            __private const int inputHeight,
                            __private const int inputChannel,
                            __private const int inputBatch,
                            __private const int inputChannelBlock,
                            __private const int oututWidth,
                            __private const int outputHeight,
                            __private const int outputChannel,
                            __private const int outputChannelBlock
                            ) {
#if LOCAL_SIZE > 0
    const int width_local_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, batch_idx);
    const int width_idx = get_group_id(0);

    const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
    const int outputOffset = ((batch_idx * outputHeight + height_idx) * oututWidth + width_idx);
    int remain = inputChannel - (inputChannelBlock - 1) * 4;
    const int lid = get_local_id(0);
    FLOAT local sum[LOCAL_SIZE];
    FLOAT4 out = (FLOAT4)VALUE;
    FLOAT4 in;
    FLOAT *inPtr = (FLOAT*)&in;
    for(int i = lid; i < inputChannelBlock - 1; i += LOCAL_SIZE){
        in = vload4(i * inputWidth * inputHeight, input + offset);
        out = OPERATE(out, in);
    }
    out.x = OPERATE(out.x, out.y);
    out.x = OPERATE(out.x, out.z);
    out.x = OPERATE(out.x, out.w);
    sum[lid] = out.x;
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
        if (lid < i)
            sum[lid] = OPERATE(sum[lid], sum[lid + i]);
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    out.x = sum[0];
    in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
    for(int j = 0; j < remain; ++j){
        out.x = OPERATE(out.x, inPtr[j]);
    }
#ifdef GET_AVG
    out.x = out.x / inputChannel;
#endif
    output[outputOffset] = out.x;

#else
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int batch_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, batch_idx);
    const int offset = ((((batch_idx * inputChannelBlock) + 0) * inputHeight + height_idx) * inputWidth + width_idx)*4;
    const int outputOffset = ((batch_idx * outputHeight + height_idx) * oututWidth + width_idx);
    int remain = inputChannel - (inputChannelBlock - 1) * 4;
    FLOAT out = (FLOAT)VALUE;
    FLOAT4 in;
    FLOAT *inPtr = (FLOAT*)&in;
    for(int i = 0; i < inputChannelBlock - 1; ++i){
        in = vload4(i * inputWidth * inputHeight, input + offset);
        for(int j = 0; j < 4; ++j){
            out = OPERATE(out, inPtr[j]);
        }
    }
    in = vload4((inputChannelBlock - 1) * inputWidth * inputHeight, input + offset);
    for(int j = 0; j < remain; ++j){
        out = OPERATE(out, inPtr[j]);
    }
#ifdef GET_AVG
    out = out / inputChannel;
#endif
    output[outputOffset] = out;
#endif
}

__kernel void reduct_batch_buf(GLOBAL_SIZE_3_DIMS
                            __global const FLOAT* input,
                            __global FLOAT* output,
                            __private const int inputWidth,
                            __private const int inputHeight,
                            __private const int inputChannel,
                            __private const int inputBatch,
                            __private const int inputChannelBlock,
                            __private const int oututWidth,
                            __private const int outputHeight,
                            __private const int outputChannel,
                            __private const int outputChannelBlock
                            ) {
#if LOCAL_SIZE > 0
    const int width_local_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int channel_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_local_idx, height_idx, channel_idx);
    const int width_idx = get_group_id(0);

    const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4;
    const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + width_idx)*4;
    int batchOffset = inputChannelBlock * inputHeight * inputWidth;
    const int lid = get_local_id(0);
    FLOAT4 local sum[LOCAL_SIZE];
    FLOAT4 out = (FLOAT4)VALUE;
    for(int i = lid; i < inputBatch; i+=LOCAL_SIZE){
        FLOAT4 in = vload4(i * batchOffset, input + offset);
        out = OPERATE(out, in);
    }
    sum[lid] = out;
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = LOCAL_SIZE/2; i > 0; i /= 2){
        if (lid < i)
            sum[lid] = OPERATE(sum[lid], sum[lid + i]);
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    out = sum[0];
#ifdef GET_AVG
    out = out / inputBatch;
#endif
    vstore4(out, 0, output + outputOffset);
#else
    const int width_idx = get_global_id(0);
    const int height_idx = get_global_id(1);
    const int channel_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(width_idx, height_idx, channel_idx);

    const int offset = ((((0 * inputChannelBlock) + channel_idx) * inputHeight + height_idx) * inputWidth + width_idx)*4;
    const int outputOffset = ((((0 * outputChannelBlock) + channel_idx) * outputHeight + height_idx) * oututWidth + width_idx)*4;
    int batchOffset = inputChannelBlock * inputHeight * inputWidth;
    FLOAT4 out = (FLOAT4)VALUE;
    for(int i = 0; i < inputBatch; ++i){
        FLOAT4 in = vload4(i * batchOffset, input + offset);
        out = OPERATE(out, in);
    }
#ifdef GET_AVG
    out = out / inputBatch;
#endif
    vstore4(out, 0, output + outputOffset);
#endif
}

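Note: the reduct_*_buf kernels above all share the same LOCAL_SIZE > 0 strategy: each work-item folds a strided slice of the reduced dimension with OPERATE, the per-item partials land in local memory, and a halving loop combines them in log2(LOCAL_SIZE) steps. A minimal single-threaded C++ sketch of that combine step (illustrative only; it assumes a power-of-two number of partials and uses std::max as a stand-in for OPERATE):

#include <algorithm>
#include <cstdio>
#include <vector>

// Sequential emulation of: for (i = LOCAL_SIZE/2; i > 0; i /= 2) if (lid < i) sum[lid] = OPERATE(sum[lid], sum[lid + i]);
// The barriers in the kernel are unnecessary here because everything runs on one thread.
static float treeReduce(std::vector<float> sum) {
    for (size_t i = sum.size() / 2; i > 0; i /= 2) {
        for (size_t lid = 0; lid < i; ++lid) {
            sum[lid] = std::max(sum[lid], sum[lid + i]); // OPERATE == max for a MAX reduction
        }
    }
    return sum[0];
}

int main() {
    std::printf("%f\n", treeReduce({1.f, 7.f, 3.f, 5.f})); // prints 7.000000
    return 0;
}
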
@ -0,0 +1,36 @@
#ifdef MNN_SUPPORT_FP16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#define GLOBAL_SIZE_2_DIMS \
    __private const int global_size_dim0, __private const int global_size_dim1,

#define DEAL_NON_UNIFORM_DIM2(input1, input2) \
    if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { \
        return; \
    }

__kernel void select_buf(GLOBAL_SIZE_2_DIMS
                         __global const FLOAT* select,
                         __global const FLOAT* input0,
                         __global const FLOAT* input1,
                         __global FLOAT* output
                         ) {
    const int idx = get_global_id(0);
    const int idy = get_global_id(1);

    DEAL_NON_UNIFORM_DIM2(idx, idy);
    if ((int)select[idx]) {
#ifdef INSIZE1_EUQAL_1
        output[idx] = input0[0];
#else
        output[idx] = input0[idx];
#endif
    } else {
#ifdef INSIZE2_EUQAL_1
        output[idx] = input1[0];
#else
        output[idx] = input1[idx];
#endif
    }
}

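For orientation, the new select_buf kernel is an elementwise ternary select, with the INSIZE1_EUQAL_1 / INSIZE2_EUQAL_1 build options broadcasting a one-element input. A rough host-side C++ equivalent (the function name and signature here are illustrative, not taken from the source):

#include <vector>

// out[i] = mask[i] ? a[i or 0] : b[i or 0]; broadcastA/broadcastB mirror the *_EUQAL_1 macros.
static std::vector<float> selectLike(const std::vector<float>& mask,
                                     const std::vector<float>& a, bool broadcastA,
                                     const std::vector<float>& b, bool broadcastB) {
    std::vector<float> out(mask.size());
    for (size_t i = 0; i < mask.size(); ++i) {
        out[i] = (static_cast<int>(mask[i]) != 0) ? a[broadcastA ? 0 : i]
                                                  : b[broadcastB ? 0 : i];
    }
    return out;
}
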
@ -15,90 +15,76 @@ __constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP |

__kernel void softmax_channel(GLOBAL_SIZE_3_DIMS __read_only image2d_t input, __write_only image2d_t output, __private const int output_channels,
                              __private const int remain_channels) {
                              __private const int remain_channels, __private const int4 shape // NCHW
                              ) {

    const int channel_block_idx = get_global_id(0);
    const int width_idx = get_global_id(0);
    const int width_idx = get_global_id(1);
    const int batch_height_idx = get_global_id(1);
    const int batch_height_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(channel_block_idx, width_idx, batch_height_idx);
    if (width_idx < shape.w && batch_height_idx < shape.x*shape.z) {

    const int width = global_size_dim1;
        FLOAT4 float_max_value = (FLOAT4)-FLT_MAX;
        FLOAT4 input_data;
        for (short i = 0; i < shape.y - 1; ++i) {
            input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * shape.w, batch_height_idx));
            float_max_value = max(float_max_value, input_data);
        }
        float_max_value.x = max(float_max_value.x, float_max_value.y);
        float_max_value.x = max(float_max_value.x, float_max_value.z);
        float_max_value.x = max(float_max_value.x, float_max_value.w);

    FLOAT float_max_value = -FLT_MAX;
        input_data = RI_F(input, SAMPLER, (int2)(width_idx + (shape.y - 1) * shape.w , batch_height_idx));
    FLOAT4 input_data;
        if (remain_channels == 0) {
    for (short i = 0; i < global_size_dim0 - 1; ++i) {
            float_max_value.x = max(float_max_value.x, input_data.x);
        input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * global_size_dim1, batch_height_idx));
            float_max_value.x = max(float_max_value.x, input_data.y);
        float_max_value = max(float_max_value, input_data.x);
            float_max_value.x = max(float_max_value.x, input_data.z);
        float_max_value = max(float_max_value, input_data.y);
            float_max_value.x = max(float_max_value.x, input_data.w);
        float_max_value = max(float_max_value, input_data.z);
        } else if (remain_channels == 1) {
        float_max_value = max(float_max_value, input_data.w);
            float_max_value.x = max(float_max_value.x, input_data.z);
            float_max_value.x = max(float_max_value.x, input_data.y);
            float_max_value.x = max(float_max_value.x, input_data.x);
        } else if (remain_channels == 2) {
            float_max_value.x = max(float_max_value.x, input_data.y);
            float_max_value.x = max(float_max_value.x, input_data.x);
        } else if (remain_channels == 3) {
            float_max_value.x = max(float_max_value.x, input_data.x);
        }

        FLOAT4 accum_result = 0;
        for (short i = 0; i < shape.y - 1; ++i) {
            input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * shape.w, batch_height_idx));
            input_data = EXP(input_data - float_max_value.x);
            accum_result += input_data;
        }
        accum_result.x = accum_result.x + accum_result.y + accum_result.z + accum_result.w;

        input_data = RI_F(input, SAMPLER, (int2)(width_idx + (shape.y - 1) * shape.w, batch_height_idx));
        input_data -= float_max_value.x;
        if (remain_channels == 0) {
            accum_result.x += EXP(input_data.w);
            accum_result.x += EXP(input_data.z);
            accum_result.x += EXP(input_data.y);
            accum_result.x += EXP(input_data.x);
        } else if (remain_channels == 1) {
            accum_result.x += EXP(input_data.z);
            accum_result.x += EXP(input_data.y);
            accum_result.x += EXP(input_data.x);
        } else if (remain_channels == 2) {
            accum_result.x += EXP(input_data.y);
            accum_result.x += EXP(input_data.x);
        } else if (remain_channels == 3) {
            accum_result.x += EXP(input_data.x);
        }

        for(int i = 0; i < shape.y; ++i){
            int cur_out_width_pos = mad24(i, shape.w, width_idx);
            input_data = RI_F(input, SAMPLER, (int2)(cur_out_width_pos, batch_height_idx)) - float_max_value.x;
            input_data = EXP(input_data) / accum_result.x;
            WI_F(output, (int2)(cur_out_width_pos, batch_height_idx), input_data);
        }
    }
}

    input_data = RI_F(input, SAMPLER, (int2)(width_idx + (global_size_dim0 - 1) * global_size_dim1 , batch_height_idx));
    if (remain_channels == 0) {
        float_max_value = max(float_max_value, input_data.w);
        float_max_value = max(float_max_value, input_data.z);
        float_max_value = max(float_max_value, input_data.y);
        float_max_value = max(float_max_value, input_data.x);
    } else if (remain_channels == 1) {
        float_max_value = max(float_max_value, input_data.z);
        float_max_value = max(float_max_value, input_data.y);
        float_max_value = max(float_max_value, input_data.x);
    } else if (remain_channels == 2) {
        float_max_value = max(float_max_value, input_data.y);
        float_max_value = max(float_max_value, input_data.x);
    } else if (remain_channels == 3) {
        float_max_value = max(float_max_value, input_data.x);
    }

    FLOAT accum_result = 0;
    for (short i = 0; i < global_size_dim0 - 1; ++i) {
        input_data = RI_F(input, SAMPLER, (int2)(width_idx + i * global_size_dim1, batch_height_idx));
        input_data = EXP(input_data - float_max_value);
        accum_result += input_data.x;
        accum_result += input_data.y;
        accum_result += input_data.z;
        accum_result += input_data.w;
    }

    input_data = RI_F(input, SAMPLER, (int2)(width_idx + (global_size_dim0 - 1) * global_size_dim1, batch_height_idx));
    input_data -= float_max_value;
    if (remain_channels == 0) {
        accum_result += EXP(input_data.w);
        accum_result += EXP(input_data.z);
        accum_result += EXP(input_data.y);
        accum_result += EXP(input_data.x);
    } else if (remain_channels == 1) {
        accum_result += EXP(input_data.z);
        accum_result += EXP(input_data.y);
        accum_result += EXP(input_data.x);
    } else if (remain_channels == 2) {
        accum_result += EXP(input_data.y);
        accum_result += EXP(input_data.x);
    } else if (remain_channels == 3) {
        accum_result += EXP(input_data.x);
    }

    int cur_out_width_pos = mad24(channel_block_idx, global_size_dim1, width_idx);
    input_data = RI_F(input, SAMPLER, (int2)(cur_out_width_pos, batch_height_idx)) - float_max_value;
    const int output_remain = output_channels - mul24(channel_block_idx, 4);

    if (output_remain == 1) {
        input_data.x = EXP(input_data.x) / accum_result;
    } else if (output_remain == 2) {
        input_data.y = EXP(input_data.y) / accum_result;
        input_data.x = EXP(input_data.x) / accum_result;
    } else if (output_remain == 3) {
        input_data.z = EXP(input_data.z) / accum_result;
        input_data.y = EXP(input_data.y) / accum_result;
        input_data.x = EXP(input_data.x) / accum_result;
    } else{
        input_data = EXP(input_data) / accum_result;
    }

    WI_F(output, (int2)(cur_out_width_pos, batch_height_idx), input_data);

}
}

__kernel void softmax_height(__read_only image2d_t input, __write_only image2d_t output,

@ -19,87 +19,74 @@ __kernel void softmax_channel(GLOBAL_SIZE_3_DIMS
                              __private const int remain_channels,
                              __private const int4 shape) {//NCHW

    const int channel_block_idx = get_global_id(0);
    const int width_idx = get_global_id(0);
    const int width_idx = get_global_id(1);
    const int batch_height_idx = get_global_id(1);
    const int batch_height_idx = get_global_id(2);

    DEAL_NON_UNIFORM_DIM3(channel_block_idx, width_idx, batch_height_idx);
    if (width_idx < shape.w && batch_height_idx < shape.x*shape.z) {
        const int batch_idx = batch_height_idx / shape.z;
        const int height_idx = batch_height_idx % shape.z;
        const int offset = (((batch_idx*shape.y+0)*shape.z+height_idx)*shape.w+width_idx)*4;

    FLOAT float_max_value = -FLT_MAX;
        FLOAT4 float_max_value = (FLOAT4)-FLT_MAX;
        FLOAT4 input_data;
    for (short i = 0; i < global_size_dim0 - 1; ++i) {
        for (short i = 0; i < shape.y - 1; ++i) {
            input_data = vload4(i*shape.z*shape.w, input+offset);
        float_max_value = max(float_max_value, input_data.x);
            float_max_value = max(float_max_value, input_data);
        float_max_value = max(float_max_value, input_data.y);
        }
        float_max_value = max(float_max_value, input_data.z);
        float_max_value = max(float_max_value, input_data.w);
        float_max_value.x = max(float_max_value.x, float_max_value.y);
        float_max_value.x = max(float_max_value.x, float_max_value.z);
        float_max_value.x = max(float_max_value.x, float_max_value.w);

        input_data = vload4((shape.y - 1)*shape.z*shape.w, input+offset);
        if (remain_channels == 0) {
            float_max_value.x = max(float_max_value.x, input_data.x);
            float_max_value.x = max(float_max_value.x, input_data.y);
            float_max_value.x = max(float_max_value.x, input_data.z);
            float_max_value.x = max(float_max_value.x, input_data.w);
        } else if (remain_channels == 1) {
            float_max_value.x = max(float_max_value.x, input_data.z);
            float_max_value.x = max(float_max_value.x, input_data.y);
            float_max_value.x = max(float_max_value.x, input_data.x);
        } else if (remain_channels == 2) {
            float_max_value.x = max(float_max_value.x, input_data.y);
            float_max_value.x = max(float_max_value.x, input_data.x);
        } else if (remain_channels == 3) {
            float_max_value.x = max(float_max_value.x, input_data.x);
        }

        FLOAT4 accum_result = 0;
        for (short i = 0; i < shape.y - 1; ++i) {
            input_data = vload4(i*shape.z*shape.w, input+offset);
            input_data = EXP(input_data - float_max_value.x);
            accum_result += input_data;
        }
        accum_result.x = accum_result.x + accum_result.y + accum_result.z + accum_result.w;

        input_data = vload4((shape.y - 1)*shape.z*shape.w, input+offset);
        input_data -= float_max_value.x;
        if (remain_channels == 0) {
            accum_result.x += EXP(input_data.w);
            accum_result.x += EXP(input_data.z);
            accum_result.x += EXP(input_data.y);
            accum_result.x += EXP(input_data.x);
        } else if (remain_channels == 1) {
            accum_result.x += EXP(input_data.z);
            accum_result.x += EXP(input_data.y);
            accum_result.x += EXP(input_data.x);
        } else if (remain_channels == 2) {
            accum_result.x += EXP(input_data.y);
            accum_result.x += EXP(input_data.x);
        } else if (remain_channels == 3) {
            accum_result.x += EXP(input_data.x);
        }

        for(int i = 0; i < shape.y; ++i){
            input_data = vload4(i*shape.z*shape.w, input+offset) - float_max_value.x;
            input_data = EXP(input_data) / accum_result.x;
            vstore4(input_data, i*shape.z*shape.w, output+offset);
        }
    }
}

    input_data = vload4((global_size_dim0 - 1)*shape.z*shape.w, input+offset);
    if (remain_channels == 0) {
        float_max_value = max(float_max_value, input_data.w);
        float_max_value = max(float_max_value, input_data.z);
        float_max_value = max(float_max_value, input_data.y);
        float_max_value = max(float_max_value, input_data.x);
    } else if (remain_channels == 1) {
        float_max_value = max(float_max_value, input_data.z);
        float_max_value = max(float_max_value, input_data.y);
        float_max_value = max(float_max_value, input_data.x);
    } else if (remain_channels == 2) {
        float_max_value = max(float_max_value, input_data.y);
        float_max_value = max(float_max_value, input_data.x);
    } else if (remain_channels == 3) {
        float_max_value = max(float_max_value, input_data.x);
    }

    FLOAT accum_result = 0;
    for (short i = 0; i < global_size_dim0 - 1; ++i) {
        input_data = vload4(i*shape.z*shape.w, input+offset);
        input_data = EXP(input_data - float_max_value);
        accum_result += input_data.x;
        accum_result += input_data.y;
        accum_result += input_data.z;
        accum_result += input_data.w;
    }

    input_data = vload4((global_size_dim0 - 1)*shape.z*shape.w, input+offset);
    input_data -= float_max_value;
    if (remain_channels == 0) {
        accum_result += EXP(input_data.w);
        accum_result += EXP(input_data.z);
        accum_result += EXP(input_data.y);
        accum_result += EXP(input_data.x);
    } else if (remain_channels == 1) {
        accum_result += EXP(input_data.z);
        accum_result += EXP(input_data.y);
        accum_result += EXP(input_data.x);
    } else if (remain_channels == 2) {
        accum_result += EXP(input_data.y);
        accum_result += EXP(input_data.x);
    } else if (remain_channels == 3) {
        accum_result += EXP(input_data.x);
    }

    input_data = vload4(channel_block_idx*shape.z*shape.w, input+offset) - float_max_value;
    const int output_remain = output_channels - mul24(channel_block_idx, 4);

    if (output_remain == 1) {
        input_data.x = EXP(input_data.x) / accum_result;
    } else if (output_remain == 2) {
        input_data.y = EXP(input_data.y) / accum_result;
        input_data.x = EXP(input_data.x) / accum_result;
    } else if (output_remain == 3) {
        input_data.z = EXP(input_data.z) / accum_result;
        input_data.y = EXP(input_data.y) / accum_result;
        input_data.x = EXP(input_data.x) / accum_result;
    } else{
        input_data = EXP(input_data) / accum_result;
    }

    vstore4(input_data, channel_block_idx*shape.z*shape.w, output+offset);
}
}

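Both softmax_channel variants in the two hunks above follow the standard numerically stable softmax: take the per-channel maximum, accumulate exp(x - max), then normalize. A plain C++ sketch of those three passes, leaving out the NC4HW4 packing and the remain_channels tail handling the kernels deal with:

#include <algorithm>
#include <cmath>
#include <vector>

// Three-pass softmax over one vector: max, sum of shifted exponentials, normalize.
// Subtracting the max keeps exp() in range without changing the mathematical result.
static std::vector<float> softmaxRef(const std::vector<float>& x) {
    float maxV = x[0];
    for (float v : x) maxV = std::max(maxV, v);

    float sum = 0.f;
    for (float v : x) sum += std::exp(v - maxV);

    std::vector<float> y(x.size());
    for (size_t i = 0; i < x.size(); ++i) y[i] = std::exp(x[i] - maxV) / sum;
    return y;
}
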
@ -18,12 +18,7 @@ ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : Co
    MNN_PRINT("start ReductionExecution init !\n");
#endif
    mOpenCLBackend = static_cast<OpenCLBackend *>(backend);
    auto reduct = op->main_as_ReductionParam();
    mAxis = op->main_as_ReductionParam()->dim()->data()[0];
    if (nullptr != reduct->dim()) {
        for (int i = 0; i < reduct->dim()->size(); ++i) {
            mAxis.push_back(reduct->dim()->data()[i]);
        }
    }
    switch (op->main_as_ReductionParam()->operation()) {
        case ReductionType_MEAN:
            mReductType = 0;

@ -49,110 +44,150 @@ ReductionExecution::ReductionExecution(const MNN::Op* op, Backend* backend) : Co
#endif
}

int ReductionExecution::getLocalSize(int size, int maxGroupSize){
    int local_size = 1;
    while(local_size * 2 <= maxGroupSize && local_size * 2 <= size){
        local_size *= 2;
    }
    return local_size;
}

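getLocalSize simply returns the largest power of two that does not exceed either the reduced length or the device work-group limit. A few illustrative values (the numbers below are examples, not taken from a particular device):

#include <cassert>

// Stand-alone copy of the helper, kept here only to show its behaviour.
static int localSizeOf(int size, int maxGroupSize) {
    int localSize = 1;
    while (localSize * 2 <= maxGroupSize && localSize * 2 <= size) {
        localSize *= 2;
    }
    return localSize;
}

int main() {
    assert(localSizeOf(1000, 256) == 256); // capped by the work-group limit
    assert(localSizeOf(20, 256) == 16);    // largest power of two <= 20
    assert(localSizeOf(1, 256) == 1);      // never returns zero
    return 0;
}
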
ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    MNN_ASSERT(mAxis.size() == 1);
    MNN_ASSERT(mAxis[0] == 1);

    auto runtime = mOpenCLBackend->getOpenCLRuntime();
    startRecord(runtime, mRecording);
    auto input = inputs[0];
    auto output = outputs[0];
    std::vector<int> inputShape = tensorShapeFormat(input);
    if(mAxis < 0){
    //N=outside H=axis W=inside C=1
        mAxis = input->dimensions() + mAxis;
    MNN_ASSERT(inputShape[3] == 1);
    }
    if(inputShape[1] >= 256) {
    int inside = 1;
    int outside = 1;
    for(int i = 0; i < mAxis; ++i){
        outside *= input->length(i);
    }
    for(int i = mAxis + 1; i < input->dimensions(); ++i){
        inside *= input->length(i);
    }
    int dim = input->length(mAxis);
    int local_size = 0;
    auto MaxWorkItems = runtime->getMaxWorkItemSizes();

    if(dim >= 16){
        mUseLocal = true;
    }
    if(!mUseLocal) {
        mGlobalWorkSize = {static_cast<uint32_t>(inputShape[0]), static_cast<uint32_t>(inputShape[2])};
    std::vector<int> inputShape = tensorShapeFormat(input);
        mLocalWorkSize = {1, 1, 1};
    std::vector<int> outputShape = tensorShapeFormat(output);
        switch (mReductType) {
    int batch = inputShape.at(0);
        case 0:
    int inputHeight = inputShape.at(1);
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_mean", {});
    int inputWidth = inputShape.at(2);
            break;
    int inputChannels = inputShape.at(3);
        case 1:
    int inputChannelBlocks = (inputChannels + 3) / 4;
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_max", {});
    int outputBatch = outputShape.at(0);
            break;
    int outputHeight = outputShape.at(1);
        case 2:
    int outputWidth = outputShape.at(2);
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_min", {});
    int outputChannels = outputShape.at(3);
            break;
    int outputChannelBlocks = (outputChannels + 3) / 4;
        case 3:
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_mul", {});
    std::set<std::string> buildOption;
            break;
    switch (mReductType) {
        case 4:
        case 0:
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_sum", {});
            buildOption.emplace("-DOPERATE(a,b)=(a+b)");
            break;
            buildOption.emplace("-DGET_AVG");
        default:
            buildOption.emplace("-DVALUE=0");
            MNN_ASSERT(false);
            break;
            break;
        case 1:
            buildOption.emplace("-DOPERATE(a,b)=max(a,b)");
            buildOption.emplace("-DVALUE=-FLT_MAX");
            break;
        case 2:
            buildOption.emplace("-DOPERATE(a,b)=min(a,b)");
            buildOption.emplace("-DVALUE=FLT_MAX");
            break;
        case 3:
            buildOption.emplace("-DOPERATE(a,b)=(a*b)");
            buildOption.emplace("-DVALUE=1");
            break;
        case 4:
            buildOption.emplace("-DOPERATE(a,b)=(a+b)");
            buildOption.emplace("-DVALUE=0");
            break;
        default:
            MNN_ASSERT(false);
            break;
    }

    mGlobalWorkSize = {
        static_cast<uint32_t>(outputWidth),
        static_cast<uint32_t>(outputHeight),
        static_cast<uint32_t>(outputBatch * outputChannelBlocks)
    };

    if(mUseLocal){
        if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
            local_size = getLocalSize(inputWidth, MaxWorkItems[0]);
            buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_width", buildOption);
        }else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
            local_size = getLocalSize(inputHeight, MaxWorkItems[0]);
            buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_height", buildOption);
        }else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
            local_size = getLocalSize(inputChannelBlocks - 1, MaxWorkItems[0]);
            buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_channel", buildOption);
            mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
        }else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
            local_size = getLocalSize(batch, MaxWorkItems[0]);
            buildOption.emplace("-DLOCAL_SIZE=" + std::to_string(local_size));
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_batch", buildOption);
        }
    } else { //useLocal
        mGlobalWorkSize[0] *= local_size;
        uint32_t global_x = 8;
    }else{
        int size = inputShape[1];
        buildOption.emplace("-DLOCAL_SIZE=0");
        if (size >= 1024) {
        if(batch * inputHeight * inputChannels == outside && 1 == inside && dim == inputWidth){
            global_x = 256;
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_width", buildOption);
        } else if(size >= 512) {
        }else if(batch * inputChannels == outside && inputWidth == inside && dim == inputHeight){
            global_x = 128;
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_height", buildOption);
        } else if (size >= 256) {
        }else if(batch == outside && inputWidth * inputHeight == inside && dim == inputChannels){
            global_x = 64;
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_channel", buildOption);
        } else if (size >= 128) {
            mGlobalWorkSize[2] = static_cast<uint32_t>(outputBatch * outputChannels);
            global_x = 32;
        }else if(1 == outside && inputWidth * inputHeight * inputChannels == inside && dim == batch){
        } else if (size >= 64) {
            mReduct1DKernel = runtime->buildKernel("reduction", "reduct_batch", buildOption);
            global_x = 16;
        } else if (size >= 32) {
            global_x = 8;
        }
        mGlobalWorkSize = {global_x, static_cast<uint32_t>(inputShape[0]), static_cast<uint32_t>(inputShape[2])};
        mLocalWorkSize = {global_x, 1, 1 };

        switch (mReductType) {
            case 0:
                mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_mean_local", {});
                break;
            case 1:
                mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_max_local", {});
                break;
            case 2:
                mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_min_local", {});
                break;
            case 3:
                mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_mul_local", {});
                break;
            case 4:
                mReduct1DKernel = runtime->buildKernel("reduction", "reduct_general_sum_local", {});
                break;
            default:
                MNN_ASSERT(false);
                break;
        }
    }
    //printf("reduce axis:%d , %d %d %d %d, useLocal:%d\n", mAxis[0], inputShape[0], inputShape[1], inputShape[2], inputShape[3], mUseLocal);

    mUnits.resize(1);
    uint32_t idx = 0;
    cl_int ret = CL_SUCCESS;
    if(mUseLocal) {
    ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[0]);
    ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[1]);
    ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[2]);
    } else {
        ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[0]);
        ret |= mReduct1DKernel.setArg(idx++, mGlobalWorkSize[1]);
    }
    ret |= mReduct1DKernel.setArg(idx++, openCLImage(input));
    ret |= mReduct1DKernel.setArg(idx++, openCLImage(output));
    ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[0]));
    ret |= mReduct1DKernel.setArg(idx++, inputWidth);
    ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[1]));
    ret |= mReduct1DKernel.setArg(idx++, inputHeight);
    ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[2]));
    ret |= mReduct1DKernel.setArg(idx++, inputChannels);
    ret |= mReduct1DKernel.setArg(idx++, static_cast<int32_t>(inputShape[3]));
    ret |= mReduct1DKernel.setArg(idx++, batch);
    ret |= mReduct1DKernel.setArg(idx++, inputChannelBlocks);
    ret |= mReduct1DKernel.setArg(idx++, outputWidth);
    ret |= mReduct1DKernel.setArg(idx++, outputHeight);
    ret |= mReduct1DKernel.setArg(idx++, outputChannels);
    ret |= mReduct1DKernel.setArg(idx++, outputChannelBlocks);
    MNN_CHECK_CL_SUCCESS(ret, "setArg ReductionExecution");

    if(mUseLocal){
        recordKernel3d(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
        mLocalWorkSize = {static_cast<uint32_t>(local_size), 1, 1};
    }else{
        recordKernel2d(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
        auto MaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(mReduct1DKernel));
        std::string kernelName = "reduct";
        mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, MaxWorkGroupSize, runtime, kernelName, mReduct1DKernel).first;
    }

    recordKernel3d(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
    endRecord(runtime, mRecording);
    return NO_ERROR;
}

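The rewritten onResize above picks a kernel by collapsing the shape around the reduced axis into an (outside, dim, inside) triple and matching it against the width/height/channel/batch layouts. A hedged sketch of that bookkeeping for a plain shape vector (the shape and axis below are made-up example values, not taken from the diff):

#include <cstdio>
#include <vector>

int main() {
    std::vector<int> shape = {1, 64, 32, 32}; // example NCHW shape
    int axis = 1;                             // reduce over channels, for illustration

    // Dims before the axis multiply into `outside`, dims after it into `inside`.
    int outside = 1, inside = 1;
    for (int i = 0; i < axis; ++i) outside *= shape[i];
    for (int i = axis + 1; i < (int)shape.size(); ++i) inside *= shape[i];
    int dim = shape[axis];

    // The backend compares (outside, dim, inside) with the tensor's W/H/C/N extents to pick
    // reduct_width / reduct_height / reduct_channel / reduct_batch.
    std::printf("outside=%d dim=%d inside=%d\n", outside, dim, inside); // outside=1 dim=64 inside=1024
    return 0;
}
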
@ -164,13 +199,7 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
#ifdef ENABLE_OPENCL_TIME_PROFILER
    cl::Event event;
    if(mUseLocal) {
    run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime(), &event);
        run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
                           mOpenCLBackend->getOpenCLRuntime(), &event);
    } else {
        runKernel2D(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
                    mOpenCLBackend->getOpenCLRuntime(), &event);
    }
    int costTime = (int)mOpenCLBackend->getOpenCLRuntime()->getCostTime(&event);
    MNN_PRINT("kernel cost:%d us Reduct1D\n",costTime);
#else

@ -182,13 +211,7 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
#endif
    return NO_ERROR;
}
    if(mUseLocal) {
    run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize, mOpenCLBackend->getOpenCLRuntime());
        run3DKernelDefault(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
                           mOpenCLBackend->getOpenCLRuntime());
    } else {
        runKernel2D(mReduct1DKernel, mGlobalWorkSize, mLocalWorkSize,
                    mOpenCLBackend->getOpenCLRuntime());
    }
#endif

#ifdef LOG_VERBOSE

@ -202,32 +225,36 @@ public:
    virtual ~ReductionCreator() = default;
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
                                const MNN::Op *op, Backend *backend) const override {
        if (inputs[0]->getDimensionType() == Tensor::TENSORFLOW) {
        auto openCLBackend = static_cast<OpenCLBackend *>(backend);
        auto reduct = op->main_as_ReductionParam();
        if (nullptr == reduct->dim()) {
            return NULL;
        }
        if(reduct->dim()->size() != 1) {
            return NULL;
        }
        switch (op->main_as_ReductionParam()->operation()) {
            case ReductionType_MEAN:
                break;
            case ReductionType_MAXIMUM:
                break;
            case ReductionType_MINIMUM:
                break;
            case ReductionType_PROD:
                break;
            case ReductionType_SUM:
                break;
            default:
                return NULL;
                break;
        }
        return new ReductionExecution(op, backend);
        }
        if(reduct->dim()->size() != 1) {
            return NULL;
        }
        auto axis = reduct->dim()->data()[0];
        int dim = inputs[0]->length(axis);
        std::vector<int> inputShape = tensorShapeFormat(inputs[0]);
        if(dim == inputShape.at(3) && outputs[0]->buffer().dimensions == 1){
            return NULL;
        }
        switch (op->main_as_ReductionParam()->operation()) {
            case ReductionType_MEAN:
                break;
            case ReductionType_MAXIMUM:
                break;
            case ReductionType_MINIMUM:
                break;
            case ReductionType_PROD:
                break;
            case ReductionType_SUM:
                break;
            default:
                return NULL;
                break;
        }
        return new ReductionExecution(op, backend);
        return NULL;
    }
};

@ -28,11 +28,12 @@ public:
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
    int getLocalSize(int size, int maxGroupSize);
    cl::Kernel mReduct1DKernel;
    OpenCLBackend *mOpenCLBackend;
    MNN::DataType mdataType;
    int mReductType;
    std::vector<int> mAxis;
    int mAxis;
    std::vector<uint32_t> mGlobalWorkSize = {1, 1, 1};
    std::vector<uint32_t> mLocalWorkSize{1, 1, 1};
    bool mUseLocal = false;