[Sync] Sync Internal 2.1.2

This commit is contained in:
xiaying 2022-09-30 10:02:52 +08:00
parent ce9c57e6b5
commit db53f951e6
140 changed files with 12507 additions and 3196 deletions

View File

@ -186,6 +186,13 @@ option(MNN_AVX512 "Enable AVX512" OFF)
option(MNN_CUDA "Enable CUDA" OFF)
option(MNN_TENSORRT "Enable TensorRT" OFF)
option(MNN_COREML "Enable CoreML" OFF)
option(MNN_NNAPI "Enable NNAPI" OFF)
option(MNN_CUDA_PROFILE "Enable CUDA profile" OFF)
if (NOT MNN_CUDA OR NOT CMAKE_SYSTEM_NAME MATCHES "^Linux")
set(MNN_CUDA_PROFILE OFF)
endif()
if (MNN_USE_THREAD_POOL)
message(STATUS "Use Threadpool, forbid openmp")
@ -223,12 +230,14 @@ message(STATUS "\tARM82: ${MNN_ARM82}")
message(STATUS "\toneDNN: ${MNN_ONEDNN}")
message(STATUS "\tTensorRT: ${MNN_TENSORRT}")
message(STATUS "\tCoreML: ${MNN_COREML}")
message(STATUS "\tNNAPI: ${MNN_NNAPI}")
message(STATUS "\tCUDA: ${MNN_CUDA}")
message(STATUS "\tOpenMP: ${MNN_OPENMP}")
message(STATUS "\tBF16: ${MNN_SUPPORT_BF16}")
message(STATUS "\tThreadPool: ${MNN_USE_THREAD_POOL}")
message(STATUS "\tHidden: ${MNN_HIDDEN}")
message(STATUS "\tBuild Path: ${CMAKE_CURRENT_BINARY_DIR}")
message(STATUS "\tCUDA PROFILE: ${MNN_CUDA_PROFILE}")
if(CMAKE_SYSTEM_NAME MATCHES "^Android" OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
add_definitions(-fPIC)
@ -480,6 +489,13 @@ IF(MNN_COREML)
list(APPEND MNN_EXTRA_DEPENDS ${VIDEO})
ENDIF()
# NNAPI
IF(MNN_NNAPI)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/nnapi/)
list(APPEND MNN_DEPS MNN_NNAPI)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_NNAPI>)
ENDIF()
# Vulkan
IF(MNN_VULKAN)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/vulkan/)

View File

@ -1,12 +1,31 @@
set (EXTRA_INCLUDE "")
set (EXTRA_LIBS "")
set (EXTRA_OPTIONS "")
if(MNN_CUDA_PROFILE)
set(CUDA_MIN_VERSION "9.0")
find_package(CUDA ${CUDA_MIN_VERSION})
MESSAGE("benchmark message:CUDA_INCLUDE_DIRS:${CUDA_INCLUDE_DIRS}")
link_directories(${CUDA_INCLUDE_DIRS}/../lib/
${CUDA_INCLUDE_DIRS}/../lib64/)
set(EXTRA_INCLUDE ${CUDA_INCLUDE_DIRS})
set(EXTRA_LIBS -lnvToolsExt)
set(EXTRA_OPTIONS -DMNN_CUDA_PROFILE)
endif()
add_executable(benchmark.out ${CMAKE_CURRENT_LIST_DIR}/benchmark.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tools/cpp/revertMNNModel.cpp)
target_include_directories(benchmark.out PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tools/cpp/ ${CMAKE_CURRENT_SOURCE_DIR}/tools/)
target_link_libraries(benchmark.out ${MNN_DEPS})
target_include_directories(benchmark.out PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tools/cpp/ ${CMAKE_CURRENT_SOURCE_DIR}/tools/ ${EXTRA_INCLUDE})
target_compile_options(benchmark.out PRIVATE ${EXTRA_OPTIONS})
target_link_libraries(benchmark.out ${MNN_DEPS} ${EXTRA_LIBS})
file(GLOB_RECURSE SRC_FILES ${CMAKE_CURRENT_LIST_DIR}/exprModels/*.cpp)
add_executable(benchmarkExprModels.out ${CMAKE_CURRENT_LIST_DIR}/benchmarkExprModels.cpp ${SRC_FILES})
target_include_directories(benchmarkExprModels.out PRIVATE "${CMAKE_CURRENT_LIST_DIR}/exprModels" ${CMAKE_CURRENT_SOURCE_DIR}/)
target_link_libraries(benchmarkExprModels.out ${MNN_DEPS})
if (MSVC AND NOT MNN_BUILD_SHARED_LIBS)
foreach (DEPEND ${MNN_DEPS})
target_link_options(benchmark.out PRIVATE /WHOLEARCHIVE:$<TARGET_FILE:${DEPEND}>)

View File

@ -141,6 +141,7 @@ std::vector<float> doBench(Model& model, int loop, int warmup = 10, int forward
// std::vector<int> dims{1, 3, 224, 224};
// net->resizeTensor(input, dims);
// net->resizeSession(session);
net->releaseModel();
const MNN::Backend* inBackend = net->getBackend(session, input);
@ -162,12 +163,9 @@ std::vector<float> doBench(Model& model, int loop, int warmup = 10, int forward
for (int round = 0; round < loop; round++) {
auto timeBegin = getTimeInUs();
void* host = input->map(MNN::Tensor::MAP_TENSOR_WRITE, input->getDimensionType());
input->unmap(MNN::Tensor::MAP_TENSOR_WRITE, input->getDimensionType(), host);
net->runSession(session);
host = outputTensor->map(MNN::Tensor::MAP_TENSOR_READ, outputTensor->getDimensionType());
outputTensor->unmap(MNN::Tensor::MAP_TENSOR_READ, outputTensor->getDimensionType(), host);
auto timeEnd = getTimeInUs();

docs/_static/style.css vendored Normal file
View File

@ -0,0 +1,3 @@
.wy-nav-content {
max-width: 75% !important;
}

View File

@ -42,8 +42,10 @@ MNN uses CMake to build the project; the CMake macro definitions are listed below
| MNN_ONEDNN | Whether to use `oneDNN`, `OFF` by default |
| MNN_AVX512 | Whether to build the `avx512` backend, `OFF` by default |
| MNN_CUDA | Whether to build the `CUDA` backend, `OFF` by default |
| MNN_CUDA_PROFILE | Whether to enable the CUDA profiling tools, `OFF` by default |
| MNN_TENSORRT | Whether to build the `TensorRT` backend, `OFF` by default |
| MNN_COREML | Whether to build the `CoreML` backend, `OFF` by default |
| MNN_NNAPI | Whether to build the `NNAPI` backend, `OFF` by default |
| MNN_BUILD_BENCHMARK | Whether to build MNN's benchmark tools, `OFF` by default |
| MNN_BUILD_TEST | Whether to build MNN's unit tests, `OFF` by default |
| MNN_BUILD_FOR_ANDROID_COMMAND | Whether to build for `Android` via the command line, `OFF` by default |

View File

@ -67,5 +67,8 @@ html_theme = 'sphinx_rtd_theme'
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_css_files = [
'style.css',
]
needs_sphinx = '4.2.0'

View File

@ -187,7 +187,7 @@ ErrorCode convert(const uint8_t* source, int iw, int ih, int stride, void* dest,
- `oh` output height
- `outputBpp` if 0, it is derived from `config.destFormat`; defaults to 0
- `outputStride` if 0, it is set to `ow * outputBpp`; defaults to 0
- `type` supports halide_type_of and halide_type_of; defaults to halide_type_of<float>
- `type` supports `halide_type_of<uint8_t>` and `halide_type_of<float>`; defaults to `halide_type_of<float>`
Returns the result code
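A hedged usage sketch of this overload; the image sizes, dummy pixel buffer, and `ImageProcess` configuration below are illustrative assumptions, not part of this change:
```cpp
#include <cstdint>
#include <memory>
#include <vector>
#include <MNN/ImageProcess.hpp>

int main() {
    // Sketch: convert an RGBA byte image into a float RGBA buffer.
    MNN::CV::ImageProcess::Config config;
    config.sourceFormat = MNN::CV::RGBA;
    config.destFormat   = MNN::CV::RGBA;
    std::shared_ptr<MNN::CV::ImageProcess> process(MNN::CV::ImageProcess::create(config));

    const int iw = 64, ih = 48, ow = 64, oh = 48;
    std::vector<uint8_t> src(iw * ih * 4, 255);   // dummy input pixels (assumption)
    std::vector<float> dst(ow * oh * 4);

    // stride = 0 -> iw * bpp; outputBpp = 0 -> follow config.destFormat;
    // outputStride = 0 -> ow * outputBpp; type defaults to halide_type_of<float>().
    auto code = process->convert(src.data(), iw, ih, 0, dst.data(),
                                 ow, oh, 0, 0, halide_type_of<float>());
    return code == MNN::NO_ERROR ? 0 : 1;
}
```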

View File

@ -2,7 +2,7 @@
## Concepts
The `Module` API can be used for both model training and model inference.
- For model training, users can subclass `Module` and add their own implementation for training;
- For model inference, the difference from `Session` is that no explicit resize is required and control flow is supported, so models that contain `if` or `while` must be run with `Module`.
### Related data structures
- `Module` the core class of the Module API, an abstract class that represents a model; loading a model actually creates one of its subclasses
- `Executor` holds several `RuntimeManager`s and provides memory-management interfaces; each `Executor` must run in a single-threaded environment. A global `Executor` is provided by default; create your own when concurrent execution is needed.
@ -10,7 +10,7 @@
- `VARP` the input/output type of `Module`, and also the basic data structure of the [Expr API](expr.md)
## Workflow
![workflow](../_static/images/inference/Matrix.png)
Create Executor (optional) -> create Module -> create input VARPs -> run inference with Module::onForward -> consume output VARPs -> destroy Module -> destroy Executor (optional)
### Creating an Executor
`Executor` provides interfaces to configure the inference backend, thread count, and other properties, as well as performance statistics, callbacks around operator execution, and memory reclamation. A global `Executor` object is provided, so users can use it directly without creating or holding one.
```cpp
@ -58,7 +58,10 @@ struct Info {
const Info* getInfo() const;
```
### Running inference
Call `onForward` to run inference.
**Note: after the `Module` is destroyed, the `VARP`s returned by `onForward` are no longer usable**
```cpp
std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs);
```
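A minimal sketch of this flow, assuming a model file `model.mnn` with an input named `input` and an output named `output`; the names, shape, and dummy data are illustrative assumptions:
```cpp
#include <memory>
#include <vector>
#include <MNN/expr/Module.hpp>
#include <MNN/expr/ExprCreator.hpp>

using namespace MNN::Express;

int main() {
    // Load the module (model path and tensor names are assumptions).
    std::shared_ptr<Module> net(Module::load({"input"}, {"output"}, "model.mnn"));
    if (nullptr == net) {
        return 1;
    }
    // Build a dummy input VARP; a real application fills in actual data.
    auto input = _Const(1.0f, {1, 3, 224, 224}, NCHW);
    auto outputs = net->onForward({input});
    // Read the result while the Module is still alive (see the note above).
    auto ptr = outputs[0]->readMap<float>();
    return ptr == nullptr ? 1 : 0;
}
```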

View File

@ -1,5 +1,14 @@
# Using the Python API
## Installation
The MNN Python API can be [built from source](../compile/pymnn.md), or installed directly with `pip` as a prebuilt wheel; `pip` usage is as follows:
```bash
# install the public release
pip install MNN==$version
# install the internal (in-house) release
pip install -i https://artifacts.antgroup-inc.cn/simple/ MNN-Internal==$version
```
## Overview
MNN adds Python extensions on top of C++. The extension consists of two parts:
- MNN: responsible for inference, training, image processing, and numerical computing

View File

@ -545,6 +545,8 @@ const std::map<std::string, Tensor*>& getSessionOutputAll(const Session* session
When there is only one output tensor, you can pass NULL to `getSessionOutput` to obtain it.
**Note: after the `Session` is destroyed, the `Tensor` obtained via `getSessionOutput` is no longer usable**
### Copying data
**Users who are not familiar with the MNN source code must use this approach to read outputs**
NCHW (for models converted from Caffe / TorchScript / ONNX) example:
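A hedged sketch of the usual copy pattern; the `net` and `session` variables are assumed to come from the createSession flow described earlier:
```cpp
// Copy the backend output into a host tensor laid out as NCHW (CAFFE dimension type).
auto outputTensor = net->getSessionOutput(session, NULL);
auto nchwTensor = new MNN::Tensor(outputTensor, MNN::Tensor::CAFFE);
outputTensor->copyToHostTensor(nchwTensor);
// Read the data through nchwTensor->host<float>() here ...
delete nchwTensor;
```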

View File

@ -62,7 +62,7 @@ Tensor is the basic data structure of the MNN V2 API and the most fundamental data wrapper
### `Tensor(shape, dtype, value_list, dimension)`
Creates a Tensor with the given shape, data type, data, and data layout; the data is copied from `value_list`.
Data in `list`, `tuple`, `bytes`, `ndarray`, `PyCapsule`, and similar formats can be converted into a `Tensor`
Data in `list`, `tuple`, `bytes`, `ndarray`, `PyCapsule`, `int` pointer (address), and similar formats can be converted into a `Tensor`
*Note: `value_list` supports `ndarray` only when PYMNN_NUMPY_USABLE is enabled; it is disabled by default on mobile*
@ -71,7 +71,7 @@ Tensor is the basic data structure of the MNN V2 API and the most fundamental data wrapper
Parameters:
- `shape:tuple` the Tensor shape
- `dtype:MNN.Halide_Type_*` the Tensor data type
- `value_list:ndarray/tuple/list/bytes/PyCapsule` the data
- `value_list:ndarray/tuple/list/bytes/PyCapsule/int_addr` the data
- `dimension:MNN.Tensor_DimensionType_*` the data layout format
---
@ -176,13 +176,22 @@ Tensor is the basic data structure of the MNN V2 API and the most fundamental data wrapper
### `Example`
```python
import numpy as _np
import MNN
import MNN.numpy as np
data = _np.array([1., 2., 3.], dtype=_np.float32)
# create Tensors
# create a Tensor from a tuple; the arguments are: shape, data type, data, data layout
t1 = MNN.Tensor((1, 3), MNN.Halide_Type_Float, (1., 2., 3.), MNN.Tensor_DimensionType_Caffe)
# create a Tensor from a Var
t2 = MNN.Tensor(np.array([1., 2., 3.])) # equivalent to t1
# create a Tensor from an ndarray
t3 = MNN.Tensor([1, 3], MNN.Halide_Type_Float, data, MNN.Tensor_DimensionType_Caffe)
# create a Tensor from bytes
t4 = MNN.Tensor([1, 3], MNN.Halide_Type_Float, data.tobytes(), MNN.Tensor_DimensionType_Caffe)
# create a Tensor from an int memory address; faster than passing the ndarray directly, but the ndarray memory must be contiguous
t5 = MNN.Tensor([1, 3], MNN.Halide_Type_Float, data.__array_interface__['data'][0], MNN.Tensor_DimensionType_Caffe)
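# hedged check: the address-based tensor holds the same values as the ndarray-based one
# (getNumpyData() is available when the wheel is built with PYMNN_NUMPY_USABLE)
print(t3.getNumpyData())  # [[1. 2. 3.]]
print(t5.getNumpyData())  # [[1. 2. 3.]]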
print(t1.getShape()) # (1, 3)
print(t1.getDataType()) # <capsule object NULL at 0x7fe01e74ff30>
print(t1.getDimensionType()) # 1

View File

@ -18,12 +18,12 @@ expr is MNN's expression module; it contains a series of expression functions that can construct M
---
### `const(value_list, shape, data_format, dtype)`
Creates a `Const`-type `Var` from the input data; this is the most basic function for creating a `Var`,
able to convert data in `list`, `tuple`, `bytes`, `ndarray`, `PyCapsule`, and similar formats into a `Var`
able to convert data in `list`, `tuple`, `bytes`, `ndarray`, `PyCapsule`, `int` pointer (address), and similar formats into a `Var`
*Note: `value_list` supports `ndarray` only when PYMNN_NUMPY_USABLE is enabled; it is disabled by default on mobile*
Parameters:
- `value_list:ndarray/list/tuple/bytes/PyCapsule` the input data
- `value_list:ndarray/list/tuple/bytes/PyCapsule/int_addr` the input data
- `shape:[int]` the shape of the `Var` to construct
- `data_format:data_format` the data layout; see [data_format](Var.html#data-format)
- `dtype:dtype` the data type; see [dtype](Var.html#dtype)
@ -44,6 +44,8 @@ array([2, 3, 4], dtype=int32)
array([97, 98, 99], dtype=uint8)
>>> expr.const(MNN.Tensor([2, 3]).getData(), [2], expr.NCHW, expr.int) # PyCapsule
array([2, 3], dtype=int32)
>>> expr.const(np.arange(4.0).astype(np.float32).__array_interface__['data'][0], [4], expr.NCHW, expr.float) # int_addr; this requires the ndarray memory to be contiguous
array([0., 1., 2., 3.], dtype=float32)
```
---
### `set_thread_number(numberThread)`
@ -1336,6 +1338,94 @@ array(0)
array([0, 1])
```
---
### `eltwise_prod(x, y, coeff)`
Performs element-wise multiplication of the input variables
Parameters:
- `x:Var_like` input variable
- `y:Var_like` input variable
- `coeff:[float]` coefficients; currently only `[1.,0.]` or `[]/[0.]` are supported
Returns: `x*y`; when `coeff=[1.,0.]`, returns `x`
Return type: `Var`
Example:
```python
>>> expr.eltwise_prod([1., 2., 3.], [2., 2., 2.], [])
array([2., 4., 6.], dtype=float32)
>>> expr.eltwise_prod([1., 2., 3.], [2., 2., 2.], [1., 0.])
array([1., 2., 3.], dtype=float32)
```
---
### `eltwise_sum(x, y, coeff)`
Performs element-wise addition of the input variables
Parameters:
- `x:Var_like` input variable
- `y:Var_like` input variable
- `coeff:[float]` coefficients; currently only `[1.,0.]` or `[]/[0.]` are supported
Returns: `x+y`; when `coeff=[1.,0.]`, returns `x`
Return type: `Var`
Example:
```python
>>> expr.eltwise_sum([1., 2., 3.], [2., 2., 2.], [])
array([3., 4., 5.], dtype=float32)
>>> expr.eltwise_sum([1., 2., 3.], [2., 2., 2.], [1., 0.])
array([1., 2., 3.], dtype=float32)
```
---
### `eltwise_sub(x, y, coeff)`
Performs element-wise subtraction of the input variables
Parameters:
- `x:Var_like` input variable
- `y:Var_like` input variable
- `coeff:[float]` coefficients; currently only `[1.,0.]` or `[]/[0.]` are supported
Returns: `x-y`; when `coeff=[1.,0.]`, returns `x`
Return type: `Var`
Example:
```python
>>> expr.eltwise_sub([1., 2., 3.], [2., 2., 2.], [])
array([-1., 0., 1.], dtype=float32)
>>> expr.eltwise_sub([1., 2., 3.], [2., 2., 2.], [1., 0.])
array([1., 2., 3.], dtype=float32)
```
---
### `eltwise_max(x, y, coeff)`
Performs an element-wise comparison of the input variables and returns the maximum
Parameters:
- `x:Var_like` input variable
- `y:Var_like` input variable
- `coeff:[float]` coefficients; currently only `[1.,0.]` or `[]/[0.]` are supported
Returns: `max(x,y)`; when `coeff=[1.,0.]`, returns `x`
Return type: `Var`
Example:
```python
>>> expr.eltwise_max([1., 2., 3.], [2., 2., 2.], [])
array([2., 2., 3.], dtype=float32)
>>> expr.eltwise_max([1., 2., 3.], [2., 2., 2.], [1., 0.])
array([1., 2., 3.], dtype=float32)
```
---
### `cast(x, dtype=_F.float)`
Converts the input to the given `dtype`

View File

@ -1,3 +1,17 @@
set (EXTRA_LIBS "")
if(MNN_CUDA_PROFILE)
set(CUDA_MIN_VERSION "9.0")
find_package(CUDA ${CUDA_MIN_VERSION})
MESSAGE("cuda dir is:${CUDA_INCLUDE_DIRS}")
include_directories(
${CUDA_INCLUDE_DIRS}
)
set(EXTRA_LIBS ${CUDA_INCLUDE_DIRS}/../lib/libnvToolsExt.so)
endif()
file(GLOB_RECURSE MNN_EXPR_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.*")
option(MNN_EXPR_ENABLE_PROFILER "Support profile Expr's op cost" OFF)
option(MNN_EXPR_SHAPE_EAGER "Force compute Expr's shape directly cost" OFF)
@ -17,7 +31,7 @@ IF(MNN_SEP_BUILD)
else()
add_library(MNN_Express SHARED ${MNN_EXPR_SRCS})
endif()
target_link_libraries(MNN_Express MNN)
target_link_libraries(MNN_Express MNN ${EXTRA_LIBS})
install(TARGETS MNN_Express
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib
@ -25,3 +39,5 @@ IF(MNN_SEP_BUILD)
ELSE()
add_library(MNN_Express OBJECT ${MNN_EXPR_SRCS})
ENDIF()

View File

@ -201,8 +201,9 @@ std::shared_ptr<Executor> Executor::newExecutor(MNNForwardType type,
info.type = type;
info.numThread = numberThread;
info.user = const_cast<BackendConfig*>(&config);
std::shared_ptr<Runtime> bn(creator->onCreate(info));
return std::shared_ptr<Executor>(new Executor(bn, type, numberThread));
std::shared_ptr<Runtime> runtime(creator->onCreate(info));
auto executor = new Executor(runtime, type, numberThread);
return std::shared_ptr<Executor>(executor);
}
RuntimeInfo Executor::getRuntime() {

View File

@ -638,6 +638,7 @@ void* Variable::readInternal(bool forShape) {
return Executor::mapOutput(cache.get(), mFrom->mInside->mCacheOffset + mFromIndex, mFrom->mInside->mOutputTensors[mFromIndex]);
}
void Variable::informDirty() {
std::vector<Expr*> visited;
mFrom->visitOutputs([&visited](EXPRP expr, int index) {

View File

@ -8,6 +8,7 @@
#include "IfModule.hpp"
#include "MNN_generated.h"
namespace MNN {
namespace Express {
static int _findPos(const std::vector<std::string>& names, const std::string& key) {
@ -21,6 +22,8 @@ static int _findPos(const std::vector<std::string>& names, const std::string& ke
std::vector<Express::VARP> IfModule::onForward(const std::vector<Express::VARP>& inputs) {
std::vector<Express::VARP> outputs(mOutputFromElse.size());
MNN_ASSERT(mOutputFromThen.size() == mOutputFromElse.size());
if (inputs[0]->readMap<int>()[0] > 0) {
std::vector<Express::VARP> subInputs(mInputForThen.size());
for (auto& p : mInputForThen) {
@ -54,12 +57,14 @@ IfModule* IfModule::create(const Op* op, const std::map<std::string, SubGraph>&
if (nullptr != op->name()) {
module->setName(op->name()->str());
}
/** Compute map index
std::vector<std::pair<int, int>> mInputForThen;
// First: mElse's index, Second: inputs' index
std::vector<std::pair<int, int>> mInputForElse;
std::vector<int> mOutputFromThen;
std::vector<int> mOutputFromElse;
*/
@ -98,7 +103,7 @@ IfModule* IfModule::create(const Op* op, const std::map<std::string, SubGraph>&
for (int i=0; i<output->size(); ++i) {
auto data = output->GetAs<StringVec>(i);
MNN_ASSERT(data->data()->size() == 2);
auto thenPos = _findPos(thenG.outputs, data->data()->GetAsString(0)->str());
MNN_ASSERT(thenPos >= 0);
auto elsePos = _findPos(elseG.outputs, data->data()->GetAsString(1)->str());

View File

@ -9,9 +9,9 @@
#include <MNN/expr/Module.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include <MNN/expr/ExecutorScope.hpp>
#include "PipelineModule.hpp"
#include "core/FileLoader.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "MNN_generated.h"
#include "Utils.hpp"
#include "RuntimeAttr.hpp"
@ -202,15 +202,24 @@ public:
}
}
#endif // MNN_INTERNAL_ENABLED
if (nullptr == mInfo->runTimeManager.get()) {
mRuntime = Executor::getRuntime().second;
} else {
mRuntime = mInfo->runTimeManager->getInside()->mRuntime.first.begin()->second;
}
}
virtual ~ NetModule(){}
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override {
#ifdef MNN_INTERNAL_ENABLED
Timer _time;
auto glo = ExecutorScope::Current();
glo->getDebugTools()->flops = 0.0f;
#endif
CPURuntime* runtime = static_cast<CPURuntime*>(mRuntime.get());
runtime->clearReuseCopyTensorMap();
auto outputs = mModule->onForward(inputs);
#ifdef MNN_INTERNAL_ENABLED
do {
@ -237,10 +246,18 @@ public:
} while(false);
#endif
return outputs;
}
void setRuntime(std::shared_ptr<Runtime> runtime) {
mRuntime = runtime;
}
virtual Module* clone(CloneContext* ctx) const override {
std::shared_ptr<Module> submodule(mModule->clone(ctx));
NetModule* module(new NetModule(submodule, mInfo, nullptr, 0, 0.0f));
module->setRuntime(Executor::getRuntime().second);
#ifdef MNN_INTERNAL_ENABLED
module->mLogInfo = mLogInfo;
#endif
@ -249,9 +266,11 @@ public:
const Module::Info* info() const {
return mInfo.get();
}
private:
std::shared_ptr<Module> mModule;
std::shared_ptr<Module::Info> mInfo;
std::shared_ptr<Runtime> mRuntime = nullptr;
#ifdef MNN_INTERNAL_ENABLED
std::map<std::string, std::string> mLogInfo;
#endif

View File

@ -250,6 +250,7 @@ StaticModule::StaticModule(const void* buffer, size_t length, const std::vector<
if (!res) {
return;
}
mResource->mUseContentInputs = scheduleInfo.needInputContentForShape;
if (mResource->mUseContentInputs) {
mResource->mModes.inputMode = Interpreter::Session_Input_User;
@ -262,6 +263,9 @@ StaticModule::StaticModule(const void* buffer, size_t length, const std::vector<
mInputTensors.resize(inputs.size());
for (int i = 0; i < inputs.size(); ++i) {
mInputTensors[i] = mSession->getInput(inputs[i].c_str());
#ifdef LOG_VERBOSE
MNN_PRINT("init Staticmodule %d th input ptr:%p, hostPtr:%p, name:%s\n", i, mInputTensors[i], mInputTensors[i]->host<void>(), inputs[i].c_str());
#endif
}
mOutputTensors.resize(mResource->mOutputFromTensor.size());
for (int i = 0; i < mResource->mOutputFromTensor.size(); ++i) {
@ -274,6 +278,7 @@ StaticModule::~StaticModule() {
mBackupResourceBackend = nullptr;
}
std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VARP>& inputs) {
AUTOTIME;
std::vector<Express::VARP> outputs(mResource->mOutputNumbers);
for (auto& iter : mResource->mOutputFromInput) {
@ -315,6 +320,18 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
}
mInputTensors[i]->buffer().host = inputTensor->buffer().host;
mInputTensors[i]->buffer().device = inputTensor->buffer().device;
if (mResource->mUseContentInputs) {
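// Hedged note: when shape inference needs the input content but the tensor only has device
// memory, readMap() below syncs the data and the expression's host mirror supplies the host pointer.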
if (nullptr == mInputTensors[i]->buffer().host && 0 != mInputTensors[i]->buffer().device ) {
auto exprInfo = inputs[i]->expr();
auto inside = exprInfo.first->inside();
auto srcPtr = inputs[i]->readMap<void>();
mInputTensors[i]->buffer().host = inside->mHostTensor->buffer().host;
}
}
}
if (mResource->mUseContentInputs) {
mSession->setNeedResize();
@ -345,7 +362,7 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
continue;
}
auto exprInfo = inputs[i]->expr();
auto inside = exprInfo.first->inside();
auto inside = exprInfo.first->inside();
auto inputTensor = inside->mOutputTensors[exprInfo.second];
if (nullptr != inside->mCache) {
inputTensor = Executor::getOutput(inside->mCache.get(), inside->mCacheOffset);
@ -353,6 +370,19 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
mInputTensors[i]->copyFromHostTensor(inputTensor);
}
}
#ifdef LOG_VERBOSE
for (auto& inputTensor : mInputTensors) {
MNN_PRINT("static module, before run, input ptr:%p, hostPtr:%p, shape:", inputTensor, inputTensor->host<void>());
inputTensor->printShape();
MNN_PRINT("\n");
auto shape = inputTensor->shape();
}
MNN_PRINT("staticmodule before run\n");
#endif
ErrorCode code;
if (mResource->mModes.callBackMode == Interpreter::Session_Debug) {
auto globalExecutor = ExecutorScope::Current();
@ -372,6 +402,8 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VA
auto tensor = Tensor::clone(mOutputTensors[i]);
outputs[mResource->mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(tensor, true));
}
#ifdef MNN_INTERNAL_ENABLED
auto glo = ExecutorScope::Current();
float flops = 0.0f;

View File

@ -94,7 +94,7 @@ protected:
typedef std::function<bool(const std::vector<Tensor*>&, const std::string& /*opName*/)> TensorCallBack;
typedef std::function<bool(const std::vector<Tensor*>&, const OperatorInfo*)> TensorCallBackWithInfo;
typedef std::pair<std::map<MNNForwardType, std::shared_ptr<Runtime>>, std::shared_ptr<Runtime>> RuntimeInfo;
typedef std::pair< std::map<MNNForwardType, std::shared_ptr<Runtime>>, std::shared_ptr<Runtime>> RuntimeInfo;
/**
* @brief get mnn version info.
@ -328,7 +328,7 @@ public:
/** Backends in session in M, int*, length >= 1 + number of configs when create session */
BACKENDS = 2,
/** Resize Info, int*, 0: ready to execute, 1: need malloc, 2: need resize */
RESIZE_STATUS = 3,

View File

@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 1
#define MNN_VERSION_PATCH 1
#define MNN_VERSION_PATCH 2
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */

View File

@ -43,7 +43,7 @@ public:
/** string handle type */
HANDLE_STRING = 1
};
/** Tensor map type : Read or Write*/
enum MapType {
/** map Tensor for writing data*/
@ -131,7 +131,7 @@ public:
* @param deepCopy whether create new content and copy, currently only support deepCopy = false
*/
static Tensor* clone(const Tensor* src, bool deepCopy = false);
/**
* @brief delete tensor.
* @param src tensor
@ -279,7 +279,7 @@ public:
* @brief print tensor data. for DEBUG use only.
*/
void print() const;
/**
*@brief print tensor shape
*/

View File

@ -147,7 +147,7 @@ public:
static void save(const std::vector<VARP>& vars, const char* fileName);
static std::vector<int8_t> save(const std::vector<VARP>& vars);
static void save(const std::vector<VARP>& vars, NetT* dest);
// Pack a few Variable to compute in one pipeline
static void prepareCompute(const std::vector<VARP>& vars, bool forceCPU = false);
static void compute(const std::vector<VARP>& vars, bool forceCPU = false);
@ -158,6 +158,7 @@ public:
mFrom = expr;
mFromIndex = index;
}
private:
Variable(EXPRP expr, int index) {
mFrom = expr;

View File

@ -736,6 +736,9 @@
C4F906B327688C3A0026B847 /* NMSModule.hpp in Headers */ = {isa = PBXBuildFile; fileRef = C4F906B127688C3A0026B847 /* NMSModule.hpp */; };
C4F906B427688C3A0026B847 /* NMSModule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C4F906B227688C3A0026B847 /* NMSModule.cpp */; };
C4FB6CB22769DF0800963B07 /* GeometryCumSum.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C4FB6CB12769DF0800963B07 /* GeometryCumSum.cpp */; };
CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; };
CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */; };
CE9AFED728E54E3300566949 /* CPUInterp3D.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */; };
CEDB20EB2846D07100AE9DC4 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = CEDB20EA2846D07100AE9DC4 /* AppDelegate.m */; };
CEDB20F42846D07100AE9DC4 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = CEDB20F22846D07100AE9DC4 /* Main.storyboard */; };
CEDB20F62846D07200AE9DC4 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = CEDB20F52846D07200AE9DC4 /* Assets.xcassets */; };
@ -1520,6 +1523,9 @@
C4F906B127688C3A0026B847 /* NMSModule.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = NMSModule.hpp; sourceTree = "<group>"; };
C4F906B227688C3A0026B847 /* NMSModule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = NMSModule.cpp; sourceTree = "<group>"; };
C4FB6CB12769DF0800963B07 /* GeometryCumSum.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryCumSum.cpp; sourceTree = "<group>"; };
CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = "<group>"; };
CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUInterp3D.cpp; sourceTree = "<group>"; };
CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUInterp3D.hpp; sourceTree = "<group>"; };
CEDB20E72846D07100AE9DC4 /* demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = demo.app; sourceTree = BUILT_PRODUCTS_DIR; };
CEDB20E92846D07100AE9DC4 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
CEDB20EA2846D07100AE9DC4 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
@ -1818,6 +1824,8 @@
48887410215B639D0079B12E /* cpu */ = {
isa = PBXGroup;
children = (
CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */,
CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */,
4DCF538B2892B16300B5B393 /* CPUHistogram.cpp */,
4DCF538C2892B16400B5B393 /* CPUHistogram.hpp */,
4DF87C512887D3F20003E2D4 /* CPUSvd.hpp */,
@ -2642,6 +2650,7 @@
EBB38EC621E748B9005F76D7 /* shape */ = {
isa = PBXGroup;
children = (
CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */,
4DCF538F2892B17000B5B393 /* ShapeHistogram.cpp */,
4DF87C4D2887D39B0003E2D4 /* ShapeSvd.cpp */,
48925F362744AC2A00919B37 /* ShapeROIAlign.cpp */,
@ -2871,6 +2880,7 @@
92FF03BD23AA0B5A00AC97F6 /* Int8FunctionsOpt.h in Headers */,
92FF036623AA0B5A00AC97F6 /* CPUDetectionOutput.hpp in Headers */,
92FF04BC23AA0BFB00AC97F6 /* NonCopyable.hpp in Headers */,
CE9AFED728E54E3300566949 /* CPUInterp3D.hpp in Headers */,
48FA474B23AA127B00172C3B /* Utils.hpp in Headers */,
4A224A1427D0C56E000A9260 /* ConvolutionWinogradBridge.hpp in Headers */,
4D9A935926255BDA00F9B43C /* DataStructures.pb-c.h in Headers */,
@ -3422,6 +3432,7 @@
4D0C80E52862FC4700C7CAD6 /* CoreMLRaster.metal in Sources */,
92FF044123AA0B7100AC97F6 /* ShapeMoments.cpp in Sources */,
4D9A936026255BDA00F9B43C /* Model.pb-c.c in Sources */,
CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */,
92FF03AB23AA0B5A00AC97F6 /* ConvolutionInt8Executor.cpp in Sources */,
C4F906B427688C3A0026B847 /* NMSModule.cpp in Sources */,
48FA474523AA127B00172C3B /* Executor.cpp in Sources */,
@ -3483,6 +3494,7 @@
4896D37925FE2A6B00717702 /* MNNPackedMatMulFP16.S in Sources */,
92FF02B623AA0B5A00AC97F6 /* CPUUnary.cpp in Sources */,
92FF032723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */,
CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */,
92FF02CA23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */,
48925F372744AC2A00919B37 /* ShapeROIAlign.cpp in Sources */,
92FF02E723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */,

View File

@ -48,7 +48,7 @@ def build_deps():
extra_opts += ' -DMNN_INTERNAL=ON ' if IS_INTERNAL_BUILD else ' '
extra_opts += ' -DMNN_BUILD_TORCH=ON ' if IS_BUILD_TORCH else ' '
os.system('cmake ' + extra_opts +
'-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Debug \
'-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release \
-DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \
-DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF .. && make MNN MNNTrain MNNConvert -j4')
else:

View File

@ -1508,7 +1508,7 @@ static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) {
break;
case 4:
parse_res = PyArg_ParseTuple(args, "OOOl", &shape, &dataType, &data, &dimensionType)
&& isInts(shape) && isVals(data);
&& isInts(shape) && (isVals(data) || isInt(data));
break;
default:
parse_res = false;
@ -1518,7 +1518,7 @@ static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) {
"\t0. (Var)\n"
"\t1. (Tensor/Var, DimensionType)\n"
"\t2. ([int], DataType, DimensionType)\n"
"\t3. ([int], DataType, tuple/ndarray, DimensionType)\n");
"\t3. ([int], DataType, ndarray/list/tuple/bytes/PyCapsule/int_addr, DimensionType)\n");
return -1;
}
#ifdef PYMNN_EXPR_API
@ -1589,7 +1589,7 @@ static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) {
dataSize *= i;
}
void *pData = NULL;
if (data && !PyCapsule_CheckExact(data)) {
if (data && !PyCapsule_CheckExact(data) && !isInt(data)) {
if (PyBytes_Check(data)) {
int64_t total_len = PyBytes_Size(data);
if (dataSize * itemsize != total_len) {
@ -1626,15 +1626,20 @@ static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) {
}
} else {
// no data input, set all zeros
// pycapsule input, copy data
// pycapsule/int_addr input, copy data
pData = malloc(dataSize * itemsize);
if (data && PyCapsule_CheckExact(data)) {
auto src = PyCapsule_GetPointer(data, NULL);
if (src == nullptr) {
PyMNN_ERROR_LOG("PyMNNTensor_init: PyCapsule pointer is null.");
if (data) {
void* srcPtr = nullptr;
if (PyCapsule_CheckExact(data)) {
srcPtr = PyCapsule_GetPointer(data, NULL);
} else {
srcPtr = PyLong_AsVoidPtr(data);
}
if (srcPtr == nullptr) {
PyMNN_ERROR_LOG("PyMNNTensor_init: PyCapsule/int_addr pointer is null.");
return -1;
}
memcpy(pData, src, dataSize * itemsize);
memcpy(pData, srcPtr, dataSize * itemsize);
} else {
memset(pData, 0, dataSize * itemsize);
}

View File

@ -1243,10 +1243,10 @@ static PyObject* PyMNNExpr_const(PyObject *self, PyObject *args, PyObject *kwarg
PyObject *value, *shapes, *format = nullptr /* NCHW */, *type = nullptr /* DType_FLOAT */;
static char *kwlist[] = { "value_list", "shape", "data_format", "dtype", NULL };
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|OO", kwlist, &value, &shapes, &format, &type)) {
PyMNN_ERROR("const require args: (ndarray/list/tuple/bytes/PyCapsule, [ints], |data_format, dtype)");
PyMNN_ERROR("const require args: (ndarray/list/tuple/bytes/PyCapsule/int_addr, [ints], |data_format, dtype)");
}
if (!isVals(value) || !isInts(shapes) || (format != nullptr && !isdata_format(format)) || (type != nullptr && !isdtype(type))) {
PyMNN_ERROR("const require args: (ndarray/list/tuple/bytes/PyCapsule, [ints], |data_format, dtype)");
if ((!isVals(value) && !isInt(value)) || !isInts(shapes) || (format != nullptr && !isdata_format(format)) || (type != nullptr && !isdtype(type))) {
PyMNN_ERROR("const require args: (ndarray/list/tuple/bytes/PyCapsule/int_addr, [ints], |data_format, dtype)");
}
auto data_format = (format == nullptr ? NCHW : toEnum<Dimensionformat>(format));
auto dtype = (type == nullptr ? DType_FLOAT : toEnum<DType>(type));
@ -1268,6 +1268,8 @@ static PyObject* PyMNNExpr_const(PyObject *self, PyObject *args, PyObject *kwarg
bool need_free = false;
if (PyCapsule_CheckExact(value)) {
data = PyCapsule_GetPointer(value, NULL);
} else if (isInt(value)) {
data = PyLong_AsVoidPtr(value);
} else if (PyBytes_Check(value)) {
int64_t bytesize = PyBytes_Size(value);
data = toPtr(value, DType_UINT8, bytesize);

View File

@ -77,6 +77,8 @@ class UnitTest(unittest.TestCase):
self.assertEqualArray(x.getNumpyData(), data)
x = MNN.Tensor([2, 2], MNN.Halide_Type_Float, data.tobytes(), MNN.Tensor_DimensionType_Tensorflow)
self.assertEqualArray(x.getNumpyData(), data)
x = MNN.Tensor([2, 2], MNN.Halide_Type_Float, data.__array_interface__['data'][0], MNN.Tensor_DimensionType_Tensorflow)
self.assertEqualArray(x.getNumpyData(), data)
x = MNN.Tensor([2, 2], MNN.Halide_Type_Float, mp.array([[1., 2.], [3., 4.]]).ptr, MNN.Tensor_DimensionType_Tensorflow)
self.assertEqualArray(x.getNumpyData(), data)
def test_image_process(self):
@ -274,6 +276,8 @@ class UnitTest(unittest.TestCase):
self.assertEqualVar(expr.const(list_data, [2, 2]), data)
self.assertEqualVar(expr.const(tuple_data, [2, 2]), data)
self.assertEqualVar(expr.const(data, [2, 2]), data)
self.assertEqualVar(expr.const(data.tobytes(), [2, 2]), data)
self.assertEqualVar(expr.const(data.__array_interface__['data'][0], [2, 2]), data)
x = MNN.Tensor([2, 2], MNN.Halide_Type_Float, (1., 2., 3., 4.), MNN.Tensor_DimensionType_Tensorflow)
self.assertEqualVar(expr.const(x.getHost(), [2, 2]), data)
def test_conv2d(self):

View File

@ -586,9 +586,11 @@ struct BinaryOpT : public flatbuffers::NativeTable {
typedef BinaryOp TableType;
int32_t opType;
DataType T;
int32_t activationType;
BinaryOpT()
: opType(0),
T(DataType_DT_FLOAT) {
T(DataType_DT_FLOAT),
activationType(0) {
}
};
@ -603,10 +605,14 @@ struct BinaryOp FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
DataType T() const {
return static_cast<DataType>(GetField<int32_t>(6, 1));
}
int32_t activationType() const {
return GetField<int32_t>(8, 0);
}
bool Verify(flatbuffers::Verifier &verifier) const {
return VerifyTableStart(verifier) &&
VerifyField<int32_t>(verifier, 4) &&
VerifyField<int32_t>(verifier, 6) &&
VerifyField<int32_t>(verifier, 8) &&
verifier.EndTable();
}
BinaryOpT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@ -623,6 +629,9 @@ struct BinaryOpBuilder {
void add_T(DataType T) {
fbb_.AddElement<int32_t>(6, static_cast<int32_t>(T), 1);
}
void add_activationType(int32_t activationType) {
fbb_.AddElement<int32_t>(8, activationType, 0);
}
explicit BinaryOpBuilder(flatbuffers::FlatBufferBuilder &_fbb)
: fbb_(_fbb) {
start_ = fbb_.StartTable();
@ -638,8 +647,10 @@ struct BinaryOpBuilder {
inline flatbuffers::Offset<BinaryOp> CreateBinaryOp(
flatbuffers::FlatBufferBuilder &_fbb,
int32_t opType = 0,
DataType T = DataType_DT_FLOAT) {
DataType T = DataType_DT_FLOAT,
int32_t activationType = 0) {
BinaryOpBuilder builder_(_fbb);
builder_.add_activationType(activationType);
builder_.add_T(T);
builder_.add_opType(opType);
return builder_.Finish();
@ -3481,6 +3492,7 @@ inline void BinaryOp::UnPackTo(BinaryOpT *_o, const flatbuffers::resolver_functi
(void)_resolver;
{ auto _e = opType(); _o->opType = _e; };
{ auto _e = T(); _o->T = _e; };
{ auto _e = activationType(); _o->activationType = _e; };
}
inline flatbuffers::Offset<BinaryOp> BinaryOp::Pack(flatbuffers::FlatBufferBuilder &_fbb, const BinaryOpT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@ -3493,10 +3505,12 @@ inline flatbuffers::Offset<BinaryOp> CreateBinaryOp(flatbuffers::FlatBufferBuild
struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const BinaryOpT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
auto _opType = _o->opType;
auto _T = _o->T;
auto _activationType = _o->activationType;
return MNN::CreateBinaryOp(
_fbb,
_opType,
_T);
_T,
_activationType);
}
inline PackParamT *PackParam::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
@ -4932,17 +4946,19 @@ inline const flatbuffers::TypeTable *PadValueModeTypeTable() {
inline const flatbuffers::TypeTable *BinaryOpTypeTable() {
static const flatbuffers::TypeCode type_codes[] = {
{ flatbuffers::ET_INT, 0, -1 },
{ flatbuffers::ET_INT, 0, 0 }
{ flatbuffers::ET_INT, 0, 0 },
{ flatbuffers::ET_INT, 0, -1 }
};
static const flatbuffers::TypeFunction type_refs[] = {
DataTypeTypeTable
};
static const char * const names[] = {
"opType",
"T"
"T",
"activationType"
};
static const flatbuffers::TypeTable tt = {
flatbuffers::ST_TABLE, 2, type_codes, type_refs, nullptr, names
flatbuffers::ST_TABLE, 3, type_codes, type_refs, nullptr, names
};
return &tt;
}

View File

@ -35,6 +35,9 @@ enum BinaryOpOperation : byte {
table BinaryOp {
opType:int;
T:DataType=DT_FLOAT;
// 0 -> No Activation
// 1 -> Relu
activationType:int=0;
}
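A hedged C++ sketch of how the new field travels through the generated FlatBuffers API shown earlier in this diff; the surrounding builder code is an assumed calling context, not part of this change:
```cpp
#include <flatbuffers/flatbuffers.h>
#include "MNN_generated.h"  // generated CreateBinaryOp / BinaryOp shown above

int buildAndReadBinaryOp() {
    flatbuffers::FlatBufferBuilder fbb;
    // opType 0 is just a placeholder; activationType = 1 requests a fused ReLU.
    auto op = MNN::CreateBinaryOp(fbb, /*opType=*/0, MNN::DataType_DT_FLOAT, /*activationType=*/1);
    fbb.Finish(op);

    auto parsed = flatbuffers::GetRoot<MNN::BinaryOp>(fbb.GetBufferPointer());
    return parsed->activationType();  // 1 -> backends append a ReLU after the binary op
}
```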
table PackParam {

View File

@ -182,12 +182,43 @@ ErrorCode CoreMLBinary::onResize(const std::vector<Tensor *> &inputs, const std:
MNN_ERROR("NPU Binary not support %s\n", MNN::EnumNameBinaryOpOperation(binaryType));
break;
}
if (oneInput) {
setLayerInputsAndOutputs(mLayer_, {mCoreMLBackend->getTensorName(input)}, {mCoreMLBackend->getTensorName(outputs[0])});
std::string binaryInputName;
if(oneInput) {
binaryInputName = mCoreMLBackend->getTensorName(input);
} else {
setLayerInputsAndOutputs(mLayer_, {mCoreMLBackend->getTensorName(inputs[0]), mCoreMLBackend->getTensorName(inputs[1])}, {mCoreMLBackend->getTensorName(outputs[0])});
binaryInputName = mCoreMLBackend->getTensorName(inputs[0]);
}
std::string binaryOutputName = mCoreMLBackend->getTensorName(outputs[0]);
int activationType = 0;
if(mOp->type() == OpType_BinaryOp) {
activationType = mOp->main_as_BinaryOp()->activationType();
}
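// When ReLU is fused (activationType == 1), write the binary result to an intermediate
// tensor name so the CoreML activation layer added below can consume it and emit the real output.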
if (activationType == 1) {
binaryOutputName = binaryInputName + "-" + binaryOutputName + "-Relu";
}
if (oneInput) {
setLayerInputsAndOutputs(mLayer_, {mCoreMLBackend->getTensorName(input)}, {binaryOutputName});
} else {
setLayerInputsAndOutputs(mLayer_, {mCoreMLBackend->getTensorName(inputs[0]), mCoreMLBackend->getTensorName(inputs[1])}, {binaryOutputName});
}
mCoreMLBackend->addLayer(mLayer_);
if (activationType == 1) {
auto reluLayer = mCoreMLBackend->create<CoreML__Specification__NeuralNetworkLayer>();
core_ml__specification__neural_network_layer__init(reluLayer);
mCoreMLBackend->setLayerName(reluLayer, "BinaryRelu");
reluLayer->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_ACTIVATION;
reluLayer->activation = mCoreMLBackend->create<CoreML__Specification__ActivationParams>();
core_ml__specification__activation_params__init(reluLayer->activation);
reluLayer->activation->nonlinearity_type_case = CORE_ML__SPECIFICATION__ACTIVATION_PARAMS__NONLINEARITY_TYPE_RE_LU;
reluLayer->activation->relu = mCoreMLBackend->create<CoreML__Specification__ActivationReLU>();
core_ml__specification__activation_re_lu__init(reluLayer->activation->relu);
setLayerInputsAndOutputs(reluLayer, {binaryOutputName}, {mCoreMLBackend->getTensorName(outputs[0])});
mCoreMLBackend->addLayer(reluLayer);
}
return NO_ERROR;
}

View File

@ -82,6 +82,9 @@ CPURuntime::CPURuntime(const Backend::Info& info) {
ThreadPool::active();
}
#endif
#ifdef LOG_VERBOSE
MNN_PRINT("create CPURuntime:%p\n", this);
#endif
}
CPURuntime:: ~ CPURuntime() {
#ifdef MNN_USE_THREAD_POOL
@ -96,6 +99,9 @@ float CPURuntime::onGetMemoryInMB() {
return staticMemoryInMB;
}
Backend* CPURuntime::onCreate(const BackendConfig* config) const {
auto precision = mPrecision;
size_t flags = mFlags;
@ -103,6 +109,10 @@ Backend* CPURuntime::onCreate(const BackendConfig* config) const {
precision = config->precision;
flags = config->flags;
}
#ifdef LOG_VERBOSE
MNN_PRINT("cpu backend was created by runtime:%p\n", this);
#endif
#ifdef MNN_USE_ARMV82
auto core = MNNGetCoreFunctions();
if (core->supportFp16arith && precision == BackendConfig::Precision_Low) {
@ -122,6 +132,7 @@ Backend* CPURuntime::onCreate(const BackendConfig* config) const {
return new AVX2Backend(this, flags);
}
#endif
return new CPUBackend(this, precision, MNN_FORWARD_CPU, flags);
}
@ -148,6 +159,21 @@ void CPURuntime::onGabageCollect(int level) {
mStaticAllocator->release(false);
}
ReuseCopyTensorMap& CPURuntime::getReuseCopyTensorMap() {
return mReuseCopyTensorMap;
}
void CPURuntime::clearReuseCopyTensorMap() {
for (auto& iter : mReuseCopyTensorMap) {
Tensor* tensor = std::get<2>(iter.second);
if (TensorUtils::getDescribe(tensor)->useCount > 0) {
TensorUtils::getDescribe(tensor)->useCount--;
}
}
mReuseCopyTensorMap.clear();
}
void CPURuntime::onConcurrencyBegin() const {
#ifdef MNN_USE_THREAD_POOL
if (mThreadNumber > 1 && mPower != BackendConfig::Power_High) {
@ -189,7 +215,10 @@ bool CPUBackend::addCreator(OpType t, Creator* c) {
}
CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, MNNForwardType type, size_t flags) : Backend(type) {
mRuntime = runtime;
#ifdef LOG_VERBOSE
MNN_PRINT("cpu backend create\n");
#endif
mRuntime = const_cast<CPURuntime*>(runtime);
std::shared_ptr<BufferAllocator::Allocator> defaultAlloc(BufferAllocator::Allocator::createRecurse(runtime->mStaticAllocator.get()));
mDynamicAllocator.reset(new BufferAllocator(defaultAlloc));
mStaticAllocator = runtime->mStaticAllocator;
@ -485,6 +514,9 @@ Execution* CPUBackend::onCreate(const std::vector<Tensor*>& inputs, const std::v
}
return exe;
}
const Runtime* CPUBackend::getRuntime() {
return mRuntime;
}
bool CPUBackend::onClearBuffer() {
mCache->reset();
@ -540,6 +572,16 @@ void CPUBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
wrapTensor->setType(dstType);
}
wrapTensor->buffer().host = (uint8_t*)MNNMemoryAllocAlign(getTensorSize(wrapTensor.get()) * wrapTensor->getType().bytes(), MNN_MEMORY_ALIGN_DEFAULT);
#ifdef LOG_VERBOSE
MNN_PRINT("CPU backend copy tensor ptr:%p -> ptr:%p hostPtr:%p -> %p, format %d -> %d, dims: [",
srcTensor, dstTensor, srcTensor->host<void>(), dstTensor->host<void>(), TensorUtils::getDescribe(srcTensor)->dimensionFormat, TensorUtils::getDescribe(dstTensor)->dimensionFormat);
for (int i=0; i<srcTensor->dimensions(); ++i) {
MNN_PRINT("%d ", srcTensor->length(i));
}
MNN_PRINT("]\n");
#endif
TensorUtils::getDescribe(wrapTensor.get())->memoryType = Tensor::InsideDescribe::MEMORY_HOST;
auto code = CPUCastCreator::cast(srcTensor, wrapTensor.get(), this, convertType);
if (NO_ERROR != code) {
@ -556,6 +598,15 @@ void CPUBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
}
}
ReuseCopyTensorMap& CPUBackend::getReuseCopyTensorMap() {
return mRuntime->getReuseCopyTensorMap();
}
void CPUBackend::clearReuseCopyTensorMap() {
mRuntime->clearReuseCopyTensorMap();
}
class CPURuntimeCreator : public RuntimeCreator {
public:
virtual Runtime* onCreate(const Backend::Info& info) const override {

View File

@ -29,6 +29,9 @@ public:
virtual CompilerType onGetCompilerType() const override {
return Compiler_Loop;
}
MNN_PUBLIC ReuseCopyTensorMap& getReuseCopyTensorMap();
MNN_PUBLIC void clearReuseCopyTensorMap();
void onConcurrencyBegin() const;
void onConcurrencyEnd() const;
@ -39,6 +42,7 @@ private:
BackendConfig::MemoryMode mMemory;
BackendConfig::PowerMode mPower;
BackendConfig::PrecisionMode mPrecision;
ReuseCopyTensorMap mReuseCopyTensorMap;
// Backend features
// CPU features
@ -71,7 +75,6 @@ public:
const CoreFunctions* functions() const {
return mCoreFunctions;
}
// Return element size for Tensor, considering pack
int getTensorSize(const Tensor* tensor, bool multiBytes = false) const;
const CoreInt8Functions* int8Functions() const {
@ -106,12 +109,22 @@ public:
CPUResizeCache* getCache() const {
return mCache;
}
virtual const Runtime* getRuntime() override;
ReuseCopyTensorMap& getReuseCopyTensorMap();
void clearReuseCopyTensorMap();
#ifdef MNN_USE_THREAD_POOL
inline int taskIndex() const {return mRuntime->mTaskIndex;}
#endif
static void initCreatorMap();
static int getBytes(const Backend* backend, const Tensor* output);
static DataType getDataType(const Tensor* tensor);
protected:
MemObj* allocBuffer(int size, Tensor* dest, StorageType storageType);
const CoreFunctions* mCoreFunctions;
@ -119,7 +132,7 @@ protected:
private:
std::shared_ptr<BufferAllocator> mStaticAllocator;
std::shared_ptr<BufferAllocator> mDynamicAllocator;
const CPURuntime* mRuntime;
CPURuntime* mRuntime;
BackendConfig::PrecisionMode mPrecisionMode;
static std::map<OpType, CPUBackend::Creator*>* gCreator;
std::map<const Tensor*, const Tensor*> mCachedCastTensor;

View File

@ -33,6 +33,11 @@ ErrorCode CPUBinary::onResize(const std::vector<Tensor*>& inputs, const std::vec
mTotalSize = input0DataCount;
}
MNN_ASSERT(mTotalSize == outputs[0]->elementSize());
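// Fused ReLU (activationType == 1): float outputs are handled by a dedicated CPURelu
// execution prepared here; integer outputs are clamped inline in onExecute below.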
if(mActivationType == 1 && outputs[0]->getType().code == halide_type_float) {
mActivationExe.reset(new CPURelu(backend(), 0.0));
mActivationExe->onResize(outputs, outputs);
}
return NO_ERROR;
}
@ -59,7 +64,9 @@ ErrorCode CPUBinary::onExecute(const std::vector<Tensor*>& inputs, const std::ve
auto schedule = ((CPUBackend*)backend())->multiThreadDivide(mTotalSize);
auto input0Ptr = input->host<uint8_t>();
auto input1Ptr = input1->host<uint8_t>();
auto outputPtr = output->host<uint8_t>();
auto outputPtr = outputs[0]->host<uint8_t>();
int inpBytes = input->getType().bytes();
int outBytes = output->getType().bytes();
if (halide_type_float == input->getType().code) {
@ -85,9 +92,20 @@ ErrorCode CPUBinary::onExecute(const std::vector<Tensor*>& inputs, const std::ve
}
auto out = outputPtr + start * outBytes;
mProc(out, inp0, inp1, realSize, mNeedBroadcastIndex);
if(mActivationType == 1 && output->getType().code == halide_type_int) {
for(int i=0; i<realSize; i++) {
auto val = ((int32_t *)out)[i];
auto res = val > 0 ? val : 0;
((int32_t *)out)[i] = res;
}
}
}
}
MNN_CONCURRENCY_END();
if(mActivationType == 1 && output->getType().code == halide_type_float) {
mActivationExe->onExecute(outputs, outputs);
}
return NO_ERROR;
}
@ -215,13 +233,13 @@ public:
if (nullptr == func) {
return nullptr;
}
return new CPUBinary(backend, func);
return new CPUBinary(backend, func, op->main_as_BinaryOp()->activationType());
} else if (dataType.code == halide_type_float) {
auto func = core->MNNSelectBinaryFunctionForFloat(type);
if (nullptr == func) {
return nullptr;
}
return new CPUBinary(backend, func);
return new CPUBinary(backend, func, op->main_as_BinaryOp()->activationType());
}
}
MNN_ERROR("CpuBinary: unsupported data type (bits: %d, code: %d)\n",

View File

@ -10,12 +10,14 @@
#define CPUBinary_hpp
#include "core/Execution.hpp"
#include "backend/cpu/CPURelu.hpp"
#include "compute/CommonOptFunction.h"
namespace MNN {
class CPUBinary : public Execution {
public:
CPUBinary(Backend *b, MNNBinaryExecute proc) : Execution(b) {
CPUBinary(Backend *b, MNNBinaryExecute proc, int activationType) : Execution(b) {
mProc = proc;
mActivationType = activationType;
}
virtual ~CPUBinary() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
@ -26,6 +28,8 @@ private:
MNNBinaryExecute mProc;
int mNeedBroadcastIndex = -1;
int mTotalSize;
int mActivationType = 0;
std::shared_ptr<Execution> mActivationExe;
};
} // namespace MNN
#endif /* CPUBinary_hpp */

View File

@ -29,24 +29,13 @@ ErrorCode CPUHistogram::histogram(Tensor* input, Tensor* output) {
auto iptr = input->host<T>() + mChannel;
auto optr = output->host<float>();
memset(optr, 0, mBinNum * sizeof(float));
auto numberThread = ((CPUBackend*)backend())->threadNumber();
int sizeDivide = mSize / numberThread;
MNN_CONCURRENCY_BEGIN(tId, numberThread) {
int number = sizeDivide;
if (tId == numberThread - 1) {
number = mSize - tId * sizeDivide;
}
auto src = iptr + tId * sizeDivide * mStride * sizeof(T);
auto dst = optr + tId * sizeDivide * mStride * sizeof(float);
for (int i = 0; i < number; i++) {
T val = src[i * mStride];
if (val >= mMin && val <= mMax) {
const int bin = (int)(val * mAlpha - mBeta);
dst[std::min(bin, mBinNum -1)]++;
}
for (int i = 0; i < mSize; i++) {
T val = iptr[i * mStride];
if (val >= mMin && val <= mMax) {
const int bin = (int)(val * mAlpha - mBeta);
optr[std::min(bin, mBinNum -1)]++;
}
}
MNN_CONCURRENCY_END();
return NO_ERROR;
}

View File

@ -54,6 +54,7 @@ extern void ___CPURasterFactory__OpType_While__();
extern void ___CPUConvolutionDepthwiseCreator__OpType_ConvolutionDepthwise__();
extern void ___CPURangeCreator__OpType_Range__();
extern void ___CPUTFQuantizedConv2DCreator__OpType_TfQuantizedConv2D__();
extern void ___CPUInterp3DCreator__OpType_Interp3D__();
extern void ___CPUQuantizedAvgPoolCreator__OpType_QuantizedAvgPool__();
extern void ___ConvolutionFactory__OpType_Convolution__();
extern void ___CPUConvInt8Creator__OpType_ConvInt8__();
@ -65,7 +66,6 @@ extern void ___CPUSetDiff1DCreator__OpType_SetDiff1D__();
extern void ___CPUEltwiseInt8Creator__OpType_EltwiseInt8__();
extern void ___CPUSvdCreator__OpType_Svd__();
extern void ___CPULayerNormCreator__OpType_LayerNorm__();
extern void ___CPUInterp3DCreator__OpType_Interp3D__();
void registerCPUOps() {
___CPUCropAndResizeCreator__OpType_CropAndResize__();
@ -122,6 +122,7 @@ ___CPURasterFactory__OpType_While__();
___CPUConvolutionDepthwiseCreator__OpType_ConvolutionDepthwise__();
___CPURangeCreator__OpType_Range__();
___CPUTFQuantizedConv2DCreator__OpType_TfQuantizedConv2D__();
___CPUInterp3DCreator__OpType_Interp3D__();
___CPUQuantizedAvgPoolCreator__OpType_QuantizedAvgPool__();
___ConvolutionFactory__OpType_Convolution__();
___CPUConvInt8Creator__OpType_ConvInt8__();
@ -133,6 +134,5 @@ ___CPUSetDiff1DCreator__OpType_SetDiff1D__();
___CPUEltwiseInt8Creator__OpType_EltwiseInt8__();
___CPUSvdCreator__OpType_Svd__();
___CPULayerNormCreator__OpType_LayerNorm__();
___CPUInterp3DCreator__OpType_Interp3D__();
}
}

View File

@ -21,9 +21,7 @@ template <typename T>
ErrorCode CPURange<T>::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
const T start = inputs[0]->host<T>()[0];
const T delta = inputs[2]->host<T>()[0];
int32_t outputSize = outputs[0]->buffer().dim[0].extent;
auto flat = outputs[0]->host<T>();
T val = start;
for (int32_t i = 0; i < outputSize; ++i) {

View File

@ -57,7 +57,7 @@ bool AVX2Functions::init(int cpuFlags) {
_AVX_ExtraInitFMA(coreFunction);
}
// For ImageProcess Functions
_SSE_ImageProcessInit(coreFunction);
_SSE_ImageProcessInit(coreFunction, cpuFlags);
#ifdef MNN_AVX512
if ((cpuFlags & libyuv::kCpuHasAVX512VNNI)
|| (cpuFlags & libyuv::kCpuHasAVX512VL)

View File

@ -69,7 +69,7 @@ void MNNFunctionInit() {
void MNNInt8FunctionInit() {
auto cpuFlags = libyuv::InitCpuFlags();
auto core = MNN::MNNGetInt8CoreFunctions();
if (cpuFlags & libyuv::kCpuHasSSSE3) {
if (cpuFlags & libyuv::kCpuHasSSE41) {
core->MNNFloat2Int8 = _SSE_MNNFloat2Int8;
core->MNNInt8ScaleToFloat = _SSE_MNNInt8ScaleToFloat;
core->Int8GemmKernel = _SSE_MNNGemmInt8AddBiasScale_16x4_Unit;

View File

@ -68,4 +68,4 @@ void _SSE_MNNReluInt8(int8_t* dst, const int8_t* src, size_t size);
void _SSE_MNNSoftmax(float* dest, const float* source, size_t size);
void _SSE_ExtraInit(void* functions);
void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size);
void _SSE_ImageProcessInit(void* functions);
void _SSE_ImageProcessInit(void* functions, int cpuFlags);

View File

@ -13,6 +13,7 @@
#include <algorithm>
#include <cmath>
// require SSE 4.1
void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step,
size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) {
const auto dst_step_tmp = dst_step / sizeof(int8_t);
@ -231,6 +232,7 @@ void _SSE_MNNReluInt8(int8_t* dst, const int8_t* src, size_t size) {
}
}
// require SSE 4.1
void _SSE_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minV, ssize_t maxV, ssize_t zeroPoint) {
__m128i zero = _mm_set1_epi32(0);
__m128 minValue = _mm_set1_ps(minV);
@ -325,6 +327,7 @@ void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale,
}
}
// require SSE 4.1
void _SSE_MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dstO, const int8_t* srcO, const int8_t* weightO, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step) {
auto dst = dstO;
auto src = (const int16_t*)srcO;

View File

@ -9,6 +9,7 @@
#include <algorithm>
#include "FunctionSummary.hpp"
#include "core/Macro.h"
#include "backend/cpu/x86_x64/cpu_id.h"
#define MNN_SSE_YUV_INIT \
countUnit -= 1;\
@ -245,6 +246,7 @@ void _SSE_MNNNV21ToBGR(const unsigned char* source, unsigned char* dest, size_t
}
}
// require SSE 4.1
void _SSE_MNNC1ToFloatC1(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count) {
int remain = 0;
int countC16 = count / 16;
@ -275,6 +277,7 @@ void _SSE_MNNC1ToFloatC1(const unsigned char* source, float* dest, const float*
}
}
// require SSE 4.1
void _SSE_MNNC3ToFloatC3(const unsigned char* source, float* dest, const float* mean, const float* normal,
size_t count) {
int remain = 0;
@ -316,6 +319,7 @@ void _SSE_MNNC3ToFloatC3(const unsigned char* source, float* dest, const float*
}
}
// require SSE 4.1
void _SSE_MNNC1ToFloatRGBA(const unsigned char* source, float* dest, const float* mean, const float* normal,
size_t count) {
::memset(dest, 0, 4 * sizeof(float) * count);
@ -382,6 +386,7 @@ void _SSE_MNNC1ToFloatRGBA(const unsigned char* source, float* dest, const float
}
}
// require SSE 4.1
void _SSE_MNNC3ToFloatRGBA(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count) {
int remain = 0;
int countC4 = count / 4;
@ -424,14 +429,16 @@ void _SSE_MNNC3ToFloatRGBA(const unsigned char* source, float* dest, const float
}
}
void _SSE_ImageProcessInit(void* functions) {
void _SSE_ImageProcessInit(void* functions, int cpuFlags) {
auto coreFunction = static_cast<MNN::CoreFunctions*>(functions);
coreFunction->MNNRGBAToBGRA = _SSE_MNNRGBAToBGRA;
coreFunction->MNNNV21ToRGBA = _SSE_MNNNV21ToRGBA;
coreFunction->MNNNV21ToRGB = _SSE_MNNNV21ToRGB;
coreFunction->MNNNV21ToBGRA = _SSE_MNNNV21ToBGRA;
coreFunction->MNNNV21ToBGR = _SSE_MNNNV21ToBGR;
coreFunction->MNNC1ToFloatC1 = _SSE_MNNC1ToFloatC1;
coreFunction->MNNC3ToFloatC3 = _SSE_MNNC3ToFloatC3;
coreFunction->MNNC3ToFloatRGBA = _SSE_MNNC3ToFloatRGBA;
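// The _SSE_MNNC*ToFloat* kernels above rely on SSE4.1 instructions, so they are
// only registered when the CPU reports SSE4.1 support.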
if (cpuFlags & libyuv::kCpuHasSSE41) {
coreFunction->MNNC1ToFloatC1 = _SSE_MNNC1ToFloatC1;
coreFunction->MNNC3ToFloatC3 = _SSE_MNNC3ToFloatC3;
coreFunction->MNNC3ToFloatRGBA = _SSE_MNNC3ToFloatRGBA;
}
}

View File

@ -27,7 +27,7 @@ void _SSE_MNNExpC8(float* dest, const float* source, const float* offset, const
auto p7 = _mm_set1_ps(parameters[7]);
auto xMax = _mm_set1_ps(87);
auto xMin = _mm_set1_ps(-87);
auto basic = _mm_set1_epi32(1 << 23);
// auto basic = _mm_set1_epi32(1 << 23);
for (int i = 0; i < count; ++i) {
auto x = _mm_mul_ps(_mm_loadu_ps(source + i * 4), A);
x = _mm_max_ps(x, xMin);
@ -36,7 +36,8 @@ void _SSE_MNNExpC8(float* dest, const float* source, const float* offset, const
auto divInt = _mm_cvtps_epi32(div);
div = _mm_cvtepi32_ps(divInt);
auto div2 = _mm_add_epi32(divInt, _mm_set1_epi32(127));
div2 = _mm_mullo_epi32(div2, basic);
// div2 = _mm_mullo_epi32(div2, basic);
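// Shifting left by 23 places the biased exponent into the float's exponent bits directly
// and, unlike _mm_mullo_epi32, requires only SSE2.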
div2 = _mm_slli_epi32(div2, 23);
auto expBasic = _mm_castsi128_ps(div2);
auto xReamin = _mm_sub_ps(x, _mm_mul_ps(div, p0));
auto t = xReamin;
@ -89,7 +90,7 @@ void _SSE_MNNSoftmax(float* dest, const float* source, size_t size) {
auto p7 = _mm_set1_ps(0.008333333333333333);
auto xMax = _mm_set1_ps(87);
auto xMin = _mm_set1_ps(-87);
auto basic = _mm_set1_epi32(1 << 23);
// auto basic = _mm_set1_epi32(1 << 23);
for (int i = 0; i < count; ++i) {
auto x = _mm_sub_ps(_mm_loadu_ps(source + i * 4), _mm_set1_ps(maxValue));
x = _mm_max_ps(x, xMin);
@ -98,7 +99,8 @@ void _SSE_MNNSoftmax(float* dest, const float* source, size_t size) {
auto divInt = _mm_cvtps_epi32(div);
div = _mm_cvtepi32_ps(divInt);
auto div2 = _mm_add_epi32(divInt, _mm_set1_epi32(127));
div2 = _mm_mullo_epi32(div2, basic);
// div2 = _mm_mullo_epi32(div2, basic);
div2 = _mm_slli_epi32(div2, 23);
auto expBasic = _mm_castsi128_ps(div2);
auto xReamin = _mm_sub_ps(x, _mm_mul_ps(div, p0));
auto t = xReamin;

View File

@ -1,8 +1,14 @@
set(CUDA_MIN_VERSION "9.0")
find_package(CUDA ${CUDA_MIN_VERSION})
set (EXTRA_LIBS "")
if(MNN_CUDA_PROFILE)
set(EXTRA_LIBS -lnvToolsExt)
endif()
if(CUDA_FOUND)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_FORCE_INLINES -Wno-deprecated-gpu-targets -w")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_FORCE_INLINES -Wno-deprecated-gpu-targets -w ${EXTRA_LIBS}")
if(CMAKE_BUILD_TYPE MATCHES Debug)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O0")
else()
@ -15,6 +21,7 @@ if(CUDA_FOUND)
include(${CMAKE_CURRENT_SOURCE_DIR}/SelectCudaComputeArch.cmake)
CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS ${CUDA_ARCHS})
IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0"))
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
ENDIF()
@ -34,13 +41,20 @@ else()
endif()
file(GLOB_RECURSE MNN_CUDA_SRC ${CMAKE_CURRENT_LIST_DIR}/core/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/*)
message(STATUS "message ${CUDA_NVCC_FLAGS} !!!!!!!!!!!")
message(STATUS "message ${CUDA_NVCC_FLAGS} !!!!!!!!!!! ${CUDA_INCLUDE_DIRS}")
if(WIN32)
cuda_add_library(MNN_CUDA STATIC Register.cpp ${MNN_CUDA_SRC})
set(MNN_CUDA_LIBS MNN_CUDA ${CUDA_LIBRARIES} PARENT_SCOPE)
else()
cuda_add_library(MNN_Cuda_Main SHARED ${MNN_CUDA_SRC})
if(MNN_CUDA_PROFILE)
target_compile_options(MNN_Cuda_Main PRIVATE -DMNN_CUDA_PROFILE)
target_link_libraries(MNN_Cuda_Main ${CUDA_INCLUDE_DIRS}/../lib/libnvToolsExt.so)
endif()
set(MNN_CUDA_LIBS MNN_Cuda_Main PARENT_SCOPE)
add_library(MNN_CUDA OBJECT Register.cpp)
endif()
@ -52,3 +66,5 @@ include_directories(
${CMAKE_CURRENT_SOURCE_DIR}/../../../3rd_party/cutlass/include
)

View File

@ -18,6 +18,8 @@
#include "execution/Transpose.cuh"
#include "execution/MNNCUDADefine.hpp"
#include "CUDATools.hpp"
// #define MNN_CUDA_COPY_DEBUG
namespace MNN {
@ -47,6 +49,9 @@ private:
CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, BackendConfig::PowerMode power) {
// TODO: Search CUDA Device info and use best one
mCUDARuntime.reset(new CUDARuntime(-1));
#ifdef LOG_VERBOSE
MNN_PRINT("create cuda runtime:%p\n", mCUDARuntime.get());
#endif
if (mCUDARuntime.get()) {
if (mCUDARuntime->isCreateError() == true) {
mIsCreateError = true;
@ -66,6 +71,9 @@ float CUDARuntimeWrapper::onGetMemoryInMB() {
}
Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config) const {
#ifdef LOG_VERBOSE
MNN_PRINT("cudaruntime:%p, create CUDABackend\n", this);
#endif
auto mode = mDefaultPrecision;
if (nullptr != config) {
mode = config->precision;
@ -78,9 +86,13 @@ void CUDARuntimeWrapper::onGabageCollect(int level) {
mBufferPool->release(false);
}
CUDABackend::CUDABackend(std::shared_ptr<BufferAllocator> st,
std::shared_ptr<CUDARuntime> rt, bool useFp16AsFp32)
: Backend(MNN_FORWARD_CUDA) {
#ifdef LOG_VERBOSE
MNN_PRINT("cuda backend create\n");
#endif
mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(st.get())));
mStaticBufferPool = st;
mCUDARuntime = rt;
@ -97,6 +109,9 @@ CUDARuntime* CUDABackend::getCUDARuntime() {
MNN_ASSERT(nullptr != mCUDARuntime.get());
return mCUDARuntime.get();
}
const Runtime* CUDABackend::getRuntime() {
return (const Runtime*)mCUDARuntime.get();
}
bool CUDABackend::useFp16() const {
return mUseFp16AsFp32;
}
@ -128,6 +143,7 @@ CPUResizeCache* CUDABackend::getCache() {
}
Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType storageType) {
// MNN_PRINT("onAcquire CUDA memory for tensor:%p\n", nativeTensor);
#ifdef LOG_VERBOSE
MNN_PRINT("Start CUDABackend::onAcquireBuffer !\n");
#endif
@ -181,17 +197,16 @@ size_t CUDABackend::realSize(const Tensor* tensor) {
Execution* CUDABackend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op) {
#ifdef LOG_VERBOSE
MNN_PRINT("Start CUDABackend::onCreate \n");
#endif
// #ifdef LOG_VERBOSE
// MNN_PRINT("Start CUDABackend::onCreate useFp16:%d\n", useFp16());
// #endif
auto creators = gCreator();
auto iter = creators->find(op->type());
if (iter == creators->end()) {
if (nullptr != op->name()) {
MNN_PRINT("Don't support type %s, %s\n", EnumNameOpType(op->type()), op->name()->c_str());
MNN_PRINT("CUDABackend Don't support type %s, %s\n", EnumNameOpType(op->type()), op->name()->c_str());
} else {
MNN_PRINT("Don't support type %s\n", EnumNameOpType(op->type()));
MNN_PRINT("CUDABackend Don't support type %s\n", EnumNameOpType(op->type()));
}
return NULL;
}
@ -199,9 +214,9 @@ Execution* CUDABackend::onCreate(const std::vector<Tensor*>& inputs, const std::
auto exe = iter->second->onCreate(inputs, outputs, op, this);
if (NULL == exe) {
if (nullptr != op->name()) {
MNN_PRINT("The Creator Don't support type %s, %s\n", EnumNameOpType(op->type()), op->name()->c_str());
MNN_PRINT("CUDABackend The Creator Don't support type %s, %s\n", EnumNameOpType(op->type()), op->name()->c_str());
} else {
MNN_PRINT("The Creator Don't support type %s\n", EnumNameOpType(op->type()));
MNN_PRINT("CUDABackend The Creator Don't support type %s\n", EnumNameOpType(op->type()));
}
return NULL;
}
@ -222,6 +237,7 @@ void CUDABackend::onExecuteBegin() const {
void CUDABackend::onExecuteEnd() const {
}
static void _computeStride(MNN_DATA_FORMAT srcDimensionFormat, int* srcStride, int batch, int plane, int channel, int srcPack) {
if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
srcStride[0] = plane * srcPack;
@ -283,9 +299,11 @@ static PackInfo _computePackInfo(MNN_DATA_FORMAT srcDimensionFormat, int batch,
}
void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
auto srcDimensionFormat = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
auto dstDimensionFormat = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
auto srcIndex = TensorUtils::getDescribe(srcTensor)->index;
auto dstIndex = TensorUtils::getDescribe(dstTensor)->index;
auto srcDevice = srcTensor->deviceId() != 0;
auto dstDevice = dstTensor->deviceId() != 0;
MNN_ASSERT(srcDevice || dstDevice);
@ -293,18 +311,6 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
std::pair<void*, int> tempSrcStorage;
auto bytes = getBytes(srcTensor);
auto type = srcTensor->getType();
#ifdef MNN_CUDA_COPY_DEBUG
checkKernelErrors;
MNN_PRINT("CUDA Bn copy: %d -> %d, format %d -> %d, dims: [", srcDevice, dstDevice, srcDimensionFormat, dstDimensionFormat);
for (int i=0; i<srcTensor->dimensions(); ++i) {
MNN_PRINT("%d ", srcTensor->length(i));
if(srcDevice && !dstDevice) {
printf("\n");
}
}
MNN_PRINT("], ");
MNN_PRINT("addr:%p %p\n", srcTensor->deviceId(), dstTensor->deviceId());
#endif
//printf("%d-%d\n", srcTensor->dimensions(), dstTensor->dimensions());
bool directCopy = (srcDimensionFormat == dstDimensionFormat && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) || srcTensor->dimensions() <= 1;
@ -315,17 +321,40 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
}
}
}
#ifdef MNN_CUDA_COPY_DEBUG
checkKernelErrors;
MNN_PRINT("CUDA Bn copy tensor ptr:%p -> ptr:%p deviceId:%d -> %d, hostPtr:%p -> %p, graphIndex: %d -> %d, format %d -> %d, directCopy: %d, dims: [",
srcTensor, dstTensor, srcTensor->deviceId(), dstTensor->deviceId(), srcTensor->host<void>(), dstTensor->host<void>(), srcIndex, dstIndex, srcDimensionFormat, dstDimensionFormat, directCopy);
for (int i=0; i<srcTensor->dimensions(); ++i) {
MNN_PRINT("%d ", srcTensor->length(i));
if(srcDevice && !dstDevice) {
printf("\n");
}
}
MNN_PRINT("], ");
MNN_PRINT("addr:%p %p\n", srcTensor->deviceId(), dstTensor->deviceId());
#endif
if (directCopy) {
auto gpuSize = realSize(srcTensor) * getBytes(srcTensor);
if (srcDevice && dstDevice) {
NVTX_PUSH("DtoD");
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), gpuSize,
MNNMemcpyDeviceToDevice, true);
NVTX_POP();
} else if (srcDevice && (!dstDevice)) {
NVTX_PUSH("DtoH");
mCUDARuntime->memcpy((void*)(dstTensor->host<void>()), (void*)(srcTensor->deviceId()), gpuSize,
MNNMemcpyDeviceToHost, true);
NVTX_POP();
} else if ((!srcDevice) && (dstDevice)) {
NVTX_PUSH("HtoD");
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->host<void>()), gpuSize,
MNNMemcpyHostToDevice, true);
NVTX_POP();
}
return;
}
@ -348,6 +377,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
dstPtr = (uint8_t*)dstTensor->deviceId();
}
NVTX_PUSH("copy convert");
// Format convert
int batch, plane, channel;
_computeBCA(batch, plane, channel, srcDimensionFormat, srcTensor);
@ -368,8 +398,9 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
auto cpuSize = dstTensor->size();
mCUDARuntime->memcpy(dstTensor->host<void>(), dstPtr, cpuSize, MNNMemcpyDeviceToHost,
true);
mStaticBufferPool->free(tempDstStorage);
mStaticBufferPool->free(tempDstStorage);
}
NVTX_POP();
return;
}
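The rewritten fast path above reduces to a single cudaMemcpy once it is known that no layout conversion is required, with an NVTX range around each direction. A condensed sketch of the two decisions involved (names and the plain-int formats are illustrative stand-ins, not MNN's API):
#include <cuda_runtime.h>

// Picks the memcpy kind from where the two tensors live, as the DtoD/DtoH/HtoD
// branches above do.
cudaMemcpyKind pickCopyKind(bool srcOnDevice, bool dstOnDevice) {
    if (srcOnDevice && dstOnDevice)  return cudaMemcpyDeviceToDevice;
    if (srcOnDevice && !dstOnDevice) return cudaMemcpyDeviceToHost;
    return cudaMemcpyHostToDevice;
}

// A raw copy is legal only when both tensors share a layout that is not the
// packed NC4HW4 format, or when the tensor is effectively one-dimensional.
bool canDirectCopy(int srcFormat, int dstFormat, int nc4hw4Format, int dims) {
    return (srcFormat == dstFormat && dstFormat != nc4hw4Format) || dims <= 1;
}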

View File

@ -36,7 +36,7 @@ public:
private:
std::shared_ptr<BufferAllocator> mBufferPool;
std::shared_ptr<CUDARuntime> mCUDARuntime;
std::shared_ptr<CUDARuntime> mCUDARuntime;
bool mIsCreateError{false};
BackendConfig::PrecisionMode mDefaultPrecision;
};
@ -47,6 +47,7 @@ public:
~CUDABackend();
CUDARuntime *getCUDARuntime();
virtual const Runtime* getRuntime() override;
virtual Backend::MemObj* onAcquire(const Tensor *nativeTensor, StorageType storageType) override;
virtual bool onClearBuffer() override;

View File

@ -0,0 +1,167 @@
// CUDATools.hpp
// MNN
//
// Created by MNN on 2022/09/05.
// Copyright © 2022, Alibaba Group Holding Limited
//
#ifndef CUDATools_hpp
#define CUDATools_hpp
/*
Use nvprof by enabling the macro 'MNN_CUDA_PROFILE':
cmake .. -DMNN_CUDA_PROFILE=ON -DMNN_CUDA=ON
*/
#ifdef MNN_CUDA_PROFILE
#include "nvToolsExt.h"
#define NVTX_PUSH(...) nvtxRangePushA(__VA_ARGS__)
#define NVTX_POP(...) nvtxRangePop(__VA_ARGS__)
#else
#define NVTX_PUSH(...)
#define NVTX_POP(...)
#endif
#ifdef MNN_CUDA_PROFILE
#include <stdio.h>
#include <iostream>
#include <memory>
#include <sstream>
#include <vector>
#include <cxxabi.h>
#include <execinfo.h>
// print stack trace log
class MNNLogMessage {
public:
MNNLogMessage(const char* file, int line, const char* function)
:
log_stream_(std::cout)
{
const char* pos = file;
const char* onlyName = file;
while (*pos != 0) {
if (*pos == '/') {
onlyName = pos + 1;
}
pos++;
}
log_stream_ << onlyName << ":"
<< line << ":"
<< function << ": ";
}
~MNNLogMessage() {
log_stream_ << '\n';
}
std::ostream& stream() {
return log_stream_;
}
std::string GetCachedString() {
return "";
}
protected:
std::ostream& log_stream_;
private:
MNNLogMessage(const MNNLogMessage&);
void operator=(const MNNLogMessage&);
};
#if !defined(MNN_BUILD_FOR_ANDROID)
inline std::string
Demangle(char const* msg_str, std::ostringstream& os) {
using std::string;
string msg(msg_str);
size_t symbol_start = string::npos;
size_t symbol_end = string::npos;
if (((symbol_start = msg.find("_Z")) != string::npos) &&
(symbol_end = msg.find_first_of(" +", symbol_start))) {
string left_of_symbol(msg, 0, symbol_start);
string symbol(msg, symbol_start, symbol_end - symbol_start);
string right_of_symbol(msg, symbol_end);
int status = 0;
size_t length = string::npos;
std::unique_ptr<char, void (*)(void* __ptr)> demangled_symbol = {
abi::__cxa_demangle(symbol.c_str(), 0, &length, &status), &std::free};
if (demangled_symbol && status == 0 && length > 0) {
string symbol_str(demangled_symbol.get());
os << left_of_symbol << symbol_str << right_of_symbol;
return os.str();
}
}
return string(msg_str);
}
// By default skip the first frame because
// that belongs to ~FatalLogMessage
inline std::string
StackTrace(size_t start_frame = 2, const size_t stack_size = 12) {
using std::string;
std::ostringstream stacktrace_os;
std::vector<void*> stack(stack_size);
int nframes = backtrace(stack.data(), static_cast<int>(stack_size));
stacktrace_os << "Stack trace:\n";
char** msgs = backtrace_symbols(stack.data(), nframes);
if (msgs != nullptr) {
for (int frameno = start_frame; frameno < nframes; ++frameno) {
stacktrace_os << " [bt] (" << frameno - start_frame << ") ";
string msg = Demangle(msgs[frameno], stacktrace_os);
stacktrace_os << "\n";
}
}
free(msgs);
string stack_trace = stacktrace_os.str();
return stack_trace;
}
class TraceMNNLogMessage : public MNNLogMessage {
public:
TraceMNNLogMessage(const char* file, int line, const char* function) : MNNLogMessage(file, line, function) {}
~TraceMNNLogMessage() {
log_stream_ << "\n" << StackTrace(1);
}
};
#define MNN_LOG_TRACE TraceMNNLogMessage(__FILE__, __LINE__, __FUNCTION__)
#else
#define MNN_LOG_TRACE MNNLogMessage(__FILE__, __LINE__, __FUNCTION__)
#endif
#define MNN_LOG_INFO MNNLogMessage(__FILE__, __LINE__, __FUNCTION__)
#define MNN_LOG(severity) MNN_LOG_##severity.stream()
#define MNN_LOG_IF(severity, condition) \
!(condition) ? (void)0 : MNNLogMessageVoidify() & MNN_LOG(severity)
// deal with release/debug
// debug mode: MNN_DLOG() forwards to MNN_LOG()
#if !defined(NDEBUG)
#define MNN_DLOG(severity) MNN_LOG(severity)
#define MNN_DLOG_IF(severity, condition) MNN_LOG_IF(severity, (condition))
// release mode: all MNN_DLOG() code is compiled away.
#else
#define MNN_DLOG(severity) MNN_LOG_IF(severity, false)
#define MNN_DLOG_IF(severity, condition) MNN_LOG_IF(severity, false)
#endif
// This class is used to explicitly ignore values in the conditional
// logging macros. This avoids compiler warnings like "value computed
// is not used" and "statement has no effect".
class MNNLogMessageVoidify {
public:
MNNLogMessageVoidify() {}
// This has to be an operator with a precedence lower than << but
// higher than "?:". See its usage.
void operator&(std::ostream&) {}
};
#endif
#endif
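A minimal usage sketch for the helpers this new header introduces, assuming MNN_CUDA_PROFILE is defined at build time and the header is on the include path; the function name profiledCopy is illustrative and not part of the diff:
#include "CUDATools.hpp"   // assumption: header reachable on the include path

// NVTX_PUSH/NVTX_POP expand to nothing when MNN_CUDA_PROFILE is off, so call
// sites stay unconditional; with profiling on, the region shows up as a named
// range in nvprof / Nsight Systems.
void profiledCopy() {
    NVTX_PUSH("HtoD copy");
    // ... issue cudaMemcpyAsync / kernel launches here ...
    NVTX_POP();
#ifdef MNN_CUDA_PROFILE
    MNN_LOG(INFO) << "copy issued";   // prints "<file>:<line>:<function>: copy issued"
#endif
}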

View File

@ -15,14 +15,9 @@
#include <utility>
#include <vector>
#include "core/Macro.h"
// #define MNN_CUDA_USE_BLAS
//#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp>
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)
#undef STR
#undef STR_HELPER
namespace MNN {
@ -32,7 +27,7 @@ bool CUDARuntime::isCreateError() const {
CUDARuntime::CUDARuntime(int device_id) {
#ifdef LOG_VERBOSE
MNN_PRINT("start CUDARuntime !\n");
MNN_PRINT("start CUDARuntime id:%d\n", device_id);
#endif
int version;
cuda_check(cudaRuntimeGetVersion(&version));
@ -40,6 +35,7 @@ CUDARuntime::CUDARuntime(int device_id) {
if (id < 0) {
cuda_check(cudaGetDevice(&id));
}
// printf("use GPU device id:%d\n", id);
// id = selectDeviceMaxFreeMemory();
// cuda_check(cudaSetDevice(id));
@ -68,23 +64,24 @@ int CUDARuntime::selectDeviceMaxFreeMemory() {
cudaDeviceProp deviceProp;
int deviceCount;
cuda_check(cudaGetDeviceCount(&deviceCount));
// Check id:0 card info
int id = 0;
cuda_check(cudaSetDevice(0));
size_t total_size, free_size_max;
cuda_check(cudaMemGetInfo(&free_size_max, &total_size));
//printf("card:0, free:%zu, total:%zu\n", free_size_max, total_size);
size_t total_size = 0, free_size_max = 0;
cudaError_t memStatus = cudaMemGetInfo(&free_size_max, &total_size);
cuda_check(memStatus);
// printf("card:0, free:%zu, total:%zu, memStatusSuccess:%d\n", free_size_max, total_size, memStatus == cudaSuccess);
for(int i = 1; i < deviceCount; i++) {
cuda_check(cudaSetDevice(i));
size_t free_size;
cuda_check(cudaMemGetInfo(&free_size, &total_size));
cuda_check(cudaMemGetInfo(&free_size, &total_size));
if(free_size > free_size_max) {
free_size_max = free_size;
id = i;
}
//printf("card:%d, free:%zu, total:%zu\n", i, free_size, total_size);
}
// printf("card:%d, free:%zu, total:%zu\n", i, free_size, total_size);
}
return id;
}
@ -102,7 +99,7 @@ size_t CUDARuntime::blocks_num(const size_t total_threads) {
// } else {
// mThreadPerBlock = 128;
// }
mThreadPerBlock = 128;
return (total_threads + mThreadPerBlock - 1) / mThreadPerBlock;
}
@ -170,4 +167,7 @@ void CUDARuntime::memset(void *dst, int value, size_t size_in_bytes) {
cuda_check(cudaMemset(dst, value, size_in_bytes));
checkKernelErrors;
}
} // namespace MNN
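The hunk above hardens selectDeviceMaxFreeMemory against cudaMemGetInfo failures. A self-contained sketch of the same selection policy, with simplified error handling and not the MNN implementation itself:
#include <cuda_runtime.h>

// Query every visible device and keep the one reporting the most free memory;
// devices whose query fails are skipped instead of aborting the search.
int selectDeviceWithMaxFreeMemory() {
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount == 0) {
        return -1;
    }
    int best = 0;
    size_t bestFree = 0;
    for (int i = 0; i < deviceCount; ++i) {
        (void)cudaSetDevice(i);
        size_t freeBytes = 0, totalBytes = 0;
        if (cudaMemGetInfo(&freeBytes, &totalBytes) != cudaSuccess) {
            continue;
        }
        if (freeBytes > bestFree) {
            bestFree = freeBytes;
            best = i;
        }
    }
    return best;
}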

View File

@ -1,4 +1,5 @@
#include "ArgMaxExecution.hpp"
#include "ArgMinExecution.hpp"
#include "core/TensorUtils.hpp"
namespace MNN {
@ -14,9 +15,9 @@ __global__ void ARGMAX(const int count, const int outside, const int inside, con
int* outPtr = output + inside * o;
const T* inpPtr = input + inside * dim * o;
int index = 0;
T maxValue = inpPtr[n+0*inside];
T maxValue = inpPtr[n + 0 * inside];
for(int j=1; j<dim; j++) {
T value = inpPtr[n+j*inside];
T value = inpPtr[n + j * inside];
if(maxValue < value) {
index = j;
maxValue = value;
@ -24,6 +25,7 @@ __global__ void ARGMAX(const int count, const int outside, const int inside, con
}
outPtr[n] = index;
}
return;
}
ArgMaxExecution::ArgMaxExecution(const Op* op, Backend *backend) : Execution(backend) {
@ -60,10 +62,10 @@ ErrorCode ArgMaxExecution::onExecute(const std::vector<Tensor *> &inputs, const
auto input = (void *)inputs[0]->deviceId();
auto output = (void *)outputs[0]->deviceId();
int count = mOutside * mInside;
int block_num = runtime->blocks_num(count);
int thread_num = runtime->threads_num();
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);
if(bytes == 4) {
@ -73,7 +75,7 @@ ErrorCode ArgMaxExecution::onExecute(const std::vector<Tensor *> &inputs, const
ARGMAX<<<block_num, thread_num>>>(count, mOutside, mInside, mDim, (const half*)input,(int *)output);
checkKernelErrors;
}
return NO_ERROR;
}
class ArgMaxCreator : public CUDABackend::Creator {
@ -84,10 +86,17 @@ public:
if (TensorUtils::getDescribe(input)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
return nullptr;
}
return new ArgMaxExecution(op, backend);
if (op->type() == OpType_ArgMax) {
return new ArgMaxExecution(op, backend);
} else {
return new ArgMinExecution(op, backend);
}
}
};
static CUDACreatorRegister<ArgMaxCreator> __init(OpType_ArgMax);
static CUDACreatorRegister<ArgMaxCreator> __init_op2(OpType_ArgMin);
}
}
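For reference, the outside/inside/dim decomposition shared by ARGMAX above and the new ARGMIN kernel can be checked against this CPU sketch; it is an illustrative helper, not part of MNN:
#include <vector>

// For shape dims and reduce axis: outside = prod(dims[0..axis)), dim = dims[axis],
// inside = prod(dims[axis+1..)). Element (o, j, n) sits at (o * dim + j) * inside + n,
// which matches inpPtr[n + j * inside] in the kernels.
std::vector<int> argmaxReference(const std::vector<float>& data,
                                 const std::vector<int>& dims, int axis) {
    int outside = 1, inside = 1;
    for (int i = 0; i < axis; ++i) outside *= dims[i];
    for (int i = axis + 1; i < (int)dims.size(); ++i) inside *= dims[i];
    const int dim = dims[axis];

    std::vector<int> out((size_t)outside * inside);
    for (int o = 0; o < outside; ++o) {
        for (int n = 0; n < inside; ++n) {
            const float* src = data.data() + (size_t)o * dim * inside;
            int index = 0;
            float maxValue = src[n];
            for (int j = 1; j < dim; ++j) {
                float v = src[n + j * inside];
                if (v > maxValue) { maxValue = v; index = j; }
            }
            out[(size_t)o * inside + n] = index;
        }
    }
    return out;
}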

View File

@ -0,0 +1,90 @@
//
// ArgMinExecution.cpp
// MNN
//
// Created by MNN on 2022/06/29.
// Copyright © 2018 - 2022, Alibaba Group Holding Limited
//
#include "ArgMinExecution.hpp"
#include "core/TensorUtils.hpp"
#include <MNN/AutoTime.hpp>
namespace MNN {
namespace CUDA {
template <typename T>
__global__ void ARGMIN(const int count, const int outside, const int inside, const int dim,
const T *input, int *output) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
const int o = i / inside;
const int n = i % inside;
int* outPtr = output + inside * o;
const T* inpPtr = input + inside * dim * o;
int index = 0;
T minValue = inpPtr[n + 0 * inside];
for(int j=1; j<dim; j++) {
T value = inpPtr[n + j * inside];
if(minValue > value) {
index = j;
minValue = value;
}
}
outPtr[n] = index;
}
return;
}
ArgMinExecution::ArgMinExecution(const Op* op, Backend *backend) : Execution(backend) {
mOp = op;
mAxis = mOp->main_as_ArgMax()->axis();
}
ArgMinExecution::~ArgMinExecution(){
// Do nothing
}
ErrorCode ArgMinExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto input = inputs[0];
auto output = outputs[0];
if (mAxis < 0) {
mAxis = input->dimensions() + mAxis;
}
mInside = 1;
mOutside = 1;
for (int i=0; i<mAxis; ++i) {
mOutside *= input->length(i);
}
for (int i=mAxis+1; i<input->dimensions(); ++i) {
mInside *= input->length(i);
}
mDim = input->length(mAxis);
return NO_ERROR;
}
ErrorCode ArgMinExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = static_cast<CUDABackend *>(backend())->getCUDARuntime();
auto input = (void *)inputs[0]->deviceId();
auto output = (void *)outputs[0]->deviceId();
int count = mOutside * mInside;
int block_num = runtime->blocks_num(count);
int thread_num = runtime->threads_num();
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);
if(bytes == 4) {
ARGMIN<<<block_num, thread_num>>>(count, mOutside, mInside, mDim, (const float*)input,(int *)output);
checkKernelErrors;
} else {
ARGMIN<<<block_num, thread_num>>>(count, mOutside, mInside, mDim, (const half*)input,(int *)output);
checkKernelErrors;
}
return NO_ERROR;
}
}
}

View File

@ -0,0 +1,33 @@
//
// ArgMinExecution.hpp
// MNN
//
// Created by MNN on 2022/06/29.
// Copyright © 2018 - 2022, Alibaba Group Holding Limited
//
#ifndef ArgMinExecution_hpp
#define ArgMinExecution_hpp
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
namespace MNN {
namespace CUDA {
class ArgMinExecution : public Execution {
public:
ArgMinExecution(const Op* op, Backend *backend);
virtual ~ArgMinExecution();
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
const Op* mOp;
int mAxis;
int mInside;
int mOutside;
int mDim;
};
} // namespace CUDA
} // namespace MNN
#endif

View File

@ -32,8 +32,9 @@ __global__ void LOGICALOR(const T *input0, const T* input1, T *output, size_t co
}
return;
}
BinaryExecution::BinaryExecution(int opType, Backend *backend) : Execution(backend) {
BinaryExecution::BinaryExecution(int opType, Backend *backend, int activationType) : Execution(backend) {
mType = opType;
mActivationType = activationType;
}
BinaryExecution::~BinaryExecution(){
// Do nothing
@ -59,7 +60,7 @@ ErrorCode BinaryExecution::onExecute(const std::vector<Tensor *> &inputs, const
auto input0 = (uint8_t*)input0T->deviceId();
auto input1 = (uint8_t*)input1T->deviceId();
auto output = (uint8_t*)outputT->deviceId();
BinaryBlit(output, input0, input1, size, stride0, stride1, stride2, type, runtime, mType);
BinaryBlit(output, input0, input1, size, stride0, stride1, stride2, type, runtime, mType, mActivationType);
};
computeFunction(inputs[0], inputs[1], outputs[0]);
for (int i=2; i<inputs.size(); ++i) {
@ -72,7 +73,8 @@ public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
if (op->type() == OpType_BinaryOp) {
return new BinaryExecution(op->main_as_BinaryOp()->opType(), backend);
//MNN_PRINT("binary act:%d\n", op->main_as_BinaryOp()->activationType());
return new BinaryExecution(op->main_as_BinaryOp()->opType(), backend, op->main_as_BinaryOp()->activationType());
}
if (op->type() == OpType_Eltwise) {
switch (op->main_as_Eltwise()->type()) {

View File

@ -15,12 +15,13 @@ namespace MNN {
namespace CUDA {
class BinaryExecution : public Execution {
public:
BinaryExecution(int opType, Backend *backend);
BinaryExecution(int opType, Backend *backend, int activationType = 0);
virtual ~BinaryExecution();
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
int mType;
int mActivationType;
};
} // namespace CUDA
} // namespace MNN
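The new activationType parameter lets a binary op fuse an activation into the same pass. The sketch below shows the general idea only; the 0/1/2 mapping is an assumption for illustration, and the real dispatch lives inside BinaryBlit, which is not shown in this diff:
#include <algorithm>

// Optional fused activation applied to each element after the binary op
// (assumed mapping: 0 = none, 1 = ReLU, 2 = ReLU6).
inline float applyActivation(float v, int activationType) {
    switch (activationType) {
        case 1:  return std::max(v, 0.0f);                   // ReLU
        case 2:  return std::min(std::max(v, 0.0f), 6.0f);   // ReLU6
        default: return v;                                    // no activation
    }
}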

View File

@ -8,8 +8,6 @@
#include "ConvCutlassExecution.hpp"
#include "Raster.cuh"
#include "MNNCUDADefine.hpp"
#include "MNNCUDAFunction.cuh"
//#define DEBUG
@ -230,8 +228,8 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
mGemmInfo.elh[1] = l;
mGemmInfo.elh[2] = h;
mGemmInfo.elhPad[0] = UP_DIV(e, 8) * 8;
mGemmInfo.elhPad[1] = UP_DIV(l, 8) * 8;;
mGemmInfo.elhPad[2] = UP_DIV(h, 8) * 8;;
mGemmInfo.elhPad[1] = UP_DIV(l, 8) * 8;
mGemmInfo.elhPad[2] = UP_DIV(h, 8) * 8;
//MNN_PRINT("Activate:%d \n", mActivationType);
//MNN_PRINT("Im2Col%d-%d-%d temp size:%zu!!!\n\n",output->width(), ic, mIm2ColParamter.kernelX, (size_t)sizeof(__half) * mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK);
@ -283,10 +281,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = Gemm_F16_Relu_Sm70::get_workspace_size(arguments);
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
pool->free(buffer3);
pool->free(bufferWs);
// Check the problem size is supported or not
cutlass::Status status = mGemmF16ReluSm70.can_implement(arguments);
@ -307,10 +305,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = Gemm_F32_Relu_Sm70::get_workspace_size(arguments);
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
pool->free(buffer3);
pool->free(bufferWs);
// Check the problem size is supported or not
cutlass::Status status = mGemmF32ReluSm70.can_implement(arguments);
@ -335,10 +333,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = Gemm_F16_Relu6_Sm70::get_workspace_size(arguments);
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
pool->free(buffer3);
pool->free(bufferWs);
// Check the problem size is supported or not
cutlass::Status status = mGemmF16Relu6Sm70.can_implement(arguments);
@ -359,10 +357,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = Gemm_F32_Relu6_Sm70::get_workspace_size(arguments);
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
pool->free(buffer3);
pool->free(bufferWs);
// Check the problem size is supported or not
cutlass::Status status = mGemmF32Relu6Sm70.can_implement(arguments);
@ -385,10 +383,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = Gemm_F16_Linear_Sm70::get_workspace_size(arguments);
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
pool->free(buffer3);
pool->free(bufferWs);
cutlass::Status status = mGemmF16LnSm70.can_implement(arguments);
cutlass_check(status);
@ -406,10 +404,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = Gemm_F32_Linear_Sm70::get_workspace_size(arguments);
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
pool->free(buffer3);
pool->free(bufferWs);
cutlass::Status status = mGemmF32LnSm70.can_implement(arguments);
cutlass_check(status);
@ -435,10 +433,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = Gemm_F16_Relu_Sm75::get_workspace_size(arguments);
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
pool->free(buffer3);
pool->free(bufferWs);
// Check the problem size is supported or not
cutlass::Status status = mGemmF16ReluSm75.can_implement(arguments);
@ -459,10 +457,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = Gemm_F32_Relu_Sm75::get_workspace_size(arguments);
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
pool->free(buffer3);
pool->free(bufferWs);
// Check the problem size is supported or not
cutlass::Status status = mGemmF32ReluSm75.can_implement(arguments);
@ -487,10 +485,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = Gemm_F16_Relu6_Sm75::get_workspace_size(arguments);
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
pool->free(buffer3);
pool->free(bufferWs);
// Check the problem size is supported or not
cutlass::Status status = mGemmF16Relu6Sm75.can_implement(arguments);
@ -511,10 +509,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = Gemm_F32_Relu6_Sm75::get_workspace_size(arguments);
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
pool->free(buffer3);
pool->free(bufferWs);
// Check the problem size is supported or not
cutlass::Status status = mGemmF32Relu6Sm75.can_implement(arguments);
@ -537,10 +535,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = Gemm_F16_Linear_Sm75::get_workspace_size(arguments);
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
pool->free(buffer3);
pool->free(bufferWs);
cutlass::Status status = mGemmF16LnSm75.can_implement(arguments);
cutlass_check(status);
@ -558,10 +556,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
split_k_slices}; // <- k-dimension split factor
size_t workspace_size = Gemm_F32_Linear_Sm75::get_workspace_size(arguments);
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
pool->free(buffer3);
pool->free(bufferWs);
cutlass::Status status = mGemmF32LnSm75.can_implement(arguments);
cutlass_check(status);
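A small sketch of the UP_DIV(x, 8) * 8 padding used for mGemmInfo.elhPad above: each GEMM dimension is rounded up to a multiple of 8 before being handed to the CUTLASS kernels. The helper names are illustrative:
// UP_DIV(x, y) in MNN expands to (x + y - 1) / y, so rounding a dimension up
// to a multiple of 8 is UP_DIV(x, 8) * 8.
constexpr int upDiv(int x, int y) { return (x + y - 1) / y; }
constexpr int alignTo8(int x) { return upDiv(x, 8) * 8; }

static_assert(alignTo8(1) == 8,  "1 rounds up to 8");
static_assert(alignTo8(8) == 8,  "multiples of 8 are unchanged");
static_assert(alignTo8(9) == 16, "9 rounds up to 16");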

View File

@ -11,15 +11,12 @@
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
#include "CutlassGemmParam.hpp"
#include "MNNCUDADefine.hpp"
#include "MNNCUDAFunction.cuh"
namespace MNN {
namespace CUDA {
struct CutlassGemmInfo{
int elh[3];
int elhPad[3];
};
class ConvCutlassExecution : public Execution {
public:
struct Resource {

View File

@ -13,7 +13,10 @@ __global__ void CONV_DW(const T* input,
const half* kernel,
const half* bias,
T *output,
const constBuffer* uConstant
const constBuffer* uConstant,
DivModFast d_oc,
DivModFast d_ow,
DivModFast d_oh
) {
float maxV = uConstant->maxValue;
float minV = uConstant->minValue;
@ -32,17 +35,18 @@ __global__ void CONV_DW(const T* input,
int pw = uConstant->pad[0];
int ph = uConstant->pad[1];
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) {
int i = index / c_p;
int oz = index % c_p;
int ob = i / (ow * oh);
int tmp = i % (ow * oh);
int oy = tmp / ow;
int ox = tmp % ow;
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total/2; index += blockDim.x * gridDim.x) {
int oz_2, tmp2, oy, ox, tmp1, ob;
d_oc.divmod(index, tmp1, oz_2);
d_ow.divmod(tmp1, tmp2, ox);
d_oh.divmod(tmp2, ob, oy);
int oz = oz_2 << 1;
int ix = ox * sw - pw;
int iy = oy * sh - ph;
float color = bias[oz];
float color0 = bias[oz];
float color1 = bias[oz+1];
int fxSta = max(0, (UP_DIV(-ix, dw)));
int fySta = max(0, (UP_DIV(-iy, dh)));
int fxEnd = min(kw, UP_DIV(iw - ix, dw));
@ -52,32 +56,190 @@ __global__ void CONV_DW(const T* input,
int sy = fy*dh + iy;
for (fx=fxSta; fx<fxEnd; ++fx) {
int sx = fx*dw + ix;
float inp = input[0
+ sx * c_p
+ sy * iw * c_p
+ ob * iw * ih * c_p
+ oz
];
float ker = kernel[0
+ fx
+ fy * kw
+ oz * kw * kh
];
color = color + inp * ker;
int src_offset = ((ob * ih + sy) * iw + sx) * c_p + oz;
float inp0 = input[src_offset];
float inp1 = input[src_offset+1];
float ker0 = kernel[(fy * kw + fx) * c_p + oz];
float ker1 = kernel[(fy * kw + fx) * c_p + oz + 1];
color0 = color0 + inp0 * ker0;
color1 = color1 + inp1 * ker1;
}
}
color = max(color, minV);
color = min(color, maxV);
color0 = max(color0, minV);
color0 = min(color0, maxV);
output[0
+ ox * c_p
+ oy * ow * c_p
+ ob * ow * oh * c_p
+ oz
] = color;
color1 = max(color1, minV);
color1 = min(color1, maxV);
int dst_offset = ((ob * oh + oy) * ow + ox) * c_p + oz;
output[dst_offset] = color0;
output[dst_offset+1] = color1;
}
}
__global__ void CONV_DW_HALF2_OPT(const half2* input,
const half2* kernel,
const half2* bias,
half2 *output,
const constBuffer* uConstant,
DivModFast d_oc,
DivModFast d_ow,
DivModFast d_oh
) {
float maxV = uConstant->maxValue;
float minV = uConstant->minValue;
int iw = uConstant->inputSize[0];
int ih = uConstant->inputSize[1];
int c = uConstant->channel;
int c_p = c * PACK_NUMBER / 2;
int ow = uConstant->outputSize[0];
int oh = uConstant->outputSize[1];
int kw = uConstant->kernelSize[0];
int kh = uConstant->kernelSize[1];
int sw = uConstant->stride[0];
int sh = uConstant->stride[1];
int pw = uConstant->pad[0];
int ph = uConstant->pad[1];
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total/2; index += blockDim.x * gridDim.x) {
int oz_2, tmp2, oy, ox, tmp1, ob;
d_oc.divmod(index, tmp1, oz_2);
d_ow.divmod(tmp1, tmp2, ox);
d_oh.divmod(tmp2, ob, oy);
int oz = oz_2;
int ix = ox * sw - pw;
int iy = oy * sh - ph;
half2 color = bias[oz];
int fxSta = max(0, -ix);
int fySta = max(0, -iy);
int fxEnd = min(kw, iw - ix);
int fyEnd = min(kh, ih - iy);
int fx, fy, fz;
for (fy=fySta; fy<fyEnd; ++fy) {
int sy = fy + iy;
for (fx=fxSta; fx<fxEnd; ++fx) {
int sx = fx + ix;
int src_offset = ((ob * ih + sy) * iw + sx) * c_p + oz;
half2 inp = input[src_offset];
half2 ker = kernel[(fy * kw + fx) * c_p + oz];
color = __hfma2(inp, ker, color);
}
}
color.x = max(color.x, minV);
color.x = min(color.x, maxV);
color.y = max(color.y, minV);
color.y = min(color.y, maxV);
int dst_offset = ((ob * oh + oy) * ow + ox) * c_p + oz;
output[dst_offset] = color;
}
}
__global__ void CONV_DW3x3_HALF2_OPT(const half2* input,
const half2* kernel,
const half2* bias,
half2 *output,
const constBuffer* uConstant,
DivModFast d_oc,
DivModFast d_ow,
DivModFast d_oh
) {
float maxV = uConstant->maxValue;
float minV = uConstant->minValue;
int iw = uConstant->inputSize[0];
int ih = uConstant->inputSize[1];
int c = uConstant->channel;
int c_p = c * PACK_NUMBER / 2;
int ow = uConstant->outputSize[0];
int oh = uConstant->outputSize[1];
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total/4; index += blockDim.x * gridDim.x) {
int oz_2, tmp2, oy, ox_2, tmp1, ob;
d_oc.divmod(index, tmp1, oz_2);
d_ow.divmod(tmp1, tmp2, ox_2);
d_oh.divmod(tmp2, ob, oy);
int oz = oz_2;
int ox = ox_2 << 1;
int ix = ox - 1;
int iy = oy - 1;
half2 color0 = bias[oz];
half2 color1 = color0;
half2 zero;
zero.x = (half)0.0;
zero.y = (half)0.0;
half2 inp[12];
half2 ker[3][3];
for(int j=0; j<3; j++) {
if(iy < 0 && j==0) {
for(int i=0; i<4; i++) {
inp[i] = zero;
}
continue;
}
if(iy+2 > ih-1 && j==2) {
for(int i=0; i<4; i++) {
inp[8+i] = zero;
}
continue;
}
for(int i=0; i<4; i++) {
if(ix < 0 && i==0) {
for(int j=0; j<3; j++) {
inp[4*j+0] = zero;
}
continue;
}
if(ix+3 > iw-1 && i==3) {
for(int j=0; j<3; j++) {
inp[4*j+3] = zero;
}
continue;
}
int src_offset = ((ob * ih + iy+j) * iw + ix+i) * c_p + oz;
inp[4*j+i] = input[src_offset];
}
}
for(int j=0; j<3; j++) {
for(int i=0; i<3; i++) {
ker[j][i] = kernel[(j * 3 + i) * c_p + oz];
}
}
for(int j=0; j<3; j++) {
for(int i=0; i<3; i++) {
color0 = __hfma2(inp[4*j+i], ker[j][i], color0);
color1 = __hfma2(inp[4*j+i+1], ker[j][i], color1);
}
}
color0.x = max(color0.x, minV);
color0.x = min(color0.x, maxV);
color0.y = max(color0.y, minV);
color0.y = min(color0.y, maxV);
color1.x = max(color1.x, minV);
color1.x = min(color1.x, maxV);
color1.y = max(color1.y, minV);
color1.y = min(color1.y, maxV);
int dst_offset = ((ob * oh + oy) * ow + ox) * c_p + oz;
output[dst_offset] = color0;
output[dst_offset+c_p] = color1;
}
}
__global__ void CONV_DW_OPT(const float* input, const half* kernel, const half* bias, float *output, const constBuffer* uConstant,
DivModFast d_oc,
@ -88,6 +250,8 @@ __global__ void CONV_DW_OPT(const float* input, const half* kernel, const half*
float minV = uConstant->minValue;
int iw = uConstant->inputSize[0];
int ih = uConstant->inputSize[1];
int ow = uConstant->outputSize[0];
int oh = uConstant->outputSize[1];
int kw = uConstant->kernelSize[0];
int kh = uConstant->kernelSize[1];
int sw = uConstant->stride[0];
@ -97,15 +261,18 @@ __global__ void CONV_DW_OPT(const float* input, const half* kernel, const half*
int c = uConstant->channel;
int c_p = c * PACK_NUMBER;
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) {
int oz, tmp2, oy, ox, tmp1, ob;
d_oc.divmod(index, tmp1, oz);
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total / 2; index += blockDim.x * gridDim.x) {
int oz_2, tmp2, oy, ox, tmp1, ob;
d_oc.divmod(index, tmp1, oz_2);
d_ow.divmod(tmp1, tmp2, ox);
d_oh.divmod(tmp2, ob, oy);
int oz = oz_2 << 1;
int ix = ox * sw - pw;
int iy = oy * sh - ph;
float color = bias[oz];
float color0 = bias[oz];
float color1 = bias[oz+1];
int fxSta = max(0, -ix);
int fySta = max(0, -iy);
int fxEnd = min(kw, iw - ix);
@ -115,25 +282,28 @@ __global__ void CONV_DW_OPT(const float* input, const half* kernel, const half*
int sy = fy + iy;
for (fx=fxSta; fx<fxEnd; ++fx) {
int sx = fx + ix;
float inp = input[0
+ sx * c_p
+ sy * iw * c_p
+ ob * iw * ih * c_p
+ oz
];
float ker = kernel[0
+ fx
+ fy * kw
+ oz * kw * kh
];
color = color + inp * ker;
}
}
color = max(color, minV);
color = min(color, maxV);
int src_offset = ((ob * ih + sy) * iw + sx) * c_p + oz;
float inp0 = input[src_offset];
float inp1 = input[src_offset+1];
output[index] = color;
}
float ker0 = kernel[(fy * kw + fx) * c_p + oz];
float ker1 = kernel[(fy * kw + fx) * c_p + oz + 1];
color0 = color0 + inp0 * ker0;
color1 = color1 + inp1 * ker1;
}
}
color0 = max(color0, minV);
color0 = min(color0, maxV);
color1 = max(color1, minV);
color1 = min(color1, maxV);
int dst_offset = ((ob * oh + oy) * ow + ox) * c_p + oz;
output[dst_offset] = color0;
output[dst_offset+1] = color1;
}
}
static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op* op, Backend* bn) {
@ -162,21 +332,22 @@ static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op*
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize);
auto tempWeightStorage = pool->alloc(weightSize * sizeof(float));
auto tempWeightStorage = pool->alloc(depthC * PACK_NUMBER * kernelY * kernelX * sizeof(float));
auto tempWeight = (uint8_t*)tempWeightStorage.first + tempWeightStorage.second;
cuda_check(cudaMemset(tempWeight, 0, depthC * PACK_NUMBER * kernelY * kernelX * sizeof(float)));
cuda_check(cudaMemcpy(tempWeight, filterDataPtr, weightSize*sizeof(float), cudaMemcpyHostToDevice));
reg.size[0] = 1;
reg.size[1] = depthC * PACK_NUMBER;
reg.size[2] = kernelY * kernelX;
reg.size[1] = kernelY * kernelX;
reg.size[2] = depthC * PACK_NUMBER;
reg.srcStride[0] = 0;
reg.srcStride[1] = kernelY * kernelX;
reg.srcStride[2] = 1;
reg.srcStride[1] = 1;
reg.srcStride[2] = kernelY * kernelX;
reg.dstStride[0] = 0;
reg.dstStride[1] = kernelY * kernelX;
reg.dstStride[1] = depthC * PACK_NUMBER;
reg.dstStride[2] = 1;
offset[0] = 1;
offset[1] = depth;
offset[2] = kernelY * kernelX;
offset[1] = kernelY * kernelX;
offset[2] = depth;
offset[3] = 0;
offset[4] = 1;
offset[5] = reg.size[1];
@ -276,7 +447,7 @@ ErrorCode ConvDepthWiseExecution::onResize(const std::vector<Tensor *> &inputs,
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
runtime->memcpy((uint8_t*)mConstBuffer.first + mConstBuffer.second, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice);
mTotalCount = parameters.total;
//printf("%d-%d-%d-%d, %d-%d-%d-%d-%d\n", parameters.kernelSize[0], parameters.kernelSize[1], parameters.stride[0], parameters.stride[1], parameters.inputSize[0], parameters.inputSize[1], channel, parameters.outputSize[0], parameters.outputSize[1]);
//MNN_PRINT("%d-%d-%d-%d, %d-%d-%d-%d-%d\n", parameters.kernelSize[0], parameters.kernelSize[1], parameters.stride[0], parameters.stride[1], parameters.inputSize[0], parameters.inputSize[1], channel, parameters.outputSize[0], parameters.outputSize[1]);
return NO_ERROR;
}
@ -287,10 +458,31 @@ ErrorCode ConvDepthWiseExecution::onExecute(const std::vector<Tensor *> &inputs,
int threads_num = ALIMIN(prop.maxThreadsPerBlock/2, limitThreads);
int block_num = prop.multiProcessorCount;
auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second;
DivModFast d_oc(parameters.channel * PACK_NUMBER / 2);
DivModFast d_ow(parameters.outputSize[0]);
DivModFast d_oh(parameters.outputSize[1]);
if (static_cast<CUDABackend*>(backend())->useFp16()) {
if (inputs.size() == 1) {
if(parameters.kernelSize[0]==3 && parameters.kernelSize[1]==3 && parameters.stride[0]==1 && parameters.stride[1]==1 && parameters.pad[0]==1 && parameters.pad[1]==1 && parameters.outputSize[0] % 2 ==0) {
DivModFast d_ow2(parameters.outputSize[0]/2);
CONV_DW3x3_HALF2_OPT<<<block_num, threads_num>>>((const half2*)inputs[0]->deviceId(), (const half2*)mResource->mFilter,
(const half2*)mResource->mBias, (half2*)outputs[0]->deviceId(), (const constBuffer*)(constPtr),
d_oc, d_ow2, d_oh);
checkKernelErrors;
return NO_ERROR;
}
if(parameters.dilate[0] == 1 && parameters.dilate[1] == 1) {
CONV_DW_HALF2_OPT<<<block_num, threads_num>>>((const half2*)inputs[0]->deviceId(), (const half2*)mResource->mFilter,
(const half2*)mResource->mBias, (half2*)outputs[0]->deviceId(), (const constBuffer*)(constPtr),
d_oc, d_ow, d_oh);//_HALF_OPT
checkKernelErrors;
} else {
CONV_DW<<<block_num, threads_num>>>((const half*)inputs[0]->deviceId(), (const half*)mResource->mFilter,
(const half*)mResource->mBias, (half*)outputs[0]->deviceId(), (const constBuffer*)(constPtr));
(const half*)mResource->mBias, (half*)outputs[0]->deviceId(), (const constBuffer*)(constPtr),
d_oc, d_ow, d_oh);
checkKernelErrors;
}
return NO_ERROR;
}
@ -298,18 +490,16 @@ ErrorCode ConvDepthWiseExecution::onExecute(const std::vector<Tensor *> &inputs,
if (inputs.size() == 1) {
// block_num = runtime->blocks_num(mTotalCount);
// threads_num = runtime->threads_num();
if(parameters.dilate[0] == 1 && parameters.dilate[1] == 1) {
const int area = parameters.outputSize[0] * parameters.outputSize[1];
DivModFast d_oc(parameters.channel * PACK_NUMBER);
DivModFast d_ow(parameters.outputSize[0]);
DivModFast d_oh(parameters.outputSize[1]);
if(parameters.dilate[0] == 1 && parameters.dilate[1] == 1) {
CONV_DW_OPT<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter,
(const half*)mResource->mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr),
d_oc, d_ow, d_oh);
checkKernelErrors;
} else {
CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter,
(const half*)mResource->mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr));
(const half*)mResource->mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr),
d_oc, d_ow, d_oh);
checkKernelErrors;
}
}
return NO_ERROR;
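CPU sketch of the index decomposition the rewritten depthwise kernels above use. DivModFast::divmod(index, quot, rem) behaves like quot = index / d, rem = index % d (argument order inferred from the call sites), and each thread now handles two packed channels, hence oz = oz_2 * 2:
struct DwIndex { int ob, oy, ox, oz; };

// Decodes a flat thread index into (batch, oy, ox, channel) with the channel
// pair innermost, matching dst_offset = ((ob * oh + oy) * ow + ox) * c_p + oz.
DwIndex decodeDepthwiseIndex(int index, int channelPairs, int ow, int oh) {
    int tmp1 = index / channelPairs;   // d_oc.divmod(index, tmp1, oz_2)
    int oz_2 = index % channelPairs;
    int tmp2 = tmp1 / ow;              // d_ow.divmod(tmp1, tmp2, ox)
    int ox   = tmp1 % ow;
    int ob   = tmp2 / oh;              // d_oh.divmod(tmp2, ob, oy)
    int oy   = tmp2 % oh;
    return {ob, oy, ox, oz_2 * 2};
}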

View File

@ -13,6 +13,9 @@
#include "MNNCUDADefine.hpp"
#include "MNNCUDAFunction.cuh"
#include "common/MemoryFormater.h"
#include "backend/cuda/core/CUDATools.hpp"
// 16 / sizeof(int4)
namespace MNN {
namespace CUDA {
@ -100,6 +103,8 @@ ConvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
auto offsetGpuStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(int) * maxOffsetNumber);
auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second;
NVTX_PUSH("cuda_conv_weight");
// Reorder weight
{
auto tempCacheBuffer = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float));
@ -121,9 +126,13 @@ ConvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
}
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempCacheBuffer);
}
NVTX_POP();
// Copy Bias
int biasSize = conv->bias()->size();
NVTX_PUSH("cuda_conv_bias");
biasTensor.reset(Tensor::createDevice<float>({biasSize}));
bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC);
@ -159,6 +168,7 @@ ConvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
} else {
FuseRasterBlitCommon((uint8_t*)mBias, (uint8_t*)biasTemp, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime, 4);
}
NVTX_POP();
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(regionStorage);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(offsetGpuStorage);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempBiasStorage);
@ -219,6 +229,7 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,
mIm2ColParamter.srcYStep = input->width() * UNIT;
mIm2ColParamter.packCUnit = UNIT;
// hostToDevice
runtime->memcpy((uint8_t*)mGpuIm2ColParam.first + mGpuIm2ColParam.second, &mIm2ColParamter, sizeof(ConvolutionCommon::Im2ColParameter), MNNMemcpyHostToDevice);
//MNN_PRINT("conv size:%d-%d, %d-%d-%d, %d-%d-%d\n", mIm2ColParamter.kernelX, mIm2ColParamter.strideX, input->height(), input->width(), input->channel(), output->height(), output->width(), output->channel());
@ -267,7 +278,7 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,
auto buffer = pool->alloc((size_t)sizeof(__half) * (size_t)mMatMulParam.elhPack[0] * (size_t)mMatMulParam.elhPack[1] * (size_t)ePack * (size_t)hPack);
mIm2ColBuffer = (__half*)((uint8_t*)buffer.first + buffer.second);
pool->free(buffer);
return NO_ERROR;
}
@ -289,6 +300,9 @@ ErrorCode ConvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs
auto gpuIm2Col = (const ConvolutionCommon::Im2ColParameter*)((uint8_t*)mGpuIm2ColParam.first + mGpuIm2ColParam.second);
auto gpuMatMul = (const MatMulParam*)((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second);
// MNN_PRINT("onExecute bytes is:%d mUseEPack:%d, mResource->mUseHPack:%d, useFp16:%d, mBlockNum:%d,3 dimPack:{ePack:%d, lPack:%d, hPack:%d}, mMatMulParam.elh:{%d,%d,%d}, elhMultiPack:{%d,%d,%d}\n",
// bytes, mUseEPack, mResource->mUseHPack, static_cast<CUDABackend*>(backend())->useFp16(), mBlockNum, ePack, MATMULPACK, hPack, mMatMulParam.elh[0], mMatMulParam.elh[1], mMatMulParam.elh[2], mMatMulParam.elhPack[0], mMatMulParam.elhPack[1], mMatMulParam.elhPack[2]);
// Im2col in Block
for(int block_idx = 0; block_idx < mBlockNum; block_idx++) {
if(mUseEPack) {
@ -307,9 +321,11 @@ ErrorCode ConvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs
return NO_ERROR;
}
// #define USE_MNN_CONV
class CUDAConvolutionCreator : public CUDABackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
if (nullptr != op->main_as_Convolution2D()->quanParameter()) {
auto quan = op->main_as_Convolution2D()->quanParameter();
@ -321,19 +337,24 @@ public:
}
}
#ifdef USE_MNN_CONV
std::shared_ptr<ConvSingleInputExecution::Resource> resource(new ConvSingleInputExecution::Resource(backend, op));
return new ConvSingleInputExecution(backend, op, resource);
#else
auto conv = op->main_as_Convolution2D()->common();
if(ConvWinogradExecution::isValid(op->main_as_Convolution2D(), inputs[0])) {
if(ConvWinogradExecution::isValid(op->main_as_Convolution2D())) { // inputs[0] is not valid at creation time, so only the op is checked.
//printf("%dx%ds%dd%d\n", conv->kernelX(), conv->kernelY(), conv->strideX(), conv->dilateX());
std::shared_ptr<ConvWinogradExecution::Resource> resource(new ConvWinogradExecution::Resource(backend, op));
return new ConvWinogradExecution(backend, op, resource);
}
// std::shared_ptr<ConvSingleInputExecution::Resource> resource(new ConvSingleInputExecution::Resource(backend, op));
// return new ConvSingleInputExecution(backend, op, resource);
std::shared_ptr<ConvCutlassExecution::Resource> resource(new ConvCutlassExecution::Resource(backend, op));
return new ConvCutlassExecution(backend, op, resource);
#endif
}
};

View File

@ -15,24 +15,26 @@ namespace CUDA {
#define UNIT 2
__global__ void WinoWeightReorder(const float* GgGt,
half* GgGt_trans,
const int outside,
const int block,
const int co_pack,
const int ci_pack,
const int unitCi,
const int unitCo
) {
const int maxCount = outside * unitCi * unitCo;
const int maxCount = block * co_pack * ci_pack;
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += gridDim.x * blockDim.x) {
size_t outside_idx = index / (unitCi*unitCo);
size_t tmp = index % (unitCi*unitCo);
size_t ci_idx = tmp / unitCo;
size_t co_idx = tmp % unitCo;
size_t tmp = index / ci_pack;
size_t ci_idx = index % ci_pack;
// [4x4, Cop, Cip, unitCi, unitCo] -->> [4x4, Cop, Cip, unitCo, unitCi]
size_t dst_idx = outside_idx * (unitCi*unitCo) + co_idx * unitCi + ci_idx;
*(GgGt_trans + dst_idx) = *(GgGt + index);
size_t block_idx = tmp / co_pack;
size_t co_idx = tmp % co_pack;
// [4x4, Cop, Cip, unitCi, unitCo] -->> [4x4, Cop*unitCo, Cip*unitCi]
size_t src_idx = block_idx * (co_pack*ci_pack) + (co_idx/unitCo) * (ci_pack*unitCo) + (ci_idx/unitCi) * (unitCi*unitCo) + (ci_idx%unitCi) * unitCo + (co_idx%unitCo);
*(GgGt_trans + index) = *(GgGt + src_idx);
}
}
bool ConvWinogradExecution::isValid(const Convolution2D* conv, const Tensor* input) {
bool ConvWinogradExecution::isValid(const Convolution2D* conv) {
//return false;
if(conv->common()->strideX() != 1 || conv->common()->strideY() != 1) {
return false;
@ -74,7 +76,7 @@ ConvWinogradExecution::Resource::Resource(Backend* backend, const MNN::Op* op) {
std::shared_ptr<Tensor> srcWeight(Tensor::create<float>({mKernelInfo.kernelN, mKernelInfo.kernelC, mKernelInfo.kernelY, mKernelInfo.kernelX},
(void *)filterDataPtr, Tensor::CAFFE));
auto dstWeight = generator.allocTransformWeight(srcWeight.get(), MATMULPACK, MATMULPACK);
auto dstWeight = generator.allocTransformWeight(srcWeight.get(), PACK_NUMBER, PACK_NUMBER);
generator.transformWeight(dstWeight.get(), srcWeight.get());
auto dstWeightSize = dstWeight->elementSize();
@ -90,18 +92,18 @@ ConvWinogradExecution::Resource::Resource(Backend* backend, const MNN::Op* op) {
int cores = prop.multiProcessorCount;
int threadNumbers = prop.maxThreadsPerBlock;
int coDiv = UP_DIV(mKernelInfo.kernelN, MATMULPACK);
int ciDiv = UP_DIV(mKernelInfo.kernelC, MATMULPACK);
int coPack = UP_DIV(mKernelInfo.kernelN, PACK_NUMBER) * PACK_NUMBER;
int ciPack = UP_DIV(mKernelInfo.kernelC, PACK_NUMBER) * PACK_NUMBER;
WinoWeightReorder<<<cores, threadNumbers>>>((float*)cacheWeight, (half*)mFilter,
(UNIT+kernel-1) * (UNIT+kernel-1) * coDiv * ciDiv, MATMULPACK, MATMULPACK);
(UNIT+kernel-1) * (UNIT+kernel-1), coPack, ciPack, PACK_NUMBER, PACK_NUMBER);
static_cast<CUDABackend*>(backend)->getStaticBufferPool()->free(tempCacheBuffer);
}
// Copy Bias
int biasSize = conv->bias()->size();
int alignSize = UP_DIV(biasSize, MATMULPACK) * MATMULPACK;
int alignSize = UP_DIV(biasSize, PACK_NUMBER) * PACK_NUMBER;
biasTensor.reset(Tensor::createDevice<uint32_t>({alignSize}));
backend->onAcquireBuffer(biasTensor.get(), Backend::STATIC);
@ -148,23 +150,23 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector<Tensor*> &inputs, c
mPadX = std::get<0>(pads);
mPadY = std::get<1>(pads);
int ic = input->channel();
int icDiv = UP_DIV(ic, MATMULPACK);
int icDiv = UP_DIV(ic, PACK_NUMBER);
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(input);
auto wUnit = UP_DIV(output->width(), UNIT);
auto hUnit = UP_DIV(output->height(), UNIT);
int e = wUnit * hUnit * output->batch();
int l = icDiv * MATMULPACK;
int l = ic;
int h = output->channel();
mMatMulParam.elh[0] = e;
mMatMulParam.elh[1] = l;
mMatMulParam.elh[2] = h;
int ePack = MATMULPACK;
int hPack = MATMULPACK;
int ePack = PACK_NUMBER;
int hPack = PACK_NUMBER;
mMatMulParam.elhPack[0] = UP_DIV(e, ePack);
mMatMulParam.elhPack[1] = UP_DIV(l, MATMULPACK);
mMatMulParam.elhPack[1] = UP_DIV(l, PACK_NUMBER);
mMatMulParam.elhPack[2] = UP_DIV(h, hPack);
// mMatMulParam.cStride[0] = mIm2ColParamter.ow * mIm2ColParamter.oh * h;
// mMatMulParam.cStride[1] = 1;
@ -185,14 +187,87 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector<Tensor*> &inputs, c
int block = UNIT + convCommon->kernelY() - 1;
mBlock2 = block * block;
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
auto buffer = pool->alloc((size_t)sizeof(__half) * mBlock2 * mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * (size_t)ePack * (size_t)MATMULPACK);
mBtdB_Buffer = (__half*)((uint8_t*)buffer.first + buffer.second);
auto bufferData = pool->alloc((size_t)sizeof(__half) * mBlock2 * mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * (size_t)ePack * (size_t)PACK_NUMBER);
mBtdB_Buffer = (__half*)((uint8_t*)bufferData.first + bufferData.second);
auto buffer2 = pool->alloc(bytes * mBlock2 * mMatMulParam.elh[0] * mMatMulParam.elhPack[2] * (size_t)hPack);
mMatmul_Buffer = (void*)((uint8_t*)buffer2.first + buffer2.second);
auto bufferMatmul = pool->alloc(bytes * mBlock2 * mMatMulParam.elh[0] * mMatMulParam.elhPack[2] * (size_t)hPack);
mMatmul_Buffer = (void*)((uint8_t*)bufferMatmul.first + bufferMatmul.second);
pool->free(buffer);
pool->free(buffer2);
pool->free(bufferData);
pool->free(bufferMatmul);
mGemmInfo.elh[0] = e;
mGemmInfo.elh[1] = l;
mGemmInfo.elh[2] = h;
mGemmInfo.elhPad[0] = UP_DIV(e, 8) * 8;
mGemmInfo.elhPad[1] = UP_DIV(l, 8) * 8;
mGemmInfo.elhPad[2] = UP_DIV(h, 8) * 8;
ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
ElementComputeEpilogue beta = ElementComputeEpilogue(0);
// Split K dimension into 1 partitions
cutlass::gemm::GemmCoord problem_size(mGemmInfo.elh[0], mGemmInfo.elhPad[2], mGemmInfo.elhPad[1]);// m n k
//MNN_PRINT("Winograd BatchGemm batch:%d, MNK:%d-%d-%d\n", mBlock2, mGemmInfo.elh[0], mGemmInfo.elhPad[2], mGemmInfo.elhPad[1]);
if(bytes == 2) {
typename GemmBatched_F16_Linear_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInputA *)mBtdB_Buffer, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]), // batch_stride_A
{(ElementInputB *)mResource->mFilter, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elhPad[2]), // batch_stride_B
{(ElementOutput_F16 *)mResource->mBias, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F16 *)mMatmul_Buffer, mGemmInfo.elhPad[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBlock2}; // batch_count
size_t workspace_size = GemmBatched_F16_Linear_Sm75::get_workspace_size(arguments);
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
pool->free(bufferWs);
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF16LnSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF16LnSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
} else {
typename GemmBatched_F32_Linear_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
{(ElementInputA *)mBtdB_Buffer, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]), // batch_stride_A
{(ElementInputB *)mResource->mFilter, mGemmInfo.elhPad[1]}, // Ptr + ldm
(int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elhPad[2]), // batch_stride_B
{(ElementOutput_F32 *)mResource->mBias, 0}, // Ptr + ldm if ldm = 0, vector,
(int64_t)(0), // batch_stride_bias
{(ElementOutput_F32 *)mMatmul_Buffer, mGemmInfo.elhPad[2]}, // Ptr + ldm
(int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[2]), // batch_stride_C
{alpha, beta}, // <- tuple of alpha and beta
mBlock2}; // batch_count
size_t workspace_size = GemmBatched_F32_Linear_Sm75::get_workspace_size(arguments);
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
pool->free(bufferWs);
// Check the problem size is supported or not
cutlass::Status status = mGemmBatchedF32LnSm75.can_implement(arguments);
cutlass_check(status);
// Initialize CUTLASS kernel with arguments and workspace pointer
status = mGemmBatchedF32LnSm75.initialize(arguments, (uint8_t *)mWorkspace);
cutlass_check(status);
}
return NO_ERROR;
}
@ -205,8 +280,8 @@ ErrorCode ConvWinogradExecution::onExecute(const std::vector<Tensor*> &inputs, c
int threadNumbers = prop.maxThreadsPerBlock / 2;
auto gpuMatMul = (const MatMulParam*)((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second);
int coDiv = UP_DIV(mResource->mKernelInfo.kernelN, MATMULPACK);
int ciDiv = UP_DIV(mResource->mKernelInfo.kernelC, MATMULPACK);
int co_pack = UP_DIV(mResource->mKernelInfo.kernelN, PACK_NUMBER) * PACK_NUMBER;
int ci_pack = UP_DIV(mResource->mKernelInfo.kernelC, PACK_NUMBER) * PACK_NUMBER;
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(input);
const void *input_addr = (const void*)input->deviceId();
@ -215,16 +290,23 @@ ErrorCode ConvWinogradExecution::onExecute(const std::vector<Tensor*> &inputs, c
void *output_addr = (void*)output->deviceId();
const int kernel = 3;
const int wUnit = UP_DIV(input->width(), UNIT);
const int hUnit = UP_DIV(input->height(), UNIT);
DivModFast lD(ci_pack);
DivModFast hD(co_pack);
DivModFast whD(wUnit * hUnit);
DivModFast wD(wUnit);
if(bytes == 4) {
WinoInputTrans<<<cores, threadNumbers>>>((const float*)input_addr, (half*)mBtdB_Buffer, UNIT,
(UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ciDiv, output->batch(), UP_DIV(input->width(), UNIT),
UP_DIV(input->height(), UNIT), MATMULPACK, MATMULPACK,
(UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack,
mMatMulParam.elh[0] * ci_pack, lD, whD, wD,
mPadX, mPadY, input->width(), input->height());
checkKernelErrors;
} else {
WinoInputTrans<<<cores, threadNumbers>>>((const half*)input_addr, (half*)mBtdB_Buffer, UNIT,
(UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ciDiv, output->batch(), UP_DIV(input->width(), UNIT),
UP_DIV(input->height(), UNIT), MATMULPACK, MATMULPACK,
(UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack,
mMatMulParam.elh[0] * ci_pack, lD, whD, wD,
mPadX, mPadY, input->width(), input->height());
checkKernelErrors;
}
@ -235,41 +317,27 @@ ErrorCode ConvWinogradExecution::onExecute(const std::vector<Tensor*> &inputs, c
int iBlock = 0;
if (4 == bytes) {
cudaFuncSetAttribute(GemmPackedMulti<float>, cudaFuncAttributeMaxDynamicSharedMemorySize, prop.sharedMemPerMultiprocessor);
GemmPackedMulti<<<cores, threads_num, basicMemory>>>(gpuMatMul, iBlock, mBlock2, (float*)mMatmul_Buffer, mBtdB_Buffer, (const __half*)mGgGt_Buffer);
checkKernelErrors;
cutlass::Status status = mGemmBatchedF32LnSm75();
cutlass_check(status);
} else {
//MNN_PRINT("%d - %d, %d- %d\n", cpuParam->elhPack[0], cpuParam->elhPack[2], cpuParam->elh[0], cpuParam->elh[2]);
cudaFuncSetAttribute(GemmPackedMulti<half>, cudaFuncAttributeMaxDynamicSharedMemorySize, prop.sharedMemPerMultiprocessor);
GemmPackedMulti<<<cores, threads_num, basicMemory>>>(gpuMatMul, iBlock, mBlock2, (half*)mMatmul_Buffer, mBtdB_Buffer, (const __half*)mGgGt_Buffer);
checkKernelErrors;
cutlass::Status status = mGemmBatchedF16LnSm75();
cutlass_check(status);
}
if (4 == bytes) {
WinoTrans2Output<<<cores, threadNumbers>>>((const float*)mMatmul_Buffer, (const float*)bias_addr, (float*)output_addr,
gpuMatMul, UNIT,
mBlock2, output->channel(), ciDiv, output->batch(), UP_DIV(output->width(), UNIT),
UP_DIV(output->height(), UNIT), MATMULPACK, MATMULPACK,
gpuMatMul, UNIT, mBlock2, output->channel(), co_pack,
mMatMulParam.elh[0] * co_pack, hD, whD, wD,
output->width(), output->height());
checkKernelErrors;
} else {
WinoTrans2Output<<<cores, threadNumbers>>>((const half*)mMatmul_Buffer, (const float*)bias_addr, (half*)output_addr,
gpuMatMul, UNIT,
mBlock2, output->channel(), ciDiv, output->batch(), UP_DIV(output->width(), UNIT),
UP_DIV(output->height(), UNIT), MATMULPACK, MATMULPACK,
gpuMatMul, UNIT, mBlock2, output->channel(), co_pack,
mMatMulParam.elh[0] * co_pack, hD, whD, wD,
output->width(), output->height());
checkKernelErrors;
}
// if(output->width() == 56 && output->channel() == 64 && input->channel() == 64) {
// cudaDeviceSynchronize();
// float bias_[mMatMulParam.elhPack[2] * 16];
// runtime->memcpy((void*)bias_, bias_addr, mMatMulParam.elhPack[2] * 16*sizeof(float), MNNMemcpyDeviceToHost);
// for(int i=0; i<mMatMulParam.elhPack[2] * 16; i++) {
// printf("%d-%f\n", i, bias_[i]);
// }
// }
return NO_ERROR;
}

View File

@ -11,6 +11,9 @@
#include "ConvSingleInputExecution.hpp"
#include "TensorCoreGemmPacked.cuh"
#include "CutlassGemmParam.hpp"
#include "MNNCUDADefine.hpp"
#include "MNNCUDAFunction.cuh"
namespace MNN {
namespace CUDA {
@ -18,7 +21,7 @@ namespace CUDA {
class ConvWinogradExecution : public Execution {
public:
struct Resource;
static bool isValid(const Convolution2D* conv, const Tensor* input);
static bool isValid(const Convolution2D* conv);
ConvWinogradExecution(Backend* backend, const MNN::Op* op, std::shared_ptr<Resource> res);
virtual ~ConvWinogradExecution();
@ -45,6 +48,13 @@ private:
void* mMatmul_Buffer;
MatMulParam mMatMulParam;
std::pair<void*, int> mGpuMatMulParam;
GemmBatched_F16_Linear_Sm75 mGemmBatchedF16LnSm75;
GemmBatched_F32_Linear_Sm75 mGemmBatchedF32LnSm75;
std::shared_ptr<Tensor> workspaceTensor;
uint8_t* mWorkspace;
CutlassGemmInfo mGemmInfo;
int mPadX;
int mPadY;

View File

@ -11,10 +11,17 @@
#include "cutlass/epilogue/thread/linear_combination_relu.h"
#include "cutlass/epilogue/thread/linear_combination_relu6.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/gemm/device/gemm_array.h"
#include "cutlass/gemm/device/gemm_batched.h"
namespace MNN {
namespace CUDA {
struct CutlassGemmInfo{
int elh[3];
int elhPad[3];
};
using ElementAccumulator = float; // <- data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations
using ElementInputA = cutlass::half_t; // <- data type of elements in input matrix A
@ -37,9 +44,9 @@ using SmArch75 = cutlass::arch::Sm75;
// This code section describes the tile size a thread block will compute
using ShapeMMAThreadBlock =
cutlass::gemm::GemmShape<64, 64, 32>; // <- threadblock tile M = 128, N = 256, K = 64
cutlass::gemm::GemmShape<64, 64, 64>; // <- threadblock tile M = 64, N = 64, K = 64
// This code section describes tile size a warp will compute
using ShapeMMAWarp = cutlass::gemm::GemmShape<32, 32, 32>; // <- warp tile M = 64, N = 64, K = 64
using ShapeMMAWarp = cutlass::gemm::GemmShape<32, 32, 64>; // <- warp tile M = 32, N = 32, K = 64
// This code section describes the size of MMA op
using ShapeMMAOp1688 = cutlass::gemm::GemmShape<16, 8, 8>; // <- MMA Op tile M = 16, N = 8, K = 8 (Turing Tensor Core)
using ShapeMMAOp884 = cutlass::gemm::GemmShape<8, 8, 4>; // <- MMA Op tile M = 8, N = 8, K = 4 (Volta Tensor Core)
@ -300,6 +307,45 @@ using Gemm_F32_Relu6_Sm75 = cutlass::gemm::device::Gemm<ElementInputA,
SwizzleThreadBlock,
NumStages>;
// This code section describes how threadblocks are scheduled on GPU
using BatchedSwizzleThreadBlock = cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle; // <- identity threadblock swizzle for batched GEMM
using ShapeBatchMMAThreadBlock =
cutlass::gemm::GemmShape<64, 64, 64>; // <- threadblock tile M = 64, N = 64, K = 64
// This code section describes tile size a warp will compute
using ShapeBatchMMAWarp = cutlass::gemm::GemmShape<16, 64, 64>; // <- warp tile M = 16, N = 64, K = 64
using GemmBatched_F16_Linear_Sm75 = cutlass::gemm::device::GemmBatched<ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput_F16,
LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch75,
ShapeBatchMMAThreadBlock,
ShapeBatchMMAWarp,
ShapeMMAOp1688,
EpilogueOp_F16_Linear,
BatchedSwizzleThreadBlock,
NumStages>;
using GemmBatched_F32_Linear_Sm75 = cutlass::gemm::device::GemmBatched<ElementInputA,
LayoutInputA,
ElementInputB,
LayoutInputB,
ElementOutput_F32,
LayoutOutput,
ElementAccumulator,
MMAOp,
SmArch75,
ShapeBatchMMAThreadBlock,
ShapeBatchMMAWarp,
ShapeMMAOp1688,
EpilogueOp_F32_Linear,
BatchedSwizzleThreadBlock,
NumStages>;
} // namespace CUDA
} // namespace MNN
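A quick sanity check on the batched tile shapes declared above (a standalone snippet under stated assumptions, not part of the header): the 16x64x64 warp tile partitions the 64x64x64 threadblock tile four ways along M, giving 4 warps, i.e. 128 threads, per threadblock.

// Standalone check that the warp tile divides the threadblock tile (illustrative only).
static_assert(64 % 16 == 0 && 64 % 64 == 0 && 64 % 64 == 0,
              "warp tile must evenly divide the threadblock tile");
constexpr int kWarpsPerBlock   = (64 / 16) * (64 / 64);   // 4 warps: M split 4x, N split 1x
constexpr int kThreadsPerBlock = kWarpsPerBlock * 32;     // 128 threads per threadblock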

View File

@ -536,7 +536,7 @@ __global__ void Binary##Name(\
int sizeZ, int sizeY, int sizeX,\
int strideZ, int strideY, int strideX,\
int strideZ1, int strideY1, int strideX1,\
int dstStrideZ, int dstStrideY, int dstStrideX\
int dstStrideZ, int dstStrideY, int dstStrideX, int activationType\
) { \
int count = sizeZ * sizeY * sizeX;\
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
@ -550,7 +550,11 @@ __global__ void Binary##Name(\
int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
TIn x = input0[srcOffset];\
TIn y = input1[srcOffset1];\
output[dstOffset] = (TOut)Func;\
TOut val = (TOut)(Func);\
if(activationType == 1) {\
val = (val < (TOut)0 ? (TOut)0 : val);\
}\
output[dstOffset] = val;\
}\
}\
@ -561,7 +565,7 @@ __global__ void BinaryMid##Name(\
int sizeZ, int sizeY, int sizeX,\
int strideZ, int strideY, int strideX,\
int strideZ1, int strideY1, int strideX1,\
int dstStrideZ, int dstStrideY, int dstStrideX\
int dstStrideZ, int dstStrideY, int dstStrideX, int activationType\
) { \
int count = sizeZ * sizeY * sizeX;\
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
@ -575,7 +579,11 @@ __global__ void BinaryMid##Name(\
int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
float x = input0[srcOffset];\
float y = input1[srcOffset1];\
output[dstOffset] = (TOut)(Func);\
TOut val = (TOut)(Func);\
if(activationType == 1) {\
val = (val < (TOut)0 ? (TOut)0 : val);\
}\
output[dstOffset] = val;\
}\
}\
template<typename TIn, typename TOut>\
@ -584,7 +592,8 @@ __global__ void BinaryMidLinear##Name(\
int sizeZ,\
int strideZ,\
int strideZ1,\
int dstStrideZ\
int dstStrideZ,\
int activationType\
) { \
int count = sizeZ;\
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
@ -594,7 +603,11 @@ __global__ void BinaryMidLinear##Name(\
int dstOffset = iz * dstStrideZ;\
float x = input0[srcOffset];\
float y = input1[srcOffset1];\
output[dstOffset] = (TOut)(Func);\
TOut val = (TOut)(Func);\
if(activationType == 1) {\
val = (val < (TOut)0 ? (TOut)0 : val);\
}\
output[dstOffset] = val;\
}\
}\
@ -602,7 +615,7 @@ __global__ void BinaryMidLinear##Name(\
template<typename TIn, typename TOut>\
__global__ void BinaryMidLinear4_##Name(\
const TIn *input0, const TIn* input1, TOut *output,\
int count_4\
int count_4, int activationType\
) { \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count_4); i += blockDim.x * gridDim.x) {\
int iz = i;\
@ -613,22 +626,38 @@ __global__ void BinaryMidLinear4_##Name(\
float4 yy = ((float4 *)(input1+srcOffset1))[0];\
float x = xx.x;\
float y = yy.x;\
output[dstOffset] = (TOut)(Func);\
TOut val = (TOut)(Func);\
if(activationType == 1) {\
val = (val < (TOut)0 ? (TOut)0 : val);\
}\
output[dstOffset] = val;\
x = xx.y;\
y = yy.y;\
output[dstOffset+1] = (TOut)(Func);\
val = (TOut)(Func);\
if(activationType == 1) {\
val = (val < (TOut)0 ? (TOut)0 : val);\
}\
output[dstOffset+1] = val;\
x = xx.z;\
y = yy.z;\
output[dstOffset+2] = (TOut)(Func);\
val = (TOut)(Func);\
if(activationType == 1) {\
val = (val < (TOut)0 ? (TOut)0 : val);\
}\
output[dstOffset+2] = val;\
x = xx.w;\
y = yy.w;\
output[dstOffset+3] = (TOut)(Func);\
val = (TOut)(Func);\
if(activationType == 1) {\
val = (val < (TOut)0 ? (TOut)0 : val);\
}\
output[dstOffset+3] = val;\
}\
}\
template<typename TIn, typename TOut>\
__global__ void BinaryMidLinearHalf4_##Name(\
const TIn *input0, const TIn* input1, TOut *output,\
int count_4\
int count_4, int activationType\
) { \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count_4); i += blockDim.x * gridDim.x) {\
int iz = i;\
@ -639,18 +668,34 @@ __global__ void BinaryMidLinearHalf4_##Name(\
half2 yy = ((half2 *)(input1+srcOffset1))[0];\
float x = (float)xx.x;\
float y = (float)yy.x;\
output[dstOffset] = (TOut)(Func);\
TOut val = (TOut)(Func);\
if(activationType == 1) {\
val = (val < (TOut)0 ? (TOut)0 : val);\
}\
output[dstOffset] = val;\
x = (float)xx.y;\
y = (float)yy.y;\
output[dstOffset+1] = (TOut)(Func);\
val = (TOut)(Func);\
if(activationType == 1) {\
val = (val < (TOut)0 ? (TOut)0 : val);\
}\
output[dstOffset+1] = val;\
xx = ((half2 *)(input0+srcOffset))[1];\
yy = ((half2 *)(input1+srcOffset1))[1];\
x = (float)xx.x;\
y = (float)yy.x;\
output[dstOffset+2] = (TOut)(Func);\
val = (TOut)(Func);\
if(activationType == 1) {\
val = (val < (TOut)0 ? (TOut)0 : val);\
}\
output[dstOffset+2] = val;\
x = (float)xx.y;\
y = (float)yy.y;\
output[dstOffset+3] = (TOut)(Func);\
val = (TOut)(Func);\
if(activationType == 1) {\
val = (val < (TOut)0 ? (TOut)0 : val);\
}\
output[dstOffset+3] = val;\
}\
}\
@ -720,7 +765,7 @@ BINARY_FUNC_FLOATMID4(MOD, fmod(x, y));
BINARY_FUNC_FLOATMID4(LOGICALOR, (x || y) ? 1 : 0);
template<typename T>
void BinaryBlitTemplateFloat(T* output, const T* input, const T* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
void BinaryBlitTemplateFloat(T* output, const T* input, const T* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType, int activationType) {
int count = size[0] * size[1] * size[2];
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
@ -732,24 +777,25 @@ void BinaryBlitTemplateFloat(T* output, const T* input, const T* input1, const i
threads_num = runtime->threads_num();\
if(bytes == 4) {\
BinaryMidLinear4_##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
count/4);\
count/4, activationType);\
} else {\
BinaryMidLinearHalf4_##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
count/4);\
count/4, activationType);\
}\
} else {\
BinaryMidLinear##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
size[2],\
srcStride[2],\
srcStride1[2],\
dstStride[2]);\
dstStride[2],\
activationType);\
}\
} else {\
BinaryMid##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
size[0], size[1], size[2],\
srcStride[0], srcStride[1], srcStride[2],\
srcStride1[0], srcStride1[1], srcStride1[2],\
dstStride[0], dstStride[1], dstStride[2]);\
dstStride[0], dstStride[1], dstStride[2], activationType);\
}\
return;\
}\
@ -777,7 +823,7 @@ void BinaryBlitTemplateFloat(T* output, const T* input, const T* input1, const i
#undef COMPUTE_FLOAT
}
void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType, int activationType) {
int count = size[0] * size[1] * size[2];
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
@ -787,7 +833,7 @@ void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_
size[0], size[1], size[2],\
srcStride[0], srcStride[1], srcStride[2],\
srcStride1[0], srcStride1[1], srcStride1[2],\
dstStride[0], dstStride[1], dstStride[2]);\
dstStride[0], dstStride[1], dstStride[2], activationType);\
return;\
}\
@ -809,15 +855,15 @@ void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_
}
void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType) {
void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType, int activationType) {
if (type.code == halide_type_float) {
if (type.bits == 32) {
BinaryBlitTemplateFloat((float*)output, (float*)input, (float*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
BinaryBlitTemplateFloat((float*)output, (float*)input, (float*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType, activationType);
} else if (type.bits == 16) {
BinaryBlitTemplateFloat((half*)output, (half*)input, (half*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
BinaryBlitTemplateFloat((half*)output, (half*)input, (half*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType, activationType);
}
} else if (type.code == halide_type_int) {
BinaryBlitTemplateInt32(output, input, input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
BinaryBlitTemplateInt32(output, input, input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType, activationType);
}
}
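The activationType argument threaded through every kernel above fuses an optional ReLU into the binary op. Its per-element semantics, written out as a plain host-side helper for reference (the helper name is illustrative, not part of the source):

// Reference semantics of the fused activation step: 0 = identity, 1 = ReLU.
template <typename T>
inline T applyBinaryActivation(T value, int activationType) {
    if (activationType == 1) {
        return value < T(0) ? T(0) : value;   // clamp negatives to zero
    }
    return value;
}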

View File

@ -8,7 +8,7 @@ namespace CUDA {
void FuseRasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int fuseNum, void* sliceOffset, int bytes, CUDARuntime* runtime, int offsetunit);
void BlitWithIndice(uint8_t* dest, const uint8_t* src, const int32_t* dstIndices, const int32_t* srcIndices, int dstUseIndice, int srcUseIndice, int loopCount, int dstStep, int srcStep, int srcLimit, const Tensor::InsideDescribe::Region& reg, int bytes, CUDARuntime* runtime);
void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType);
void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType);
void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType, int activationType = 0);
// Offset: 8 * fuseNum, first 4 for src: limitX, limitY, limitZ, offset, second 4 for dst
struct FuseRegion {

View File

@ -145,6 +145,59 @@ static int _singleConvert(const Tensor::InsideDescribe::Region& region, const Te
return 1;
}
static bool _equalSizeStride(const Tensor::InsideDescribe::Region& slice0, const Tensor::InsideDescribe::Region& slice1) {
if (slice0.src.stride[0] != slice1.src.stride[0] || slice0.dst.stride[0] != slice1.dst.stride[0]) {
//MNN_PRINT("Raster total:%d, index:%d, src stride0:%d-%d, , dst stride0:%d-%d\n", mTempInputCopy.size(), i, slice.src.stride[0], slice0.src.stride[0], slice.dst.stride[0], slice0.dst.stride[0]);
return false;
}
if (slice0.src.stride[1] != slice1.src.stride[1] || slice0.dst.stride[1] != slice1.dst.stride[1]) {
//MNN_PRINT("Raster total:%d, index:%d, src stride1:%d-%d, , dst stride1:%d-%d\n", mTempInputCopy.size(), i, slice.src.stride[1], slice0.src.stride[1], slice.dst.stride[1], slice0.dst.stride[1]);
return false;
}
if (slice0.src.stride[2] != slice1.src.stride[2] || slice0.dst.stride[2] != slice1.dst.stride[2]) {
//MNN_PRINT("Raster total:%d, index:%d, src stride2:%d-%d, , dst stride2:%d-%d\n", mTempInputCopy.size(), i, slice.src.stride[2], slice0.src.stride[2], slice.dst.stride[2], slice0.dst.stride[2]);
return false;
}
if (slice0.size[0] != slice1.size[0] || slice0.size[1] != slice1.size[1] || slice0.size[2] != slice1.size[2]) {
//MNN_PRINT("Raster total:%d, index:%d, copy size:%d-%d-%d, %d-%d-%d\n", mTempInputCopy.size(), i, slice.size[0], slice.size[1], slice.size[2], slice0.size[0], slice0.size[1], slice0.size[2]);
return false;
}
return true;
}
static bool _directBlitC4(const Tensor::InsideDescribe::Region& slice0, const Tensor::InsideDescribe::Region& slice1) {
if(slice0.size[1] % PACK_NUMBER != 0 || slice0.size[0] != 1) {
return false;
}
if(slice1.size[1] % PACK_NUMBER != 0 || slice1.size[0] != 1) {
return false;
}
if(slice0.dst.offset % (slice0.size[1] * slice0.size[0]) != 0) {
return false;
}
if(slice1.dst.offset % (slice1.size[1] * slice1.size[0]) != 0) {
return false;
}
return _equalSizeStride(slice0, slice1);
}
static void _turnToNewRegion(const Tensor::InsideDescribe::Region& region, Tensor::InsideDescribe::Region& newRegion, int multiStride) {
newRegion.size[0] = region.size[0];
newRegion.size[1] = region.size[2];
newRegion.size[2] = region.size[1];
newRegion.src.stride[0] = region.src.stride[0];
newRegion.src.stride[1] = region.src.stride[2] * region.size[1];
newRegion.src.stride[2] = region.src.stride[1] / region.size[2];
newRegion.dst.stride[0] = region.dst.stride[0] * multiStride;
newRegion.dst.stride[1] = region.dst.stride[2] * region.size[1] * multiStride;
newRegion.dst.stride[2] = region.dst.stride[1] / region.size[2];
newRegion.src.offset = region.src.offset / region.size[2];
newRegion.dst.offset = region.dst.offset / region.size[2];
}
ErrorCode RasterExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(inputs.size() == 1);
MNN_ASSERT(outputs.size() == 1);
@ -157,7 +210,52 @@ ErrorCode RasterExecution::onResize(const std::vector<Tensor *> &inputs, const s
mTempInput.clear();
mTempOutput = nullptr;
mOutputPtr = output;
mFast = false;
int pack = PACK_NUMBER;
// all_srcFormat == dstFormat == NC4HW4 : Fast Exe
if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
mFast = true;
auto& slice0 = des->regions[0];
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
//MNN_PRINT("%d-%d-%d, %d-%d-%d-%d\n", slice.size[0], slice.size[1], slice.size[2], slice.src.stride[1], slice.src.stride[2], slice.dst.stride[1], slice.dst.stride[2]);
if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
mFast = false;
break;
}
if(!_directBlitC4(slice0, slice)) {
mFast = false;
break;
}
if (!OpCommonUtils::canBlitFast(slice, output, pack, false, true)) {
mFast = false;
break;
}
}
//MNN_PRINT("raster fast:%d\n", mFast);
if (mFast) {
int multiStride = 1;
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if(slice.dst.offset / (slice.size[0] * slice.size[1]) >= 1) {
int batchChannel = slice.dst.offset / (slice.size[1] * slice.size[2]) + 1;
multiStride = multiStride > batchChannel ? multiStride : batchChannel;
}
}
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (slice.origin == nullptr) {
continue;
}
Tensor::InsideDescribe::Region newRegion;
_turnToNewRegion(slice, newRegion, multiStride);
mFastBlit.emplace_back(std::make_pair(slice.origin, std::move(newRegion)));
}
return NO_ERROR;
}
}
mSingleConvert = 0;
// srcNum == 1 && srcFormat != dstFormat : Single Convert
@ -228,25 +326,8 @@ ErrorCode RasterExecution::onResize(const std::vector<Tensor *> &inputs, const s
//MNN_PRINT("Raster total:%d, index:%d, origin:%p-%p\n", mTempInputCopy.size(), i, mTempInputCopy[i].first, mTempInputCopy[0].first);
break;
}
if (slice0.src.stride[0] != slice.src.stride[0] || slice0.dst.stride[0] != slice.dst.stride[0]) {
//MNN_PRINT("Raster total:%d, index:%d, src stride0:%d-%d, , dst stride0:%d-%d\n", mTempInputCopy.size(), i, slice.src.stride[0], slice0.src.stride[0], slice.dst.stride[0], slice0.dst.stride[0]);
if(!_equalSizeStride(slice0, slice)) {
mFuseRaster.first = 0;
break;
}
if (slice0.src.stride[1] != slice.src.stride[1] || slice0.dst.stride[1] != slice.dst.stride[1]) {
//MNN_PRINT("Raster total:%d, index:%d, src stride1:%d-%d, , dst stride1:%d-%d\n", mTempInputCopy.size(), i, slice.src.stride[1], slice0.src.stride[1], slice.dst.stride[1], slice0.dst.stride[1]);
mFuseRaster.first = 0;
break;
}
if (slice0.src.stride[2] != slice.src.stride[2] || slice0.dst.stride[2] != slice.dst.stride[2]) {
//MNN_PRINT("Raster total:%d, index:%d, src stride2:%d-%d, , dst stride2:%d-%d\n", mTempInputCopy.size(), i, slice.src.stride[2], slice0.src.stride[2], slice.dst.stride[2], slice0.dst.stride[2]);
mFuseRaster.first = 0;
break;
}
if (slice0.size[0] != slice.size[0] || slice0.size[1] != slice.size[1] || slice0.size[2] != slice.size[2]) {
//MNN_PRINT("Raster total:%d, index:%d, copy size:%d-%d-%d, %d-%d-%d\n", mTempInputCopy.size(), i, slice.size[0], slice.size[1], slice.size[2], slice0.size[0], slice0.size[1], slice0.size[2]);
mFuseRaster.first = 0;
break;
}
}
}
@ -284,7 +365,29 @@ ErrorCode RasterExecution::onResize(const std::vector<Tensor *> &inputs, const s
return NO_ERROR;
}
void RasterExecution::executeFaster(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) const {
auto bn = static_cast<CUDABackend*>(backend());
auto input = inputs[0];
auto output = outputs[0];
auto bytes = bn->getBytes(output);
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
if (mNeedZero) {
auto size = static_cast<CUDABackend*>(backend())->realSize(output) * bytes;
cudaMemset((uint8_t*)output->deviceId(), 0, size);
}
// Use mFastBlit
for (auto& iter : mFastBlit) {
auto srcPtr = (uint8_t*)iter.first->deviceId() + iter.second.src.offset * bytes;
auto dstPtr = (uint8_t*)output->deviceId() + iter.second.dst.offset * bytes;
RasterBlit(dstPtr, srcPtr, iter.second.size, iter.second.src.stride, iter.second.dst.stride, bytes, runtime);
}
}
ErrorCode RasterExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
if (mFast) {
executeFaster(inputs, outputs);
return NO_ERROR;
}
auto bn = static_cast<CUDABackend*>(backend());
auto input = inputs[0];
auto output = outputs[0];

View File

@ -27,7 +27,7 @@ __global__ void GemmPackedFull(const MatMulParam* param, const int iBlock, T *c,
wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major>
b_frag;
wmma::fragment<wmma::accumulator, 16, 16, 16, T> acc_frag;
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
size_t subIndex = index / warpSize;
size_t warpM = subIndex % eU;
@ -39,9 +39,6 @@ __global__ void GemmPackedFull(const MatMulParam* param, const int iBlock, T *c,
//printf("GemmPacked: %d - %d - %d, numele: %d, %d\n", eU, lU, hU, a_frag.num_elements, b_frag.num_elements);
// MLA
for (size_t i = 0; i < lU; ++i) {
half* aTemp = ((half *)(aStart+i*256));//aStart + (i << 8) + (laneId << 1);
half* bTemp = ((half *)(bStart+i*256));//bStart + (i << 8) + (laneId << 1);
wmma::load_matrix_sync(a_frag, aStart + i * 256, 16);
wmma::load_matrix_sync(b_frag, bStart + i * 256, 16);
wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
@ -99,7 +96,10 @@ __global__ void GemmPackedFull16x32(const MatMulParam* param, const int iBlock,
size_t wrapId = threadIdx.x / warpSize;
size_t laneId = threadIdx.x % warpSize;
extern __shared__ float sharedMemory[];
T* cache = (T*)(sharedMemory + wrapId * 16 * 32);
constexpr int row = 17;
constexpr int conflict_free_size = 24;
constexpr int offset = 16;
T* cache = (T*)(sharedMemory + wrapId * conflict_free_size * (16 + row));
for (size_t index = blockIdx.x * threadCount + wrapId; index < maxCount; index += gridDim.x * threadCount) {
size_t warpM = index % eU;
size_t warpN = index / eU;
@ -143,18 +143,18 @@ __global__ void GemmPackedFull16x32(const MatMulParam* param, const int iBlock,
T* dst0 = (T*)(c + warpN * 32 * (size_t)param->elh[0] + eSta * 16);
T* dst1 = (T*)(c + (warpN * 32 + 16) * (size_t)param->elh[0] + eSta * 16);
// First 8x32
wmma::store_matrix_sync(cache, MC0, 16, wmma::mem_row_major);
wmma::store_matrix_sync(cache, MC0, conflict_free_size, wmma::mem_row_major);
// Second 8x32
wmma::store_matrix_sync(cache + 256, MC1, 16, wmma::mem_row_major);
wmma::store_matrix_sync(cache + 16 * conflict_free_size + offset, MC1, conflict_free_size, wmma::mem_row_major);
auto dst = dst0;
auto src = cache;
if (laneId >= 16) {
dst = dst1;
src = cache + 256;
src = cache + 16 * conflict_free_size + offset;
}
size_t x = laneId % 16;
for (size_t y = 0; y < eC; ++y) {
dst[y * 16 + x] = src[y * 16 + x];
dst[y * 16 + x] = src[y * conflict_free_size + x];
}
}
}
@ -169,7 +169,10 @@ __global__ void GemmPackedFull32x16(const MatMulParam* param, const int iBlock,
size_t wrapId = threadIdx.x / warpSize;
size_t laneId = threadIdx.x % warpSize;
extern __shared__ float sharedMemory[];
T* cache = (T*)(sharedMemory + wrapId * 32 * 16);
constexpr int row = 17;
constexpr int conflict_free_size = 24;
constexpr int offset = 16;
T* cache = (T*)(sharedMemory + wrapId * (16 + row) * conflict_free_size);
for (size_t index = blockIdx.x * threadCount + wrapId; index < maxCount; index += gridDim.x * threadCount) {
size_t warpN = index % hU;
size_t warpM = index / hU;
@ -216,18 +219,18 @@ __global__ void GemmPackedFull32x16(const MatMulParam* param, const int iBlock,
T* dst0 = (T*)(c + warpN * 16 * (size_t)param->elh[0] + eSta * 16);
T* dst1 = (T*)(dst0 + 256);
// First 8x32
wmma::store_matrix_sync(cache, MC0, 16, wmma::mem_row_major);
wmma::store_matrix_sync(cache, MC0, conflict_free_size, wmma::mem_row_major);
// Second 8x32
wmma::store_matrix_sync(cache + 256, MC1, 16, wmma::mem_row_major);
wmma::store_matrix_sync(cache + 16 * conflict_free_size + offset, MC1, conflict_free_size, wmma::mem_row_major);
auto dst = dst0;
auto src = cache;
if (laneId >= 16) {
dst = dst1;
src = cache + 256;
src = cache + 16 * conflict_free_size + offset;
}
size_t x = laneId % 16;
for (size_t y = 0; y < eC; ++y) {
dst[y * 16 + x] = src[y * 16 + x];
dst[y * 16 + x] = src[y * conflict_free_size + x];
}
}
}
@ -261,10 +264,14 @@ void GemmPacked16x32(CUDARuntime* runtime, const MatMulParam* cpuParam, const Ma
{
int hUP = cpuParam->elhPack[2];
int maxThreadInWarp = UP_DIV(cpuParam->elhPack[0] * hUP, cores);
int threads_num = ALIMIN(512, maxThreadInWarp * prop.warpSize);
constexpr int max_threadblock = 512;
int threads_num = ALIMIN(max_threadblock, maxThreadInWarp * prop.warpSize);
//MNN_PRINT("GemmPacked16x32%d-%d-%d-%d-%d\n\n", hUP, cpuParam->elhPack[0], cpuParam->elhPack[2], cpuParam->elhPack[0]*cpuParam->elhPack[2], threads_num);
threads_num = ALIMIN(prop.maxThreadsPerBlock, threads_num);
int basicMemory = 32 * 16 * sizeof(float) * (threads_num / prop.warpSize);
constexpr int row = 17;
constexpr int conflict_free_size = 24;
int basicMemory = (16 + row) * conflict_free_size * sizeof(float) * (threads_num / prop.warpSize);
// MNN_PRINT("GemmPacked16x32 basicMemory byte size:%d\n", basicMemory);
if (4 == bytes) {
cudaFuncSetAttribute(GemmPackedFull16x32<float>, cudaFuncAttributeMaxDynamicSharedMemorySize, basicMemory);
GemmPackedFull16x32<<<cores, threads_num, basicMemory>>>(param, iBlock, (float*)c, a, b, (float*)biasPtr);
@ -284,10 +291,13 @@ void GemmPacked32x16(CUDARuntime* runtime, const MatMulParam* cpuParam, const Ma
{
int eUP = cpuParam->elhPack[0];
int maxThreadInWarp = UP_DIV(eUP * cpuParam->elhPack[2], cores);
int threads_num = ALIMIN(512, maxThreadInWarp * prop.warpSize);
constexpr int max_threadblock = 512;
int threads_num = ALIMIN(max_threadblock, maxThreadInWarp * prop.warpSize);
//MNN_PRINT("GemmPacked32x16%d-%d-%d-%d-%d\n\n", eUP, cpuParam->elhPack[0], cpuParam->elhPack[2], cpuParam->elhPack[0]*cpuParam->elhPack[2], threads_num);
threads_num = ALIMIN(prop.maxThreadsPerBlock, threads_num);
int basicMemory = 32 * 16 * sizeof(float) * (threads_num / prop.warpSize);
constexpr int row = 17;
constexpr int conflict_free_size = 24;
int basicMemory = (16 + row) * conflict_free_size * sizeof(float) * (threads_num / prop.warpSize);
if (4 == bytes) {
cudaFuncSetAttribute(GemmPackedFull32x16<float>, cudaFuncAttributeMaxDynamicSharedMemorySize, basicMemory);
GemmPackedFull32x16<<<cores, threads_num, basicMemory>>>(param, iBlock, (float*)c, a, b, (float*)biasPtr);

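The row/conflict_free_size/offset constants above widen each 16-element row of the per-warp shared-memory cache from 16 to 24 floats (with an extra 16-element offset between the two halves), which, as the name suggests, is the usual padding trick for reducing shared-memory bank conflicts. The classic textbook form of the same trick, shown as a standalone CUDA sketch (the canonical padded-tile transpose, not the MNN layout):

// Classic shared-memory padding: the +1 column keeps a warp's column reads on distinct banks.
constexpr int TILE_DIM   = 32;
constexpr int BLOCK_ROWS = 8;

__global__ void transposeNoBankConflicts(float* out, const float* in) {
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];          // padded row: 33 floats
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    int width = gridDim.x * TILE_DIM;                       // assumes a square, tile-aligned matrix
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
        tile[threadIdx.y + j][threadIdx.x] = in[(y + j) * width + x];
    __syncthreads();
    x = blockIdx.y * TILE_DIM + threadIdx.x;                // swap block coordinates
    y = blockIdx.x * TILE_DIM + threadIdx.y;
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
        out[(y + j) * width + x] = tile[threadIdx.x][threadIdx.y + j];
}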
View File

@ -292,19 +292,65 @@ __global__ void NCHW_2_NHWC8(const T0* input,
const int maxCount,
const int channel,
const int area,
const int channel_pack
const int channel_pack,
DivModFast d_ocp,
DivModFast d_area
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int chnlp_idx = index % channel_pack;
int temp = index / channel_pack;
int area_idx = temp % area;
int batch_idx = temp / area;
int area_idx, temp, chnlp_idx, batch_idx;
d_ocp.divmod(index, temp, chnlp_idx);
d_area.divmod(temp, batch_idx, area_idx);
if(chnlp_idx >= channel) {
output[index] = (T1)0.0f;
continue;
}
output[index] = (T1)input[(batch_idx * channel + chnlp_idx) * area + area_idx];
int src_offset = (batch_idx * channel + chnlp_idx) * area + area_idx;
output[index] = (T1)input[src_offset];
}
}
template<typename T0, typename T1>
__global__ void NCHW_2_NHWC(const T0* input,
T1* output,
const int maxCount,
const int channel,
const int area,
const int channel_pack,
DivModFast d_oc,
DivModFast d_area
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, chnl_idx, batch_idx;
d_oc.divmod(index, temp, chnl_idx);
d_area.divmod(temp, batch_idx, area_idx);
int src_offset = (batch_idx * channel + chnl_idx) * area + area_idx;
output[index] = (T1)input[src_offset];
}
}
template<typename T0, typename T1>
__global__ void NHWC_2_NHWC8(const T0* input,
T1* output,
const int maxCount,
const int channel,
const int area,
const int channel_pack,
DivModFast d_ocp,
DivModFast d_area
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, chnlp_idx, batch_idx;
d_ocp.divmod(index, temp, chnlp_idx);
d_area.divmod(temp, batch_idx, area_idx);
if(chnlp_idx >= channel) {
output[index] = (T1)0.0f;
continue;
}
int src_offset = (batch_idx * area + area_idx) * channel + chnlp_idx;
output[index] = (T1)input[src_offset];
}
}
@ -314,17 +360,40 @@ __global__ void NHWC8_2_NCHW(const T0* input,
const int maxCount,
const int channel,
const int area,
const int channel_pack
const int channel_pack,
DivModFast d_oc,
DivModFast d_area
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx = index % area;
int temp = index / area;
int channel_idx = temp % channel;
int batch_idx = temp / channel;
output[index] = (T1)input[(batch_idx * area + area_idx) * channel_pack + channel_idx];
int area_idx, temp, channel_idx, batch_idx;
d_area.divmod(index, temp, area_idx);
d_oc.divmod(temp, batch_idx, channel_idx);
int src_offset = (batch_idx * area + area_idx) * channel_pack + channel_idx;
output[index] = (T1)input[src_offset];
}
}
template<typename T0, typename T1>
__global__ void NHWC8_2_NHWC(const T0* input,
T1* output,
const int maxCount,
const int channel,
const int area,
const int channel_pack,
DivModFast d_oc,
DivModFast d_area
) {
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int area_idx, temp, channel_idx, batch_idx;
d_oc.divmod(index, temp, channel_idx);
d_area.divmod(temp, batch_idx, area_idx);
int src_offset = (batch_idx * area + area_idx) * channel_pack + channel_idx;
output[index] = (T1)input[src_offset];
}
}
template<typename T0, typename T1>
@ -383,11 +452,25 @@ __global__ void NHWC8_2_C4NHW4(const T0* input,
template<class T0, class T1>
static void insideFormatConvert(T0* input, T1* output, MNN_DATA_FORMAT srcDataFormat, MNN_DATA_FORMAT dstDataFormat, CUDARuntime* runtime, \
const int area, const int batch, const int channel) {
DivModFast d_oc(channel);
DivModFast d_ocp(UP_DIV(channel, 8) * 8);
DivModFast d_area(area);
if(srcDataFormat == MNN_DATA_FORMAT_NCHW && dstDataFormat == MNN_DATA_FORMAT_NC4HW4) {
const int maxCount = batch * area * UP_DIV(channel, 8) * 8;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NCHW_2_NHWC8<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8);
NCHW_2_NHWC8<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8,
d_ocp, d_area);
checkKernelErrors;
return;
}
if(srcDataFormat == MNN_DATA_FORMAT_NHWC && dstDataFormat == MNN_DATA_FORMAT_NC4HW4) {
const int maxCount = batch * area * UP_DIV(channel, 8) * 8;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NHWC_2_NHWC8<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8,
d_ocp, d_area);
checkKernelErrors;
return;
}
@ -404,7 +487,26 @@ static void insideFormatConvert(T0* input, T1* output, MNN_DATA_FORMAT srcDataFo
const int maxCount = batch * area * channel;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NHWC8_2_NCHW<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8);
NHWC8_2_NCHW<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8,
d_oc, d_area);
checkKernelErrors;
return;
}
if(srcDataFormat == MNN_DATA_FORMAT_NC4HW4 && dstDataFormat == MNN_DATA_FORMAT_NHWC) {
const int maxCount = batch * area * channel;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NHWC8_2_NHWC<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8,
d_oc, d_area);
checkKernelErrors;
return;
}
if(srcDataFormat == MNN_DATA_FORMAT_NCHW && dstDataFormat == MNN_DATA_FORMAT_NHWC) {
const int maxCount = batch * area * channel;
const int block_num = runtime->blocks_num(maxCount);
const int block_size = runtime->threads_num();
NCHW_2_NHWC<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8,
d_oc, d_area);
checkKernelErrors;
return;
}
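All of the layout-conversion kernels above now take DivModFast helpers so the per-element index math avoids hardware integer division. A stand-in with the same divmod contract (quotient and remainder of the linear index by a fixed divisor) is sketched below; MNN's real helper, presumably the one declared in MNNCUDAFunction.cuh, replaces the / and % with a precomputed magic-number multiply-shift.

// Illustrative stand-in for DivModFast: same calling convention, naive arithmetic.
struct SimpleDivMod {
    int divisor;
    explicit SimpleDivMod(int d) : divisor(d) {}
    __host__ __device__ __forceinline__
    void divmod(int value, int& quotient, int& remainder) const {
        quotient  = value / divisor;
        remainder = value - quotient * divisor;
    }
};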

View File

@ -16,45 +16,29 @@ __global__ void WinoInputTrans(const T* input,
const int unit,
const int block,
const int ci,
const int ciDiv,
const int batch,
const int wLen,
const int hLen,
const int ePack,
const int lPack,
const int ci_p8,
const int maxCount,
DivModFast lD,
DivModFast whD,
DivModFast wD,
const int pad_x,
const int pad_y,
const int width,
const int height
) {
const int eU = (batch * wLen * hLen + 15) / 16;
const int lU = ciDiv;
const int eP = eU * ePack;
const int lP = lU * lPack;
const int maxCount = eP * lP;
const int l = ci_p8;
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += gridDim.x * blockDim.x) {
const int eU_idx = index / (lP * ePack);
const int tmp1 = index % (lP * ePack);
const int lU_idx = tmp1 / (ePack * lPack);
const int tmp2 = tmp1 % (ePack * lPack);
const int ePack_idx = tmp2 / lPack;
const int lPack_idx = tmp2 % lPack;
const int e_idx = eU_idx * ePack + ePack_idx;
const int batch_idx = e_idx / (wLen * hLen);
const int tmp3 = e_idx % (wLen * hLen);
const int w_idx = tmp3 % wLen;
const int h_idx = tmp3 / wLen;
int e_idx, ci_idx, batch_idx, tmp, w_idx, h_idx;
lD.divmod(index, e_idx, ci_idx);
whD.divmod(e_idx, batch_idx, tmp);
wD.divmod(tmp, h_idx, w_idx);
const int sxStart = w_idx * unit - pad_x;
const int syStart = h_idx * unit - pad_y;
T S00, S10, S20, S30, S01, S11, S21, S31, S02, S12, S22, S32, S03, S13, S23, S33;
int ci_p8 = ((ci + 7) / 8) * 8;
int ci_idx = lU_idx * lPack + lPack_idx;
int inp_offset = ((batch_idx * height + syStart) * width + sxStart) * ci_p8 + ci_idx;
//(((lU_idx * batch + batch_idx) * height + syStart) * width + sxStart) * 16 + lPack_idx;
{
int sx = 0 + sxStart;
int sy = 0 + syStart;
@ -74,14 +58,14 @@ __global__ void WinoInputTrans(const T* input,
int sy = 0 + syStart;
bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
S20 = outBound ? (T)(0) : input[inp_offset+2*ci_p8];
S20 = outBound ? (T)(0) : input[inp_offset+ci_p8+ci_p8];
}
{
int sx = 3 + sxStart;
int sy = 0 + syStart;
bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
S30 = outBound ? (T)(0) : input[inp_offset+3*ci_p8];
S30 = outBound ? (T)(0) : input[inp_offset+ci_p8+ci_p8+ci_p8];
}
{
int sx = 0 + sxStart;
@ -116,85 +100,85 @@ __global__ void WinoInputTrans(const T* input,
int sy = 2 + syStart;
bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
S02 = outBound ? (T)(0) : input[inp_offset+(2*width+0)*ci_p8];
S02 = outBound ? (T)(0) : input[inp_offset+(width+width+0)*ci_p8];
}
{
int sx = 1 + sxStart;
int sy = 2 + syStart;
bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
S12 = outBound ? (T)(0) : input[inp_offset+(2*width+1)*ci_p8];
S12 = outBound ? (T)(0) : input[inp_offset+(width+width+1)*ci_p8];
}
{
int sx = 2 + sxStart;
int sy = 2 + syStart;
bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
S22 = outBound ? (T)(0) : input[inp_offset+(2*width+2)*ci_p8];
S22 = outBound ? (T)(0) : input[inp_offset+(width+width+2)*ci_p8];
}
{
int sx = 3 + sxStart;
int sy = 2 + syStart;
bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
S32 = outBound ? (T)(0) : input[inp_offset+(2*width+3)*ci_p8];
S32 = outBound ? (T)(0) : input[inp_offset+(width+width+3)*ci_p8];
}
{
int sx = 0 + sxStart;
int sy = 3 + syStart;
bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
S03 = outBound ? (T)(0) : input[inp_offset+(3*width+0)*ci_p8];
S03 = outBound ? (T)(0) : input[inp_offset+(width+width+width+0)*ci_p8];
}
{
int sx = 1 + sxStart;
int sy = 3 + syStart;
bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
S13 = outBound ? (T)(0) : input[inp_offset+(3*width+1)*ci_p8];
S13 = outBound ? (T)(0) : input[inp_offset+(width+width+width+1)*ci_p8];
}
{
int sx = 2 + sxStart;
int sy = 3 + syStart;
bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
S23 = outBound ? (T)(0) : input[inp_offset+(3*width+2)*ci_p8];
S23 = outBound ? (T)(0) : input[inp_offset+(width+width+width+2)*ci_p8];
}
{
int sx = 3 + sxStart;
int sy = 3 + syStart;
bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
S33 = outBound ? (T)(0) : input[inp_offset+(3*width+3)*ci_p8];
S33 = outBound ? (T)(0) : input[inp_offset+(width+width+width+3)*ci_p8];
}
T m00 = +S00 - S02;
T m10 = +S10 - S12;
T m20 = +S20 - S22;
T m30 = +S30 - S32;
T m01 = +(T)0.5f * S01 + (T)0.5f * S02;
T m11 = +(T)0.5f * S11 + (T)0.5f * S12;
T m21 = +(T)0.5f * S21 + (T)0.5f * S22;
T m31 = +(T)0.5f * S31 + (T)0.5f * S32;
T m02 = -(T)0.5f * S01 + (T)0.5f * S02;
T m12 = -(T)0.5f * S11 + (T)0.5f * S12;
T m22 = -(T)0.5f * S21 + (T)0.5f * S22;
T m32 = -(T)0.5f * S31 + (T)0.5f * S32;
T m01 = +(T)0.5f * (S01 + S02);
T m11 = +(T)0.5f * (S11 + S12);
T m21 = +(T)0.5f * (S21 + S22);
T m31 = +(T)0.5f * (S31 + S32);
T m02 = +(T)0.5f * (-S01 + S02);
T m12 = +(T)0.5f * (-S11 + S12);
T m22 = +(T)0.5f * (-S21 + S22);
T m32 = +(T)0.5f * (-S31 + S32);
T m03 = -S01 + S03;
T m13 = -S11 + S13;
T m23 = -S21 + S23;
T m33 = -S31 + S33;
BtdB[0*maxCount + index] = +m00 - m20;
BtdB[1*maxCount + index] = +(T)0.5f * m10 + (T)0.5f * m20;
BtdB[2*maxCount + index] = -(T)0.5f * m10 + (T)0.5f * m20;
BtdB[1*maxCount + index] = +(T)0.5f * (m10 + m20);
BtdB[2*maxCount + index] = +(T)0.5f * (-m10 + m20);
BtdB[3*maxCount + index] = -m10 + m30;
BtdB[4*maxCount + index] = +m01 - m21;
BtdB[5*maxCount + index] = +(T)0.5f * m11 + (T)0.5f * m21;
BtdB[6*maxCount + index] = -(T)0.5f * m11 + (T)0.5f * m21;
BtdB[5*maxCount + index] = +(T)0.5f * (m11 + m21);
BtdB[6*maxCount + index] = +(T)0.5f * (-m11 + m21);
BtdB[7*maxCount + index] = -m11 + m31;
BtdB[8*maxCount + index] = +m02 - m22;
BtdB[9*maxCount + index] = +(T)0.5f * m12 + (T)0.5f * m22;
BtdB[10*maxCount + index] = -(T)0.5f * m12 + (T)0.5f * m22;
BtdB[9*maxCount + index] = +(T)0.5f * (m12 + m22);
BtdB[10*maxCount + index] = +(T)0.5f * (-m12 + m22);
BtdB[11*maxCount + index] = -m12 + m32;
BtdB[12*maxCount + index] = +m03 - m23;
BtdB[13*maxCount + index] = +(T)0.5f * m13 + (T)0.5f * m23;
BtdB[14*maxCount + index] = -(T)0.5f * m13 + (T)0.5f * m23;
BtdB[13*maxCount + index] = +(T)0.5f * (m13 + m23);
BtdB[14*maxCount + index] = +(T)0.5f * (-m13 + m23);
BtdB[15*maxCount + index] = -m13 + m33;
}
}
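The arithmetic change above is only a refactor of 0.5f*a + 0.5f*b into 0.5f*(a + b), saving one multiply per term; the transform itself is unchanged. Read off the kernel, the 1-D input transform applied along each axis of the 4x4 source tile is, as a host-side reference with Bt = [1 0 -1 0; 0 .5 .5 0; 0 -.5 .5 0; 0 -1 0 1]:

// Reference 1-D Winograd F(2,3) input transform matching the kernel above.
inline void winoInputTransform1D(const float d[4], float t[4]) {
    t[0] = d[0] - d[2];
    t[1] = 0.5f * (d[1] + d[2]);
    t[2] = 0.5f * (d[2] - d[1]);
    t[3] = d[3] - d[1];
}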
@ -208,48 +192,38 @@ __global__ void WinoTrans2Output(const T* matmulData,
const int unit,
const int block,
const int co,
const int ciDiv,
const int batch,
const int wLen,
const int hLen,
const int ePack,
const int hPack,
const int co_p8,
const int maxCount,
DivModFast hD,
DivModFast whD,
DivModFast wD,
const int width,
const int height
) {
size_t e = param->elh[0];
size_t hU = param->elhPack[2];
const size_t maxCount = e * hU * hPack;
const int h = co_p8;
for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += gridDim.x * blockDim.x) {
const int hU_idx = index / (e * hPack);
const int tmp1 = index % (e * hPack);
const int e_idx = tmp1 / hPack;
const int hPack_idx = tmp1 % hPack;
int e_idx, co_idx, batch_idx, tmp, w_idx, h_idx;
hD.divmod(index, e_idx, co_idx);
whD.divmod(e_idx, batch_idx, tmp);
wD.divmod(tmp, h_idx, w_idx);
const int batch_idx = e_idx / (wLen * hLen);
const int tmp3 = e_idx % (wLen * hLen);
const int w_idx = tmp3 % wLen;
const int h_idx = tmp3 / wLen;
int basic_offset = (hU_idx * e + e_idx) * hPack + hPack_idx;
auto S00 = matmulData[basic_offset];
auto S10 = matmulData[basic_offset + maxCount];
auto S20 = matmulData[basic_offset + maxCount * 2];
auto S30 = matmulData[basic_offset + maxCount * 3];
auto S01 = matmulData[basic_offset + maxCount * 4];
auto S11 = matmulData[basic_offset + maxCount * 5];
auto S21 = matmulData[basic_offset + maxCount * 6];
auto S31 = matmulData[basic_offset + maxCount * 7];
auto S02 = matmulData[basic_offset + maxCount * 8];
auto S12 = matmulData[basic_offset + maxCount * 9];
auto S22 = matmulData[basic_offset + maxCount * 10];
auto S32 = matmulData[basic_offset + maxCount * 11];
auto S03 = matmulData[basic_offset + maxCount * 12];
auto S13 = matmulData[basic_offset + maxCount * 13];
auto S23 = matmulData[basic_offset + maxCount * 14];
auto S33 = matmulData[basic_offset + maxCount * 15];
auto S00 = matmulData[index];
auto S10 = matmulData[index + maxCount];
auto S20 = matmulData[index + maxCount * 2];
auto S30 = matmulData[index + maxCount * 3];
auto S01 = matmulData[index + maxCount * 4];
auto S11 = matmulData[index + maxCount * 5];
auto S21 = matmulData[index + maxCount * 6];
auto S31 = matmulData[index + maxCount * 7];
auto S02 = matmulData[index + maxCount * 8];
auto S12 = matmulData[index + maxCount * 9];
auto S22 = matmulData[index + maxCount * 10];
auto S32 = matmulData[index + maxCount * 11];
auto S03 = matmulData[index + maxCount * 12];
auto S13 = matmulData[index + maxCount * 13];
auto S23 = matmulData[index + maxCount * 14];
auto S33 = matmulData[index + maxCount * 15];
auto m00 = +S00 + S01 + S02;
auto m10 = +S10 + S11 + S12;
@ -261,18 +235,15 @@ __global__ void WinoTrans2Output(const T* matmulData,
auto m31 = +S31 - S32 + S33;
// write output
float bias = biasData[hU_idx * hPack + hPack_idx];
float bias = biasData[co_idx];
const int dxStart = w_idx * unit;
const int dyStart = h_idx * unit;
int co_p8 = ((co + 7) / 8) * 8;
int co_idx = hU_idx * hPack + hPack_idx;
if(co_idx >= co_p8) {
continue;
}
int out_offset = ((batch_idx * height + dyStart) * width + dxStart) * co_p8 + co_idx;
//(((hU_idx * batch + batch_idx) * height + dyStart) * width + dxStart) * 16 + hPack_idx;
/* if true */ {
float res = bias + (float)(m00 + m10 + m20);

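For completeness, the matching 1-D output transform used by WinoTrans2Output, read off the m00/m01 expressions above with At = [1 1 1 0; 0 1 -1 1]; the bias is added once after both passes, as in the kernel:

// Reference 1-D Winograd F(2,3) output transform matching WinoTrans2Output.
inline void winoOutputTransform1D(const float m[4], float o[2]) {
    o[0] = m[0] + m[1] + m[2];
    o[1] = m[1] - m[2] + m[3];
}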
View File

@ -16,56 +16,129 @@ namespace MNN {
void NPUBinary::OpInsert(int binary_type, string opName,
ge::Operator& input0, ge::Operator& input1,
const std::vector<Tensor *> &outputs){
const std::vector<Tensor *> &outputs, int activationType){
if(binary_type == BinaryOpOperation_ADD) {
shared_ptr<ge::op::Add> binary(new ge::op::Add(opName));
(*binary)
.set_input_x1(input0)
.set_input_x2(input1);
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
if(activationType == 1) {
shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
(*binary_activation)
.set_input_x(*binary.get())
.set_attr_mode(1);
mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
} else {
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
}
} else if(binary_type == BinaryOpOperation_MUL) {
shared_ptr<ge::op::Mul> binary(new ge::op::Mul(opName));
(*binary)
.set_input_x(input0)
.set_input_y(input1);
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
if(activationType == 1) {
shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
(*binary_activation)
.set_input_x(*binary.get())
.set_attr_mode(1);
mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
} else {
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
}
} else if(binary_type == BinaryOpOperation_REALDIV) {
shared_ptr<ge::op::RealDiv> binary(new ge::op::RealDiv(opName));
(*binary)
.set_input_x1(input0)
.set_input_x2(input1);
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
if(activationType == 1) {
shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
(*binary_activation)
.set_input_x(*binary.get())
.set_attr_mode(1);
mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
} else {
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
}
} else if(binary_type == BinaryOpOperation_SUB) {
shared_ptr<ge::op::Sub> binary(new ge::op::Sub(opName));
(*binary)
.set_input_x1(input0)
.set_input_x2(input1);
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
if(activationType == 1) {
shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
(*binary_activation)
.set_input_x(*binary.get())
.set_attr_mode(1);
mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
} else {
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
}
} else if(binary_type == BinaryOpOperation_MINIMUM) {
shared_ptr<ge::op::Minimum> binary(new ge::op::Minimum(opName));
(*binary)
.set_input_x1(input0)
.set_input_x2(input1);
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
if(activationType == 1) {
shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
(*binary_activation)
.set_input_x(*binary.get())
.set_attr_mode(1);
mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
} else {
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
}
} else if(binary_type == BinaryOpOperation_MAXIMUM) {
shared_ptr<ge::op::Maximum> binary(new ge::op::Maximum(opName));
(*binary)
.set_input_x1(input0)
.set_input_x2(input1);
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
if(activationType == 1) {
shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
(*binary_activation)
.set_input_x(*binary.get())
.set_attr_mode(1);
mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
} else {
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
}
} else if(binary_type == BinaryOpOperation_EQUAL) {
shared_ptr<ge::op::Equal> binary(new ge::op::Equal(opName));
(*binary)
.set_input_x1(input0)
.set_input_x2(input1);
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
if(activationType == 1) {
shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
(*binary_activation)
.set_input_x(*binary.get())
.set_attr_mode(1);
mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
} else {
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
}
} else if(binary_type == BinaryOpOperation_LESS_EQUAL) {
shared_ptr<hiai::op::LessEqual> binary(new hiai::op::LessEqual(opName));
(*binary)
.set_input_x1(input0)
.set_input_x2(input1);
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
if(activationType == 1) {
shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
(*binary_activation)
.set_input_x(*binary.get())
.set_attr_mode(1);
mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
} else {
mNpuBackend->setOutputOps(mOp, {binary}, outputs);
}
}else{
MNN_ERROR("npu binary not support type : %d \n", binary_type);
MNN_ASSERT(false);
@ -149,21 +222,21 @@ ErrorCode NPUBinary::onResize(const std::vector<Tensor *> &inputs, const std::ve
vector<pair<shared_ptr<ge::Operator>, string>> ops;
auto binary_type = mOp->main_as_BinaryOp()->opType();
int activationType = mOp->main_as_BinaryOp()->activationType();
if(!isConst0 && isConst1){
//
auto inputIndex0 = mOp->inputIndexes()->data()[0];
auto iops0 = mNpuBackend->mGrapMap[inputIndex0]; // x
auto xOp0 = iops0.back().first;
OpInsert(binary_type, opName, *xOp0.get(), mConst, outputs);
OpInsert(binary_type, opName, *xOp0.get(), mConst, outputs, activationType);
}else if(isConst0 && !isConst1){
//
auto inputIndex1 = mOp->inputIndexes()->data()[1];
auto iops1 = mNpuBackend->mGrapMap[inputIndex1]; // x
auto xOp1 = iops1.back().first;
OpInsert(binary_type, opName, mConst, *xOp1.get(), outputs);
OpInsert(binary_type, opName, mConst, *xOp1.get(), outputs, activationType);
}else{
@ -177,7 +250,7 @@ ErrorCode NPUBinary::onResize(const std::vector<Tensor *> &inputs, const std::ve
auto iops1 = mNpuBackend->mGrapMap[inputIndex1]; // x
auto xOp1 = iops1.back().first;
OpInsert(binary_type, opName, *xOp0.get(), *xOp1.get(), outputs);
OpInsert(binary_type, opName, *xOp0.get(), *xOp1.get(), outputs, activationType);
}
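Each of the eight operator branches in OpInsert repeats the same optional-ReLU epilogue. A hedged sketch of how that epilogue could be factored out, using only the HiAI calls that already appear above (the helper name and the NPUBackend parameter type are assumptions, not part of this change):

// Hypothetical helper: append a ReLU Activation node when activationType == 1.
template <typename BinaryOpT>
static void setBinaryOutputOps(NPUBackend* backend, const MNN::Op* op,
                               std::shared_ptr<BinaryOpT> binary,
                               const std::vector<Tensor*>& outputs,
                               int activationType, const std::string& opName) {
    if (activationType == 1) {
        std::shared_ptr<ge::op::Activation> relu(new ge::op::Activation(opName + "_Relu"));
        (*relu).set_input_x(*binary.get()).set_attr_mode(1);   // mode 1 == ReLU
        backend->setOutputOps(op, {binary, relu}, outputs);
    } else {
        backend->setOutputOps(op, {binary}, outputs);
    }
}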

View File

@ -18,7 +18,7 @@ class NPUBinary : public NPUCommonExecution {
public:
void OpInsert(int binary_type, string opName,
ge::Operator& input0, ge::Operator& input1,
const std::vector<Tensor *> &outputs);
const std::vector<Tensor *> &outputs, int activationType);
NPUBinary(Backend *b, const Op *op, const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
virtual ~NPUBinary() = default;

View File

@ -2580,7 +2580,7 @@ const char* shader_MetalBinary_metal =
" int i0stride;\n"
" int i1stride;\n"
" int output_data_count;\n"
" int output_width;\n"
" int activationType;\n"
"};\n"
"#define define_op(op) "
"kernel void binary_##op##_x1(const device M *in0 [[buffer(0)]],"
@ -2591,7 +2591,11 @@ const char* shader_MetalBinary_metal =
" if ((int)gid >= s.output_data_count) return;"
" auto V0=in0[s.i0stride*int(gid)];"
" auto V1=in1[s.i1stride*int(gid)];"
" out[int(gid)]=op(V0,V1);"
" auto val=op(V0,V1);"
" if(s.activationType == 1) {"
" val=(val<(M)0 ? (M)0 : val);"
" }"
" out[int(gid)]=val;"
"}\n"
"static inline M add(M V1,M V2) {\n"
" return V1+V2;\n"

View File

@ -56,6 +56,9 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
printf("Error Key = %s\n", iter.first.c_str());
NSLog(@"Warning: Metallib Library error: %@", err);
}
[libraryMap removeAllObjects];
libraryMap = nil;
return;
}
auto functionNames = [library functionNames];
for(int i=0; i<functionNames.count ; i++) {
@ -102,7 +105,8 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
_caches = [NSMutableDictionary dictionary];
_waitings = [NSMutableArray array];
_isCommitEachShader = self.class.commit_frequent;
return nil != _library;
return (0 != [_library count]);
}
- (instancetype)init {

View File

@ -553,6 +553,7 @@ int MetalBackend::onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTen
if (toCpu) {
[ctx wait];
}
mFrameEncodeCache = false;
return 0;
}
@ -872,7 +873,7 @@ void registerMetalRuntimeCreator() {
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
if (nil != device) {
registerMetalOps();
MNNInsertExtraRuntimeCreator(MNN_FORWARD_METAL, new MetalRuntimeCreator(device), false);
MNNInsertExtraRuntimeCreator(MNN_FORWARD_METAL, new MetalRuntimeCreator(device), true);
} else {
MNN_ERROR("Init Metal Error\n");
}

View File

@ -17,7 +17,7 @@ namespace MNN {
class MetalBinary : public Execution {
public:
MetalBinary(Backend *backend, std::string type);
MetalBinary(Backend *backend, std::string type, const MNN::Op *op);
virtual ~MetalBinary() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
@ -26,6 +26,7 @@ private:
id<MTLBuffer> mConstBuffer;
id<MTLComputePipelineState> mPipeline;
std::pair<MTLSize, MTLSize> mThreads;
int mActivationType = 0;
};
} // namespace MNN

View File

@ -14,13 +14,14 @@
#if MNN_METAL_ENABLED
namespace MNN {
MetalBinary::MetalBinary(Backend *backend, std::string type) : Execution(backend) {
MetalBinary::MetalBinary(Backend *backend, std::string type, const MNN::Op *op) : Execution(backend) {
auto mKernelName = "binary_" + type + "_x1";
auto mtbn = static_cast<MetalBackend *>(backend);
auto context = (__bridge MNNMetalContext *)mtbn->context();
mConstBuffer = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
auto kn = [NSString stringWithCString:mKernelName.c_str() encoding:[NSString defaultCStringEncoding]];
mPipeline = [context pipelineWithName:kn];
mActivationType = op->main_as_BinaryOp()->activationType();
}
ErrorCode MetalBinary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto backend = static_cast<MetalBackend *>(this->backend());
@ -33,7 +34,7 @@ ErrorCode MetalBinary::onResize(const std::vector<Tensor *> &inputs, const std::
((int *)mConstBuffer.contents)[0] = input0_data_count == 1 ? 0 : 1;
((int *)mConstBuffer.contents)[1] = input1_data_count == 1 ? 0 : 1;
((int *)mConstBuffer.contents)[2] = outdatacount;
((int *)mConstBuffer.contents)[3] = 0;
((int *)mConstBuffer.contents)[3] = mActivationType;
mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(outdatacount, 1, 1)];
return NO_ERROR;
}
@ -94,7 +95,7 @@ public:
FUNC_PRINT(binaryop->opType());
return nullptr;
}
return new MetalBinary(backend, type);
return new MetalBinary(backend, type, op);
}
};
REGISTER_METAL_OP_CREATOR(MetalBinaryCreator, OpType_BinaryOp);

View File

@ -2,7 +2,7 @@ struct binary_op_shape {
int i0stride;
int i1stride;
int output_data_count;
int output_width;
int activationType;
};
#define define_op(op) \
kernel void binary_##op##_x1(const device ftype *in0 [[buffer(0)]],\
@ -13,7 +13,11 @@ kernel void binary_##op##_x1(const device ftype *in0 [[buffer(0)]],\
if ((int)gid >= s.output_data_count) return;\
auto value0 = in0[s.i0stride * int(gid)];\
auto value1 = in1[s.i1stride * int(gid)];\
out[int(gid)] = op(value0, value1);\
auto val = op(value0, value1);\
if(s.activationType == 1) {\
val = (val < (ftype)0 ? (ftype)0 : val);\
}\
out[int(gid)] = val;\
}
static inline ftype add(ftype value1, ftype value2) {
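Both the Metal kernel above and the OpenCL kernels later in this commit gain the same activationType flag. A scalar C++ reference of the intended semantics (the names below are illustrative only, not part of the sources): the binary op is applied first, and a ReLU is fused on top of it when activationType == 1; any other value leaves the result untouched.

#include <algorithm>

// Reference semantics of the fused flag: apply the binary op, then clamp at zero iff activationType == 1.
template <typename T, typename Op>
T binaryWithActivation(Op op, T a, T b, int activationType) {
    T v = op(a, b);
    if (activationType == 1) {                 // mirrors the kernel-side check
        v = std::max(v, static_cast<T>(0));
    }
    return v;
}
// e.g. binaryWithActivation([](float x, float y) { return x + y; }, -3.0f, 1.0f, 1) == 0.0f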

View File

@ -0,0 +1,20 @@
# Android NN API
file(GLOB MNN_NNAPI_SRCS
${CMAKE_CURRENT_LIST_DIR}/backend/*.cpp
${CMAKE_CURRENT_LIST_DIR}/backend/*.hpp
${CMAKE_CURRENT_LIST_DIR}/execution/*.cpp
${CMAKE_CURRENT_LIST_DIR}/execution/*.hpp
)
add_library(
MNN_NNAPI
STATIC
${MNN_NNAPI_SRCS}
)
target_compile_options(MNN_NNAPI PRIVATE -DMNN_NNAPI_ENABLED=1)
target_include_directories(MNN_NNAPI PRIVATE
${CMAKE_CURRENT_LIST_DIR}/backend
${CMAKE_CURRENT_LIST_DIR}/execution
)

View File

@ -0,0 +1,411 @@
//
// NNAPIBackend.cpp
// MNN
//
// Created by MNN on 2021/09/05.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "NNAPIBackend.hpp"
#include "backend/cpu/CPUTensorConvert.hpp"
#include <core/Macro.h>
#include <core/TensorUtils.hpp>
#include <stdlib.h>
#include <mutex>
#include <MNN/AutoTime.hpp>
// #define NNAPI_DEBUG
// #define USE_NCHW
#define CHECK(func, ...) \
do { \
const auto _status = (func(__VA_ARGS__)); \
if (_status != ANEURALNETWORKS_NO_ERROR) { \
const auto ENUM_TO_STR = NNAPIEnumToString(_status); \
MNN_PRINT("[NNAPI] Error: %s when call " #func " at line %d.\n", \
ENUM_TO_STR.c_str(), __LINE__); \
} \
} while (0)
namespace MNN {
void registerNNAPIOps();
static inline std::map<OpType, NNAPIBackend::Creator*>* getCreatorMap() {
static std::once_flag of;
static std::map<OpType, NNAPIBackend::Creator*>* ret = nullptr;
std::call_once(of, [&]() { ret = new std::map<OpType, NNAPIBackend::Creator*>; });
return ret;
}
std::string NNAPIEnumToString(int code) {
switch (code) {
#define ENUM_TO_STR(code) case ANEURALNETWORKS_##code: return #code
// ResultCode begin
ENUM_TO_STR(NO_ERROR);
ENUM_TO_STR(OUT_OF_MEMORY);
ENUM_TO_STR(INCOMPLETE);
ENUM_TO_STR(UNEXPECTED_NULL);
ENUM_TO_STR(BAD_DATA);
ENUM_TO_STR(OP_FAILED);
ENUM_TO_STR(BAD_STATE);
ENUM_TO_STR(UNMAPPABLE);
ENUM_TO_STR(OUTPUT_INSUFFICIENT_SIZE);
ENUM_TO_STR(UNAVAILABLE_DEVICE);
// ResultCode end
default:
return "UNKNOWN_ENUM";
#undef ENUM_TO_STR
}
}
bool NNAPIBackend::addCreator(OpType t, Creator* c) {
auto map = getCreatorMap();
if (map->find(t) != map->end()) {
MNN_PRINT("Error: %d type has be added\n", t);
return false;
}
map->insert(std::make_pair(t, c));
return true;
}
NNAPIBackend::NNAPIBackend(const NNAPIRuntime* runtime) : Backend(MNN_FORWARD_NN) {
mNPURuntime = runtime;
mPrecision = mNPURuntime->mPrecision;
#ifdef USE_NCHW
mNCHW = true;
#else
mNCHW = false;
#endif
MNN_PRINT("[NNAPI] DimensionFormat is %s\n", mNCHW ? "NCHW" : "NHWC");
if (mNNAPIModel == nullptr) {
CHECK(ANeuralNetworksModel_create_27, &mNNAPIModel);
}
if (mNNAPIDevices.empty()) {
uint32_t numDevices = 0;
CHECK(ANeuralNetworks_getDeviceCount_29, &numDevices);
mNNAPIDevices.resize(numDevices);
MNN_PRINT("[NNAPI] numDevices = %d\n", numDevices);
for (int i = 0; i < numDevices; i++) {
CHECK(ANeuralNetworks_getDevice_29, i, &mNNAPIDevices[i].device);
CHECK(ANeuralNetworksDevice_getName_29, mNNAPIDevices[i].device, &mNNAPIDevices[i].name);
CHECK(ANeuralNetworksDevice_getType_29, mNNAPIDevices[i].device, &mNNAPIDevices[i].type);
MNN_PRINT("[NNAPI] device %d is : %s, %d\n", i, mNNAPIDevices[i].name, mNNAPIDevices[i].type);
}
}
}
NNAPIBackend::~NNAPIBackend() {
ANeuralNetworksCompilation_free_27(mNNAPICompilation);
ANeuralNetworksModel_free_27(mNNAPIModel);
}
Execution* NNAPIBackend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op) {
auto map = getCreatorMap();
auto iter = map->find(op->type());
if (iter == map->end()) {
MNN_PRINT("[NNAPI] Don't support type %s.\n", MNN::EnumNameOpType(op->type()));
return nullptr;
}
auto exe = iter->second->onCreate(inputs, outputs, op, this);
if (nullptr == exe) {
MNN_PRINT("[NNAPI] The Creator Don't support type %s.\n", MNN::EnumNameOpType(op->type()));
return nullptr;
}
return exe;
}
void NNAPIBackend::onExecuteBegin() const {
}
void NNAPIBackend::onExecuteEnd() const {
invokeModel();
}
Backend::MemObj* NNAPIBackend::onAcquire(const Tensor* tensor, StorageType storageType) {
bool isInputCopy = TensorUtils::getDescribe(tensor)->usage==Tensor::InsideDescribe::Usage::INPUT;
bool isOutputCopy = TensorUtils::getDescribe(tensor)->usage==Tensor::InsideDescribe::Usage::OUTPUT;
std::unique_ptr<Tensor> tensor_(new Tensor(tensor, mNCHW ? Tensor::DimensionType::CAFFE : Tensor::DimensionType::TENSORFLOW, true));
if(isInputCopy){
mInputTensors.push_back(tensor);
mInputContentTensors.push_back(std::move(tensor_));
mInputIdxMap.insert(std::make_pair(tensor, mInputIdxMap.size()));
}
if(isOutputCopy){
mOutputTensors.push_back(tensor);
mOutputContentTensors.push_back(std::move(tensor_));
mOutputIdxMap.insert(std::make_pair(tensor, mOutputIdxMap.size()));
// TensorUtils::getDescribe(tensor)->memoryType = Tensor::InsideDescribe::MEMORY_HOST;
// const_cast<halide_buffer_t&>(tensor->buffer()).host = (uint8_t*)MNNMemoryAllocAlign(tensor->size(), MNN_MEMORY_ALIGN_DEFAULT);
// MNN_ASSERT(tensor->buffer().host != nullptr);
}
getTensorIdx(tensor);
// Don't need release
return new Backend::MemObj;
}
bool NNAPIBackend::onClearBuffer() {
mInputContentTensors.clear();
mOutputContentTensors.clear();
mInputTensors.clear();
mOutputTensors.clear();
mInputIdxMap.clear();
mOutputIdxMap.clear();
return true;
}
void NNAPIBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
bool isInputCopy = TensorUtils::getDescribe(dstTensor)->usage==Tensor::InsideDescribe::Usage::INPUT;
bool isOutputCopy = TensorUtils::getDescribe(srcTensor)->usage==Tensor::InsideDescribe::Usage::OUTPUT;
bool isConst = TensorUtils::getDescribe(srcTensor)->usage==Tensor::InsideDescribe::Usage::CONSTANT || TensorUtils::getDescribe(dstTensor)->usage==Tensor::InsideDescribe::Usage::CONSTANT;
if(isConst){ return; }
if (isInputCopy) {
const auto iter = mInputIdxMap.find(dstTensor);
MNN_ASSERT(iter != mInputIdxMap.end());
// memcpy((void*)&mInputTensors[iter->second], &srcTensor, sizeof(void*));
auto code = CPUTensorConverter::convert(srcTensor, mInputContentTensors[iter->second].get());
if (NO_ERROR != code) {
MNN_ERROR("Error in NNAPIBackend::onCopyBuffer:convert\n");
}
} else if (isOutputCopy) {
const auto iter = mOutputIdxMap.find(srcTensor);
MNN_ASSERT(iter != mOutputIdxMap.end());
// memcpy(dstTensor->host<void>(), srcTensor->host<void>(), std::min(srcTensor->size(), dstTensor->size()));
auto code = CPUTensorConverter::convert(mOutputContentTensors[iter->second].get(), dstTensor);
if (NO_ERROR != code) {
MNN_ERROR("Error in NNAPIBackend::onCopyBuffer:convert\n");
}
}
}
void NNAPIBackend::onResizeBegin() {
}
void NNAPIBackend::onResizeEnd() {
buildModel();
}
uint32_t NNAPIBackend::getTensorIdx(const Tensor* t) {
const auto& iter = mTensorIdxMap.find(t);
if (iter != mTensorIdxMap.end()) {
return iter->second;
}
std::vector<uint32_t> dims;
for (auto d : t->shape()) {
dims.push_back(d);
}
std::vector<uint32_t> udims(dims.begin(), dims.end());
    if (TensorUtils::getDescribe(t)->dimensionFormat != MNN_DATA_FORMAT_NHWC && !mNCHW && dims.size() == 4) {
// NCHW -> NHWC
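        // e.g. an NCHW tensor {1, 3, 224, 224} is registered as the NHWC operand dims {1, 224, 224, 3}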
udims[0] = dims[0];
udims[1] = dims[2];
udims[2] = dims[3];
udims[3] = dims[1];
}
uint32_t idx = buildOperand(nullptr, 0, ANEURALNETWORKS_TENSOR_FLOAT32, udims);
mTensorIdxMap.insert(std::make_pair(t, idx));
return idx;
}
uint32_t NNAPIBackend::buildScalar(int scalar) {
auto iter = mScalarIntMap.find(scalar);
if (iter != mScalarIntMap.end()) {
return iter->second;
}
auto scalarIdx = buildOperand(&scalar, 4, ANEURALNETWORKS_INT32);
mScalarIntMap.insert(std::make_pair(scalar, scalarIdx));
return scalarIdx;
}
uint32_t NNAPIBackend::buildScalar(bool scalar) {
auto iter = mScalarBoolMap.find(scalar);
if (iter != mScalarBoolMap.end()) {
return iter->second;
}
uint8_t value = scalar;
auto scalarIdx = buildOperand(&value, 1, ANEURALNETWORKS_BOOL);
mScalarBoolMap.insert(std::make_pair(scalar, scalarIdx));
return scalarIdx;
}
uint32_t NNAPIBackend::buildScalar(float scalar) {
auto iter = mScalarFloatMap.find(scalar);
if (iter != mScalarFloatMap.end()) {
return iter->second;
}
auto scalarIdx = buildOperand(&scalar, 4, ANEURALNETWORKS_FLOAT32);
mScalarFloatMap.insert(std::make_pair(scalar, scalarIdx));
return scalarIdx;
}
uint32_t NNAPIBackend::buildOperand(const void* data, size_t size, OperandCode code, std::vector<uint32_t> dims) {
ANeuralNetworksOperandType operandType {
.type = code,
.dimensionCount = static_cast<uint32_t>(dims.size()),
.dimensions = dims.empty() ? nullptr : dims.data(),
.scale = 0.0f,
.zeroPoint = 0,
};
CHECK(ANeuralNetworksModel_addOperand_27, mNNAPIModel, &operandType);
uint32_t operandIdx = mTensorIdx++;
#ifdef NNAPI_DEBUG
MNN_PRINT("build operand : {\n");
MNN_PRINT("\tidx : %d\n", operandIdx);
MNN_PRINT("\tdata : %p\n", data);
MNN_PRINT("\tsize : %d\n", size);
MNN_PRINT("\ttype : %d\n", operandType.type);
MNN_PRINT("\tdimensions : [ ");
for (auto i : dims) MNN_PRINT("%d, ", i);
MNN_PRINT("]\n}\n");
#endif
if (data && size) {
CHECK(ANeuralNetworksModel_setOperandValue_27, mNNAPIModel, operandIdx, data, size);
}
return operandIdx;
}
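// Op executions compose these helpers through NNAPICommonExecution instead of
// touching the NNAPI model directly, e.g. (as NNAPIBinary does):
//   auto idxs = getTensorIdxs(inputs);                        // tensor operands
//   idxs.push_back(buildScalar(ANEURALNETWORKS_FUSED_NONE));  // trailing scalar operand
//   buildOperation(ANEURALNETWORKS_ADD, idxs, getTensorIdxs(outputs));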
ErrorCode NNAPIBackend::buildOperation(int op, const std::vector<uint32_t> &inputs, const std::vector<uint32_t> &outputs, const char* name) {
#ifdef NNAPI_DEBUG
MNN_PRINT("build operation : {\n");
MNN_PRINT("\ttype : %d\n", op);
MNN_PRINT("\tinputs : [ ");
for (auto i : inputs) MNN_PRINT("%d, ", i);
MNN_PRINT("]\n\toutputs : [ ");
for (auto i : outputs) MNN_PRINT("%d, ", i);
MNN_PRINT("]\n}\n");
#endif
if (name) mOpNames.push_back(name);
CHECK(ANeuralNetworksModel_addOperation_27,
mNNAPIModel, op,
inputs.size(), inputs.data(),
outputs.size(), outputs.data());
return NO_ERROR;
}
void NNAPIBackend::buildModel() {
// set input and output of model
std::vector<uint32_t> inputOperands(mInputTensors.size()), outputOperands(mOutputTensors.size());
for (int i = 0; i < mInputTensors.size(); i++) {
inputOperands[i] = getTensorIdx(mInputTensors[i]);
}
for (int i = 0; i < mOutputTensors.size(); i++) {
outputOperands[i] = getTensorIdx(mOutputTensors[i]);
}
#ifdef NNAPI_DEBUG
MNN_PRINT("set model's inputs & outputs : {\n");
MNN_PRINT("\tinputs : [ ");
for (auto i : inputOperands) MNN_PRINT("%d, ", i);
MNN_PRINT("]\n\toutputs : [ ");
for (auto i : outputOperands) MNN_PRINT("%d, ", i);
MNN_PRINT("]\n}\n");
#endif
CHECK(ANeuralNetworksModel_identifyInputsAndOutputs_27,
mNNAPIModel,
inputOperands.size(),
inputOperands.data(),
outputOperands.size(),
outputOperands.data());
// segment fault
CHECK(ANeuralNetworksModel_finish_27, mNNAPIModel);
std::unique_ptr<bool[]> supports(new bool[mOpNames.size()]);
int selectDeviceIdx = -1;
for (int i = 0; i < mNNAPIDevices.size(); i++) {
auto device = mNNAPIDevices[i].device;
auto name = mNNAPIDevices[i].name;
auto type = mNNAPIDevices[i].type;
CHECK(ANeuralNetworksModel_getSupportedOperationsForDevices_29, mNNAPIModel, &device, 1, supports.get());
MNN_PRINT("[NNAPI] device [%d : %s] supportOps = {\n", i, name);
bool allsupport = true;
        for (size_t j = 0; j < mOpNames.size(); j++) {
            allsupport &= supports[j];
            MNN_PRINT("\t%s : %d\n", mOpNames[j], supports[j]);
}
MNN_PRINT("}\n");
if (allsupport) {
selectDeviceIdx = i;
MNN_PRINT("[NNAPI] using device [%d : %s : %d].\n", i, name, type);
break;
}
}
MNN_PRINT("[NNAPI] using device [%d : %s].\n", selectDeviceIdx, mNNAPIDevices[selectDeviceIdx].name);
CHECK(ANeuralNetworksCompilation_createForDevices_29, mNNAPIModel, &mNNAPIDevices[selectDeviceIdx].device, 1, &mNNAPICompilation);
CHECK(ANeuralNetworksCompilation_setPreference_27, mNNAPICompilation, ANEURALNETWORKS_PREFER_SUSTAINED_SPEED);
CHECK(ANeuralNetworksCompilation_finish_27, mNNAPICompilation);
CHECK(ANeuralNetworksBurst_create_29, mNNAPICompilation, &mNNAPIBurst);
}
void NNAPIBackend::invokeModel() const {
// #define NNAPI_PROFILE
ANeuralNetworksExecution *execution;
CHECK(ANeuralNetworksExecution_create_27, mNNAPICompilation, &execution);
#ifdef NNAPI_PROFILE
CHECK(ANeuralNetworksExecution_setMeasureTiming, execution, true);
#endif
for (int i = 0; i < mInputTensors.size(); i++) {
const void* data = mInputContentTensors[i]->host<void>();
size_t size = mInputContentTensors[i]->size();
CHECK(ANeuralNetworksExecution_setInput_27, execution, i, nullptr, data, size);
}
for (int i = 0; i < mOutputTensors.size(); i++) {
void* data = mOutputContentTensors[i]->host<void>();
size_t size = mOutputContentTensors[i]->size();
CHECK(ANeuralNetworksExecution_setOutput_27, execution, i, nullptr, data, size);
}
#if 0
ANeuralNetworksEvent *event = nullptr;
CHECK(ANeuralNetworksExecution_startCompute, execution, &event);
CHECK(ANeuralNetworksEvent_wait, event);
ANeuralNetworksEvent_free(event);
#else
CHECK(ANeuralNetworksExecution_compute_29, execution);
// CHECK(ANeuralNetworksExecution_burstCompute_29, execution, mNNAPIBurst);
#endif
#ifdef NNAPI_PROFILE
uint64_t duration;
CHECK(ANeuralNetworksExecution_getDuration, execution, ANEURALNETWORKS_DURATION_IN_DRIVER, &duration);
if (duration != UINT64_MAX) MNN_PRINT("[NNAPI] driver time : %f ms\n", duration / 1000000.0);
CHECK(ANeuralNetworksExecution_getDuration, execution, ANEURALNETWORKS_DURATION_ON_HARDWARE, &duration);
if (duration != UINT64_MAX) MNN_PRINT("[NNAPI] hardware time : %f ms\n", duration / 1000000.0);
#endif
ANeuralNetworksExecution_free_27(execution);
}
NNAPIRuntime::NNAPIRuntime(const Backend::Info& info) {
mInfo = info;
BackendConfig::PrecisionMode precision = BackendConfig::Precision_Normal;
BackendConfig::PowerMode power = BackendConfig::Power_Normal;
if (nullptr != mInfo.user) {
precision = mInfo.user->precision;
power = mInfo.user->power;
}
mPrecision = precision;
}
NNAPIRuntime::~NNAPIRuntime() {}
Backend* NNAPIRuntime::onCreate(const BackendConfig* config) const {
return new NNAPIBackend(this);
}
void NNAPIRuntime::onGabageCollect(int level) {
// nothing now
}
NNAPIRuntime::CompilerType NNAPIRuntime::onGetCompilerType() const {
return Compiler_Geometry;
}
struct NNAPIBackendCreator : RuntimeCreator {
virtual Runtime* onCreate(const Backend::Info& info) const override {
return new NNAPIRuntime(info);
}
virtual bool onValid(Backend::Info& info) const override {
return true;
}
};
void registerNNAPIRuntimeCreator() {
if (!loadNNAPISymbol()) {
return;
}
registerNNAPIOps();
MNNInsertExtraRuntimeCreator(MNN_FORWARD_NN, new NNAPIBackendCreator, true);
}
}
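For reference, the raw NNAPI call sequence that buildModel() and invokeModel() wrap looks roughly like the self-contained sketch below, written directly against <android/NeuralNetworks.h> (the backend resolves the same entry points through NNAPISymbol instead). It builds a one-operation model that adds two 4-element float vectors; error handling is omitted and a device at API level 29 or newer is assumed.

#include <android/NeuralNetworks.h>

// Adds two 4-element float vectors with a single ANEURALNETWORKS_ADD op.
static int addTwoVectors(const float a[4], const float b[4], float out[4]) {
    ANeuralNetworksModel* model = nullptr;
    ANeuralNetworksModel_create(&model);

    uint32_t dims[1] = {4};
    ANeuralNetworksOperandType tensorType{ANEURALNETWORKS_TENSOR_FLOAT32, 1, dims, 0.0f, 0};
    ANeuralNetworksOperandType scalarType{ANEURALNETWORKS_INT32, 0, nullptr, 0.0f, 0};

    ANeuralNetworksModel_addOperand(model, &tensorType);   // operand 0: a
    ANeuralNetworksModel_addOperand(model, &tensorType);   // operand 1: b
    ANeuralNetworksModel_addOperand(model, &scalarType);   // operand 2: fuse code
    ANeuralNetworksModel_addOperand(model, &tensorType);   // operand 3: result
    int32_t fuse = ANEURALNETWORKS_FUSED_NONE;
    ANeuralNetworksModel_setOperandValue(model, 2, &fuse, sizeof(fuse));

    uint32_t opIn[3] = {0, 1, 2}, opOut[1] = {3};
    ANeuralNetworksModel_addOperation(model, ANEURALNETWORKS_ADD, 3, opIn, 1, opOut);
    uint32_t modelIn[2] = {0, 1}, modelOut[1] = {3};
    ANeuralNetworksModel_identifyInputsAndOutputs(model, 2, modelIn, 1, modelOut);
    ANeuralNetworksModel_finish(model);

    ANeuralNetworksCompilation* compilation = nullptr;
    ANeuralNetworksCompilation_create(model, &compilation);
    ANeuralNetworksCompilation_setPreference(compilation, ANEURALNETWORKS_PREFER_SUSTAINED_SPEED);
    ANeuralNetworksCompilation_finish(compilation);

    ANeuralNetworksExecution* execution = nullptr;
    ANeuralNetworksExecution_create(compilation, &execution);
    ANeuralNetworksExecution_setInput(execution, 0, nullptr, a, 4 * sizeof(float));
    ANeuralNetworksExecution_setInput(execution, 1, nullptr, b, 4 * sizeof(float));
    ANeuralNetworksExecution_setOutput(execution, 0, nullptr, out, 4 * sizeof(float));
    int status = ANeuralNetworksExecution_compute(execution);  // synchronous, API 29+

    ANeuralNetworksExecution_free(execution);
    ANeuralNetworksCompilation_free(compilation);
    ANeuralNetworksModel_free(model);
    return status;
}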

View File

@ -0,0 +1,134 @@
//
// NNAPIBackend.hpp
// MNN
//
// Created by MNN on 2022/09/05.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_NNAPIBACKEND_H
#define MNN_NNAPIBACKEND_H
#include <stdio.h>
#include <map>
#include <memory>
#include <core/Backend.hpp>
#include <core/Execution.hpp>
#include <core/TensorUtils.hpp>
#include "MNN_generated.h"
#include "NNAPIDefine.hpp"
#include "NNAPISymbol.hpp"
namespace MNN {
class NNAPIRuntime : public Runtime {
public:
NNAPIRuntime(const Backend::Info& info);
virtual ~NNAPIRuntime();
virtual CompilerType onGetCompilerType() const override;
virtual Backend* onCreate(const BackendConfig* conf) const override;
virtual void onGabageCollect(int level) override;
virtual std::pair<const void*, size_t> onGetCache() override {
return std::make_pair(mCacheBuffer, mCacheSize);
}
private:
Backend::Info mInfo;
BackendConfig::PrecisionMode mPrecision;
const void* mCacheBuffer = nullptr;
size_t mCacheSize = 0;
friend class NNAPIBackend;
};
class NNAPIBackend : public Backend {
public:
NNAPIBackend(const NNAPIRuntime* runtime);
virtual ~NNAPIBackend();
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op) override;
virtual void onExecuteBegin() const override;
virtual void onExecuteEnd() const override;
virtual Backend::MemObj* onAcquire(const Tensor* tensor, StorageType storageType) override;
virtual bool onClearBuffer() override;
virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const override;
virtual void onResizeBegin() override;
virtual void onResizeEnd() override;
public:
class Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const = 0;
};
static bool addCreator(OpType t, Creator* c);
// NNAPI functions
bool NCHW() { return mNCHW; }
uint32_t getTensorIdx(const Tensor* t);
uint32_t buildScalar(int scalar);
uint32_t buildScalar(bool scalar);
uint32_t buildScalar(float scalar);
uint32_t buildOperand(const void* data, size_t size, OperandCode code, std::vector<uint32_t> dims = {});
ErrorCode buildOperation(int op, const std::vector<uint32_t> &inputs, const std::vector<uint32_t> &outputs, const char* name = nullptr);
void buildModel();
void invokeModel() const;
private:
bool mNCHW = false;
std::vector<std::string> mModelName;
const NNAPIRuntime* mNPURuntime;
BackendConfig::PrecisionMode mPrecision;
std::vector<std::unique_ptr<Tensor>> mInputContentTensors, mOutputContentTensors;
std::vector<const Tensor*> mInputTensors, mOutputTensors;
// tensor idx map
std::map<const Tensor*, uint32_t> mTensorIdxMap, mInputIdxMap, mOutputIdxMap;
uint32_t mTensorIdx = 0;
std::vector<const char*> mOpNames;
// scalar idx map
std::map<int, uint32_t> mScalarIntMap;
std::map<bool, uint32_t> mScalarBoolMap;
std::map<float, uint32_t> mScalarFloatMap;
// NNAPI resource
struct NNAPIDevice {
ANeuralNetworksDevice* device;
const char* name;
int32_t type;
};
std::vector<NNAPIDevice> mNNAPIDevices;
ANeuralNetworksModel *mNNAPIModel = nullptr;
ANeuralNetworksCompilation *mNNAPICompilation = nullptr;
ANeuralNetworksBurst* mNNAPIBurst = NULL;
};
template <class T>
class NNAPICreatorRegister {
public:
NNAPICreatorRegister(OpType type) {
T *t = new T;
NNAPIBackend::addCreator(type, t);
}
~NNAPICreatorRegister() = default;
};
template <typename T>
class TypedCreator : public NNAPIBackend::Creator {
public:
virtual ~TypedCreator() = default;
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, const MNN::Op *op,
Backend *backend) const override {
auto newOp = new T(backend, op, inputs, outputs);
return newOp;
}
};
#define REGISTER_NNAPI_OP_CREATOR(name, opType) \
void ___##name##__##opType##__() { \
static TypedCreator<name> _temp;\
NNAPIBackend::addCreator(opType, &_temp); \
}
}
#endif //MNN_NNAPIBACKEND_H
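With the runtime registered under MNN_FORWARD_NN, an application opts into the NNAPI backend through the usual session configuration. A minimal sketch assuming MNN's public Interpreter API; "mobilenet.mnn" is a placeholder model path, and anything the NNAPI path cannot handle falls back to the backupType backend.

#include <MNN/Interpreter.hpp>
#include <MNN/MNNForwardType.h>
#include <memory>

int main() {
    // "mobilenet.mnn" is a placeholder model path.
    std::shared_ptr<MNN::Interpreter> net(MNN::Interpreter::createFromFile("mobilenet.mnn"));
    MNN::ScheduleConfig config;
    config.type       = MNN_FORWARD_NN;   // the NNAPI runtime registered above
    config.backupType = MNN_FORWARD_CPU;  // fallback when NNAPI can't take the model
    auto session = net->createSession(config);
    auto input   = net->getSessionInput(session, nullptr);
    // fill the input, e.g. with Tensor::copyFromHostTensor(...)
    net->runSession(session);
    auto output = net->getSessionOutput(session, nullptr);
    (void)output;
    return 0;
}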

View File

@ -0,0 +1,23 @@
//
// NNAPIDefine.hpp
// MNN
//
// Created by MNN on 2022/09/05.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef NNAPIDefine_h
#define NNAPIDefine_h
#ifdef MNN_NNAPI_ENABLED
#ifdef __ANDROID__
#include "NNAPINeuralNetworks.h"
#define ANDROID_API_LEVEL (android_get_device_api_level())
#else
#undef MNN_NNAPI_ENABLED
#define MNN_NNAPI_ENABLED 0
#define ANDROID_API_LEVEL (0)
#endif
#endif
#endif /* NNAPIDefine_h */

File diff suppressed because it is too large

View File

@ -0,0 +1,18 @@
// This file is generated by Shell for ops register
namespace MNN {
extern void ___NNAPIUnary__OpType_UnaryOp__();
extern void ___NNAPISoftmax__OpType_Softmax__();
extern void ___NNAPIConvolution__OpType_Convolution__();
extern void ___NNAPIConvolution__OpType_ConvolutionDepthwise__();
extern void ___NNAPIBinary__OpType_BinaryOp__();
extern void ___NNAPIPool__OpType_Pooling__();
void registerNNAPIOps() {
___NNAPIUnary__OpType_UnaryOp__();
___NNAPISoftmax__OpType_Softmax__();
___NNAPIConvolution__OpType_Convolution__();
___NNAPIConvolution__OpType_ConvolutionDepthwise__();
___NNAPIBinary__OpType_BinaryOp__();
___NNAPIPool__OpType_Pooling__();
}
}

View File

@ -0,0 +1,103 @@
//
// NNAPISymbol.cpp
// MNN
//
// Created by MNN on 2022/09/19.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "NNAPISymbol.hpp"
#include "NNAPIDefine.hpp"
#include <MNN/MNNDefine.h>
#include <dlfcn.h>
namespace MNN {
#define LOAD_SYM(NAME, API_LEVEL) \
NAME ## _ ## API_LEVEL = reinterpret_cast<decltype(NAME ## _ ## API_LEVEL)>(dlsym(lib, #NAME)); \
if (NAME ## _ ## API_LEVEL == nullptr) { \
MNN_PRINT("[NNAPI] Load symbol %s failed.", #NAME); \
return false; \
}
bool loadNNAPISymbol() {
if (ANDROID_API_LEVEL < 29) {
return false;
}
void *lib = dlopen("libneuralnetworks.so", RTLD_NOW | RTLD_LOCAL);
if (lib == nullptr) {
return false;
}
LOAD_SYM(ANeuralNetworksModel_getSupportedOperationsForDevices, 29);
LOAD_SYM(ANeuralNetworks_getDeviceCount, 29);
LOAD_SYM(ANeuralNetworks_getDevice, 29);
LOAD_SYM(ANeuralNetworksDevice_getName, 29);
LOAD_SYM(ANeuralNetworksDevice_getType, 29);
LOAD_SYM(ANeuralNetworksCompilation_createForDevices, 29);
LOAD_SYM(ANeuralNetworksExecution_compute, 29);
LOAD_SYM(ANeuralNetworksBurst_create, 29);
LOAD_SYM(ANeuralNetworksBurst_free, 29);
LOAD_SYM(ANeuralNetworksExecution_burstCompute, 29);
LOAD_SYM(ANeuralNetworksModel_create, 27);
LOAD_SYM(ANeuralNetworksModel_free, 27);
LOAD_SYM(ANeuralNetworksModel_finish, 27);
LOAD_SYM(ANeuralNetworksModel_addOperand, 27);
LOAD_SYM(ANeuralNetworksModel_setOperandValue, 27);
LOAD_SYM(ANeuralNetworksModel_setOperandSymmPerChannelQuantParams, 29);
LOAD_SYM(ANeuralNetworksModel_addOperation, 27);
LOAD_SYM(ANeuralNetworksModel_identifyInputsAndOutputs, 27);
LOAD_SYM(ANeuralNetworksCompilation_create, 27);
LOAD_SYM(ANeuralNetworksCompilation_free, 27);
LOAD_SYM(ANeuralNetworksCompilation_setPreference, 27);
LOAD_SYM(ANeuralNetworksCompilation_finish, 27);
LOAD_SYM(ANeuralNetworksExecution_create, 27);
LOAD_SYM(ANeuralNetworksExecution_free, 27);
LOAD_SYM(ANeuralNetworksExecution_setInput, 27);
LOAD_SYM(ANeuralNetworksExecution_setInputFromMemory, 27);
LOAD_SYM(ANeuralNetworksExecution_setOutput, 27);
LOAD_SYM(ANeuralNetworksExecution_setOutputFromMemory, 27);
LOAD_SYM(ANeuralNetworksExecution_startCompute, 27);
LOAD_SYM(ANeuralNetworksEvent_wait, 27);
LOAD_SYM(ANeuralNetworksEvent_free, 27);
LOAD_SYM(ANeuralNetworksDevice_getVersion, 29);
LOAD_SYM(ANeuralNetworksMemory_createFromAHardwareBuffer, 29);
LOAD_SYM(ANeuralNetworksMemory_createFromFd, 27);
LOAD_SYM(ANeuralNetworksMemory_free, 27);
return true;
}
MNN_ANeuralNetworksModel_getSupportedOperationsForDevices *ANeuralNetworksModel_getSupportedOperationsForDevices_29 = nullptr;
MNN_ANeuralNetworks_getDeviceCount *ANeuralNetworks_getDeviceCount_29 = nullptr;
MNN_ANeuralNetworks_getDevice *ANeuralNetworks_getDevice_29 = nullptr;
MNN_ANeuralNetworksDevice_getName *ANeuralNetworksDevice_getName_29 = nullptr;
MNN_ANeuralNetworksDevice_getType *ANeuralNetworksDevice_getType_29 = nullptr;
MNN_ANeuralNetworksCompilation_createForDevices *ANeuralNetworksCompilation_createForDevices_29 = nullptr;
MNN_ANeuralNetworksExecution_compute *ANeuralNetworksExecution_compute_29 = nullptr;
MNN_ANeuralNetworksBurst_create *ANeuralNetworksBurst_create_29 = nullptr;
MNN_ANeuralNetworksBurst_free *ANeuralNetworksBurst_free_29 = nullptr;
MNN_ANeuralNetworksExecution_burstCompute *ANeuralNetworksExecution_burstCompute_29 = nullptr;
MNN_ANeuralNetworksModel_create *ANeuralNetworksModel_create_27 = nullptr;
MNN_ANeuralNetworksModel_finish *ANeuralNetworksModel_finish_27 = nullptr;
MNN_ANeuralNetworksModel_free *ANeuralNetworksModel_free_27 = nullptr;
MNN_ANeuralNetworksModel_addOperand *ANeuralNetworksModel_addOperand_27 = nullptr;
MNN_ANeuralNetworksModel_setOperandValue *ANeuralNetworksModel_setOperandValue_27 = nullptr;
MNN_ANeuralNetworksModel_setOperandSymmPerChannelQuantParams *ANeuralNetworksModel_setOperandSymmPerChannelQuantParams_29 = nullptr;
MNN_ANeuralNetworksModel_addOperation *ANeuralNetworksModel_addOperation_27 = nullptr;
MNN_ANeuralNetworksModel_identifyInputsAndOutputs *ANeuralNetworksModel_identifyInputsAndOutputs_27 = nullptr;
MNN_ANeuralNetworksCompilation_create *ANeuralNetworksCompilation_create_27 = nullptr;
MNN_ANeuralNetworksCompilation_free *ANeuralNetworksCompilation_free_27 = nullptr;
MNN_ANeuralNetworksCompilation_setPreference *ANeuralNetworksCompilation_setPreference_27 = nullptr;
MNN_ANeuralNetworksCompilation_finish *ANeuralNetworksCompilation_finish_27 = nullptr;
MNN_ANeuralNetworksExecution_create *ANeuralNetworksExecution_create_27 = nullptr;
MNN_ANeuralNetworksExecution_free *ANeuralNetworksExecution_free_27 = nullptr;
MNN_ANeuralNetworksExecution_setInput *ANeuralNetworksExecution_setInput_27 = nullptr;
MNN_ANeuralNetworksExecution_setInputFromMemory *ANeuralNetworksExecution_setInputFromMemory_27 = nullptr;
MNN_ANeuralNetworksExecution_setOutput *ANeuralNetworksExecution_setOutput_27 = nullptr;
MNN_ANeuralNetworksExecution_setOutputFromMemory *ANeuralNetworksExecution_setOutputFromMemory_27 = nullptr;
MNN_ANeuralNetworksExecution_startCompute *ANeuralNetworksExecution_startCompute_27 = nullptr;
MNN_ANeuralNetworksEvent_wait *ANeuralNetworksEvent_wait_27 = nullptr;
MNN_ANeuralNetworksEvent_free *ANeuralNetworksEvent_free_27 = nullptr;
MNN_ANeuralNetworksDevice_getVersion *ANeuralNetworksDevice_getVersion_29 = nullptr;
MNN_ANeuralNetworksMemory_createFromAHardwareBuffer *ANeuralNetworksMemory_createFromAHardwareBuffer_29 = nullptr;
MNN_ANeuralNetworksMemory_createFromFd *ANeuralNetworksMemory_createFromFd_27 = nullptr;
MNN_ANeuralNetworksMemory_free *ANeuralNetworksMemory_free_27 = nullptr;
}
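The LOAD_SYM macro boils down to a dlopen/dlsym lookup per entry point; the _27/_29 suffixes record the minimum Android API level that ships each symbol. A standalone sketch for a single function (the typedef matches the header above):

#include <dlfcn.h>
#include <cstdint>
#include <cstdio>

typedef int (MNN_ANeuralNetworks_getDeviceCount)(uint32_t* numDevices);

static MNN_ANeuralNetworks_getDeviceCount* resolveGetDeviceCount() {
    void* lib = dlopen("libneuralnetworks.so", RTLD_NOW | RTLD_LOCAL);
    if (lib == nullptr) {
        return nullptr;                 // NNAPI library not present on this image
    }
    auto fn = reinterpret_cast<MNN_ANeuralNetworks_getDeviceCount*>(
        dlsym(lib, "ANeuralNetworks_getDeviceCount"));
    if (fn == nullptr) {
        printf("[NNAPI] ANeuralNetworks_getDeviceCount not found\n");
    }
    return fn;
}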

View File

@ -0,0 +1,89 @@
//
// NNAPISymbol.hpp
// MNN
//
// Created by MNN on 2022/09/19.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef NNAPISymbols_h
#define NNAPISymbols_h
#include "NNAPIDefine.hpp"
namespace MNN {
// typedef the function in nnapi will be used
typedef int (MNN_ANeuralNetworksModel_getSupportedOperationsForDevices)(const ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices, uint32_t numDevices, bool* supportedOps);
typedef int (MNN_ANeuralNetworks_getDeviceCount)(uint32_t* numDevices);
typedef int (MNN_ANeuralNetworks_getDevice)(uint32_t devIndex, ANeuralNetworksDevice** device);
typedef int (MNN_ANeuralNetworksDevice_getName)(const ANeuralNetworksDevice* device, const char** name);
typedef int (MNN_ANeuralNetworksDevice_getType)(const ANeuralNetworksDevice* device, int32_t* type);
typedef int (MNN_ANeuralNetworksCompilation_createForDevices)(ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices, uint32_t numDevices, ANeuralNetworksCompilation** compilation);
typedef int (MNN_ANeuralNetworksExecution_compute)(ANeuralNetworksExecution* execution);
typedef int (MNN_ANeuralNetworksBurst_create)(ANeuralNetworksCompilation* compilation, ANeuralNetworksBurst** burst);
typedef void (MNN_ANeuralNetworksBurst_free)(ANeuralNetworksBurst* burst);
typedef int (MNN_ANeuralNetworksExecution_burstCompute)(ANeuralNetworksExecution* execution, ANeuralNetworksBurst* burst);
typedef int (MNN_ANeuralNetworksModel_create)(ANeuralNetworksModel** model);
typedef void (MNN_ANeuralNetworksModel_free)(ANeuralNetworksModel* model);
typedef int (MNN_ANeuralNetworksModel_finish)(ANeuralNetworksModel* model);
typedef int (MNN_ANeuralNetworksModel_addOperand)(ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type);
typedef int (MNN_ANeuralNetworksModel_setOperandValue)(ANeuralNetworksModel* model, int32_t index, const void* buffer, size_t length);
typedef int (MNN_ANeuralNetworksModel_setOperandSymmPerChannelQuantParams)(ANeuralNetworksModel* model, int32_t index, const ANeuralNetworksSymmPerChannelQuantParams* channelQuant);
typedef int (MNN_ANeuralNetworksModel_addOperation)(ANeuralNetworksModel* model, ANeuralNetworksOperationType type, uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, const uint32_t* outputs);
typedef int (MNN_ANeuralNetworksModel_identifyInputsAndOutputs)(ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, const uint32_t* outputs);
typedef int (MNN_ANeuralNetworksCompilation_create)(ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation);
typedef void (MNN_ANeuralNetworksCompilation_free)(ANeuralNetworksCompilation* compilation);
typedef int (MNN_ANeuralNetworksCompilation_setPreference)(ANeuralNetworksCompilation* compilation, int32_t preference);
typedef int (MNN_ANeuralNetworksCompilation_finish)(ANeuralNetworksCompilation* compilation);
typedef int (MNN_ANeuralNetworksExecution_create)(ANeuralNetworksCompilation* compilation, ANeuralNetworksExecution** execution);
typedef void (MNN_ANeuralNetworksExecution_free)(ANeuralNetworksExecution* execution);
typedef int (MNN_ANeuralNetworksExecution_setInput)(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const void* buffer, size_t length);
typedef int (MNN_ANeuralNetworksExecution_setInputFromMemory)(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, size_t offset, size_t length);
typedef int (MNN_ANeuralNetworksExecution_setOutput)(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, void* buffer, size_t length);
typedef int (MNN_ANeuralNetworksExecution_setOutputFromMemory)(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, size_t offset, size_t length);
typedef int (MNN_ANeuralNetworksExecution_startCompute)(ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event);
typedef int (MNN_ANeuralNetworksEvent_wait)(ANeuralNetworksEvent* event);
typedef void (MNN_ANeuralNetworksEvent_free)(ANeuralNetworksEvent* event);
typedef int (MNN_ANeuralNetworksDevice_getVersion)(const ANeuralNetworksDevice* device, const char** version);
typedef int (MNN_ANeuralNetworksMemory_createFromAHardwareBuffer)(const AHardwareBuffer* ahwb, ANeuralNetworksMemory** memory);
typedef int (MNN_ANeuralNetworksMemory_createFromFd)(size_t size, int protect, int fd, size_t offset, ANeuralNetworksMemory **memory);
typedef void (MNN_ANeuralNetworksMemory_free)(ANeuralNetworksMemory* memory);
// symbols
bool loadNNAPISymbol();
extern MNN_ANeuralNetworksModel_getSupportedOperationsForDevices *ANeuralNetworksModel_getSupportedOperationsForDevices_29;
extern MNN_ANeuralNetworks_getDeviceCount *ANeuralNetworks_getDeviceCount_29;
extern MNN_ANeuralNetworks_getDevice *ANeuralNetworks_getDevice_29;
extern MNN_ANeuralNetworksDevice_getName *ANeuralNetworksDevice_getName_29;
extern MNN_ANeuralNetworksDevice_getType *ANeuralNetworksDevice_getType_29;
extern MNN_ANeuralNetworksCompilation_createForDevices *ANeuralNetworksCompilation_createForDevices_29;
extern MNN_ANeuralNetworksExecution_compute *ANeuralNetworksExecution_compute_29;
extern MNN_ANeuralNetworksBurst_create *ANeuralNetworksBurst_create_29;
extern MNN_ANeuralNetworksBurst_free *ANeuralNetworksBurst_free_29;
extern MNN_ANeuralNetworksExecution_burstCompute *ANeuralNetworksExecution_burstCompute_29;
extern MNN_ANeuralNetworksModel_create *ANeuralNetworksModel_create_27;
extern MNN_ANeuralNetworksModel_free *ANeuralNetworksModel_free_27;
extern MNN_ANeuralNetworksModel_finish *ANeuralNetworksModel_finish_27;
extern MNN_ANeuralNetworksModel_addOperand *ANeuralNetworksModel_addOperand_27;
extern MNN_ANeuralNetworksModel_setOperandValue *ANeuralNetworksModel_setOperandValue_27;
extern MNN_ANeuralNetworksModel_setOperandSymmPerChannelQuantParams *ANeuralNetworksModel_setOperandSymmPerChannelQuantParams_29;
extern MNN_ANeuralNetworksModel_addOperation *ANeuralNetworksModel_addOperation_27;
extern MNN_ANeuralNetworksModel_identifyInputsAndOutputs *ANeuralNetworksModel_identifyInputsAndOutputs_27;
extern MNN_ANeuralNetworksCompilation_create *ANeuralNetworksCompilation_create_27;
extern MNN_ANeuralNetworksCompilation_free *ANeuralNetworksCompilation_free_27;
extern MNN_ANeuralNetworksCompilation_setPreference *ANeuralNetworksCompilation_setPreference_27;
extern MNN_ANeuralNetworksCompilation_finish *ANeuralNetworksCompilation_finish_27;
extern MNN_ANeuralNetworksExecution_create *ANeuralNetworksExecution_create_27;
extern MNN_ANeuralNetworksExecution_free *ANeuralNetworksExecution_free_27;
extern MNN_ANeuralNetworksExecution_setInput *ANeuralNetworksExecution_setInput_27;
extern MNN_ANeuralNetworksExecution_setInputFromMemory *ANeuralNetworksExecution_setInputFromMemory_27;
extern MNN_ANeuralNetworksExecution_setOutput *ANeuralNetworksExecution_setOutput_27;
extern MNN_ANeuralNetworksExecution_setOutputFromMemory *ANeuralNetworksExecution_setOutputFromMemory_27;
extern MNN_ANeuralNetworksExecution_startCompute *ANeuralNetworksExecution_startCompute_27;
extern MNN_ANeuralNetworksEvent_wait *ANeuralNetworksEvent_wait_27;
extern MNN_ANeuralNetworksEvent_free *ANeuralNetworksEvent_free_27;
extern MNN_ANeuralNetworksDevice_getVersion *ANeuralNetworksDevice_getVersion_29;
extern MNN_ANeuralNetworksMemory_createFromAHardwareBuffer *ANeuralNetworksMemory_createFromAHardwareBuffer_29;
extern MNN_ANeuralNetworksMemory_createFromFd *ANeuralNetworksMemory_createFromFd_27;
extern MNN_ANeuralNetworksMemory_free *ANeuralNetworksMemory_free_27;
}
#endif /* NNAPISymbols_h */

View File

@ -0,0 +1,37 @@
//
// NNAPIBinary.cpp
// MNN
//
// Created by MNN on 2022/09/05.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "NNAPIBinary.hpp"
namespace MNN {
NNAPIBinary::NNAPIBinary(MNN::Backend *b, const MNN::Op *op, const std::vector<Tensor *> &inputs, const std::vector<MNN::Tensor *> &outputs) : NNAPICommonExecution(b, op) {
}
ErrorCode NNAPIBinary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(inputs.size() == 2 && outputs.size() == 1);
std::map<BinaryOpOperation, int> binary_map {
{BinaryOpOperation_ADD, ANEURALNETWORKS_ADD},
{BinaryOpOperation_SUB, ANEURALNETWORKS_SUB},
{BinaryOpOperation_MUL, ANEURALNETWORKS_MUL},
{BinaryOpOperation_DIV, ANEURALNETWORKS_DIV}
};
auto opType = static_cast<BinaryOpOperation>(mOp->main_as_BinaryOp()->opType());
auto iter = binary_map.find(opType);
if (iter == binary_map.end() || iter->second < 0) {
MNN_ERROR("[NNAPI] Binary not support %s\n", MNN::EnumNameBinaryOpOperation(opType));
return NOT_SUPPORT;
}
auto inputIdxs = getTensorIdxs(inputs);
inputIdxs.push_back(buildScalar(ANEURALNETWORKS_FUSED_NONE));
return buildOperation(iter->second, inputIdxs, getTensorIdxs(outputs));
}
REGISTER_NNAPI_OP_CREATOR(NNAPIBinary, OpType_BinaryOp)
} // namespace MNN
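Adding another op follows the same shape as NNAPIBinary: subclass NNAPICommonExecution, translate the MNN op inside onResize, and register the creator. The NNAPIRelu class below is a hypothetical sketch only (it is not part of this commit, and OpType_ReLU is assumed to be the matching MNN op type); the generated register file above would also need the corresponding extern declaration and call.

class NNAPIRelu : public NNAPICommonExecution {
public:
    NNAPIRelu(Backend *b, const Op *op, const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs)
        : NNAPICommonExecution(b, op) {}
    virtual ~NNAPIRelu() = default;
    ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
        MNN_ASSERT(inputs.size() == 1 && outputs.size() == 1);
        // ANEURALNETWORKS_RELU takes one tensor input and produces one output.
        return buildOperation(ANEURALNETWORKS_RELU, getTensorIdxs(inputs), getTensorIdxs(outputs));
    }
};
REGISTER_NNAPI_OP_CREATOR(NNAPIRelu, OpType_ReLU)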

View File

@ -0,0 +1,25 @@
//
// NNAPIBinary.hpp
// MNN
//
// Created by MNN on 2022/09/05.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_NNAPIBINARY_HPP
#define MNN_NNAPIBINARY_HPP
#include "NNAPIBackend.hpp"
#include "NNAPICommonExecution.hpp"
namespace MNN {
class NNAPIBinary : public NNAPICommonExecution {
public:
NNAPIBinary(Backend *b, const Op *op, const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
virtual ~NNAPIBinary() = default;
};
} // namespace MNN
#endif // MNN_NNAPIBINARY_HPP

View File

@ -0,0 +1,53 @@
//
// NNAPICommonExecution.cpp
// MNN
//
// Created by MNN on 2022/09/05.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "NNAPICommonExecution.hpp"
namespace MNN {
NNAPICommonExecution::NNAPICommonExecution(Backend *backend, const Op *op) : Execution(backend), mOp(op) {
mNNAPIBackend = (NNAPIBackend*)backend;
mNCHW = mNNAPIBackend->NCHW();
}
ErrorCode NNAPICommonExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
return NO_ERROR;
}
ErrorCode NNAPICommonExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
return NO_ERROR;
}
std::vector<uint32_t> NNAPICommonExecution::getTensorIdxs(const std::vector<Tensor*>& tensors) {
std::vector<uint32_t> idxs(tensors.size());
for (int i = 0; i < tensors.size(); i++) {
idxs[i] = mNNAPIBackend->getTensorIdx(tensors[i]);
}
return idxs;
}
uint32_t NNAPICommonExecution::buildConstant(const void* data, size_t size, OperandCode dtype, std::vector<uint32_t> dims) {
return mNNAPIBackend->buildOperand(data, size, dtype, dims);
}
uint32_t NNAPICommonExecution::buildTensor(OperandCode dtype, std::vector<int> dims) {
std::vector<uint32_t> udims(dims.begin(), dims.end());
    if (!mNCHW && dims.size() == 4) {
// NCHW -> NHWC
udims[0] = dims[0];
udims[1] = dims[2];
udims[2] = dims[3];
udims[3] = dims[1];
}
return mNNAPIBackend->buildOperand(nullptr, 0, dtype, udims);
}
ErrorCode NNAPICommonExecution::buildOperation(int op, const std::vector<uint32_t> &inputs, const std::vector<uint32_t> &outputs) {
auto name = mOp->name() ? mOp->name()->c_str() : EnumNameOpType(mOp->type());
return mNNAPIBackend->buildOperation(op, inputs, outputs, name);
}
}; // namespace MNN

View File

@ -0,0 +1,36 @@
//
// NNAPICommonExecution.hpp
// MNN
//
// Created by MNN on 2022/09/05.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_NNAPICOMMONEXECUTION_HPP
#define MNN_NNAPICOMMONEXECUTION_HPP
#include "core/Execution.hpp"
#include "NNAPIBackend.hpp"
#include <memory>
namespace MNN {
class NNAPICommonExecution : public Execution {
public:
NNAPICommonExecution(Backend *backend, const Op *op);
virtual ~NNAPICommonExecution() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
protected:
bool mNCHW;
std::vector<uint32_t> getTensorIdxs(const std::vector<Tensor*>& tensors);
template <typename T> inline uint32_t buildScalar(T scalar) { return mNNAPIBackend->buildScalar(scalar); }
uint32_t buildConstant(const void* data, size_t size, OperandCode dtype, std::vector<uint32_t> dims = {});
uint32_t buildTensor(OperandCode dtype, std::vector<int> dims);
ErrorCode buildOperation(int op, const std::vector<uint32_t> &inputs, const std::vector<uint32_t> &outputs);
NNAPIBackend* mNNAPIBackend;
const Op* mOp;
};
} // namespace MNN
#endif // MNN_NNAPICOMMONEXECUTION_HPP

View File

@ -0,0 +1,138 @@
//
// NNAPIConvolution.cpp
// MNN
//
// Created by MNN on 2022/09/06.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "NNAPIConvolution.hpp"
namespace MNN {
NNAPIConvolution::NNAPIConvolution(MNN::Backend *b, const MNN::Op *op, const std::vector<Tensor *> &inputs, const std::vector<MNN::Tensor *> &outputs) : NNAPICommonExecution(b, op) {
isDepthwise = mOp->type() == OpType_ConvolutionDepthwise;
isDeconv = mOp->type() == OpType_Deconvolution;
}
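// Repacks an NCHW buffer as NHWC: e.g. for b = 1, c = 2, area = 3 the source
// {a0, a1, a2, b0, b1, b2} becomes {a0, b0, a1, b1, a2, b2}. Applied to conv
// weights this turns [outputCount, inputChannel, kh, kw] into the
// [outputCount, kh, kw, inputChannel] layout NNAPI expects.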
template<typename T>
static void NCHW2NHWC(const T* source, T* dest, int b, int c, int area) {
int sourceBatchsize = c * area;
int destBatchSize = sourceBatchsize;
for (int bi = 0; bi < b; ++bi) {
auto srcBatch = source + bi * sourceBatchsize;
auto dstBatch = dest + bi * destBatchSize;
for (int i = 0; i < area; ++i) {
auto srcArea = srcBatch + i;
auto dstArea = dstBatch + i * c;
for (int ci = 0; ci < c; ++ci) {
dstArea[ci] = srcArea[ci * area];
}
}
}
}
ErrorCode NNAPIConvolution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto conv2D = mOp->main_as_Convolution2D();
auto common = conv2D->common();
int kernelX = common->kernelX();
int kernelY = common->kernelY();
int strideX = common->strideX();
int strideY = common->strideY();
int dilateX = common->dilateX();
int dilateY = common->dilateY();
int group = common->group();
uint32_t outputCount = common->outputCount();
auto padMod = common->padMode();
bool relu = common->relu();
bool relu6 = common->relu6();
int top, left, bottom, right;
if (nullptr != common->pads()) {
MNN_ASSERT(common->pads()->size() >= 4);
top = common->pads()->Get(0);
left = common->pads()->Get(1);
bottom = common->pads()->Get(2);
right = common->pads()->Get(3);
} else {
top = common->padY();
left = common->padX();
bottom = common->padY();
right = common->padX();
}
// NNAPI inputs:
// conv2d: [input, weight, bias, pad_left, pad_right, pad_top, pad_bottom, stride_w, stride_h, fusecode, NCHW/NHWC, dilate_w, dilate_h]
// depthwise_conv2d: [input, weight, bias, pad_left, pad_right, pad_top, pad_bottom, stride_w, stride_h, multiplier, fusecode, NCHW/NHWC, dilate_w, dilate_h]
auto inputIdxs = getTensorIdxs(inputs);
// inputs not contain weight and bias, read from param
if (inputs.size() < 3) {
const void *weightPtr, *biasPtr;
int weightSize, biasSize;
if (nullptr != conv2D->quanParameter()) {
quanCommon = ConvolutionCommon::load(conv2D->quanParameter(), true);
            if (nullptr == quanCommon) {
                MNN_ERROR("Not enough memory to extract IDST Convolution: %s\n", mOp->name()->c_str());
                return NOT_SUPPORT;
            }
            if (quanCommon->weightFloat.get() == nullptr) {
                MNN_PRINT("[NNAPI] Convolution quantized weights can't be dequantized to float.\n");
            }
// Back to float
weightPtr = quanCommon->weightFloat.get();
weightSize = quanCommon->weightFloat.size();
} else {
weightPtr = conv2D->weight()->data();
weightSize = conv2D->weight()->size();
}
biasSize = conv2D->bias()->size();
biasPtr = conv2D->bias()->data();
uint32_t inputCount = weightSize / (kernelX * kernelY * outputCount);
uint32_t n = outputCount;
uint32_t c = inputCount;
uint32_t h = kernelY;
uint32_t w = kernelX;
if (isDepthwise) {
n = 1;
c = outputCount;
}
nhwcWeight.reset(new float[weightSize]);
std::vector<uint32_t> weightDims {n, h, w, c};
// [outputCount, inputChannel, h, w] -> [outputCount, h, w, inputChannel]
NCHW2NHWC<float>(reinterpret_cast<const float*>(weightPtr), nhwcWeight.get(), n, c, h * w);
std::vector<uint32_t> biasDims {outputCount};
inputIdxs.push_back(buildConstant(nhwcWeight.get(), weightSize * sizeof(float), ANEURALNETWORKS_TENSOR_FLOAT32, weightDims));
inputIdxs.push_back(buildConstant(biasPtr, biasSize * sizeof(float), ANEURALNETWORKS_TENSOR_FLOAT32, biasDims));
}
// pad
inputIdxs.push_back(buildScalar(left));
inputIdxs.push_back(buildScalar(right));
inputIdxs.push_back(buildScalar(top));
inputIdxs.push_back(buildScalar(bottom));
// stride
inputIdxs.push_back(buildScalar(strideX));
inputIdxs.push_back(buildScalar(strideY));
if (isDepthwise) {
int multiplier = outputCount / group;
inputIdxs.push_back(buildScalar(multiplier));
}
// fusecode
FuseCode code = ANEURALNETWORKS_FUSED_NONE;
if (relu) code = ANEURALNETWORKS_FUSED_RELU;
if (relu6) code = ANEURALNETWORKS_FUSED_RELU6;
inputIdxs.push_back(buildScalar(code));
// NCHW/NHWC
inputIdxs.push_back(buildScalar(mNCHW));
// dilate
if (dilateX > 1 || dilateY > 1) {
inputIdxs.push_back(buildScalar(dilateX));
inputIdxs.push_back(buildScalar(dilateY));
}
auto op = ANEURALNETWORKS_CONV_2D;
if (mOp->type() == OpType_ConvolutionDepthwise) {
op = ANEURALNETWORKS_DEPTHWISE_CONV_2D;
} else {
// TODO: deconv
}
return buildOperation(op, inputIdxs, getTensorIdxs(outputs));
}
REGISTER_NNAPI_OP_CREATOR(NNAPIConvolution, OpType_Convolution)
REGISTER_NNAPI_OP_CREATOR(NNAPIConvolution, OpType_ConvolutionDepthwise)
} // namespace MNN

View File

@ -0,0 +1,30 @@
//
// NNAPIConvolution.hpp
// MNN
//
// Created by MNN on 2022/09/06.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_NNAPICONVOLUTION_HPP
#define MNN_NNAPICONVOLUTION_HPP
#include "NNAPIBackend.hpp"
#include "NNAPICommonExecution.hpp"
#include "core/ConvolutionCommon.hpp"
namespace MNN {
class NNAPIConvolution : public NNAPICommonExecution {
public:
NNAPIConvolution(Backend *b, const Op *op, const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
virtual ~NNAPIConvolution() = default;
private:
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
std::unique_ptr<float[]> nhwcWeight;
bool isDepthwise = false, isDeconv = false;
};
} // namespace MNN
#endif // MNN_NNAPICONVOLUTION_HPP

View File

@ -0,0 +1,67 @@
//
// NNAPIPool.cpp
// MNN
//
// Created by MNN on 2022/09/06.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "NNAPIPool.hpp"
namespace MNN {
NNAPIPool::NNAPIPool(MNN::Backend *b, const MNN::Op *op, const std::vector<Tensor *> &inputs, const std::vector<MNN::Tensor *> &outputs) : NNAPICommonExecution(b, op) {
}
ErrorCode NNAPIPool::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto pool = mOp->main_as_Pool();
auto strideX = pool->strideX();
auto strideY = pool->strideY();
auto kernelX = pool->kernelX();
auto kernelY = pool->kernelY();
auto padMod = pool->padType();
auto global = pool->isGlobal();
int top, left, bottom, right;
if (nullptr != pool->pads()) {
MNN_ASSERT(pool->pads()->size() >= 4);
top = pool->pads()->Get(0);
left = pool->pads()->Get(1);
bottom = pool->pads()->Get(2);
right = pool->pads()->Get(3);
} else {
top = pool->padY();
left = pool->padX();
bottom = pool->padY();
right = pool->padX();
}
if (global) {
kernelX = inputs[0]->width();
kernelY = inputs[0]->height();
}
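    // NNAPI has no dedicated global-pooling op, so global pooling is lowered to a
    // regular pool whose kernel covers the whole spatial extent of the input.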
// NNAPI Pool inputs: [input, pad_left, pad_right, pad_top, pad_bottom, stride_w, stride_h, kernel_w, kernel_h, fusecode, NCHW/NHWC]
auto inputIdxs = getTensorIdxs(inputs);
// pad
inputIdxs.push_back(buildScalar(left));
inputIdxs.push_back(buildScalar(right));
inputIdxs.push_back(buildScalar(top));
inputIdxs.push_back(buildScalar(bottom));
// stride
inputIdxs.push_back(buildScalar(strideX));
inputIdxs.push_back(buildScalar(strideY));
// kernel
inputIdxs.push_back(buildScalar(kernelX));
inputIdxs.push_back(buildScalar(kernelY));
// fusecode
inputIdxs.push_back(buildScalar(ANEURALNETWORKS_FUSED_NONE));
// NCHW/NHWC
inputIdxs.push_back(buildScalar(mNCHW));
auto op = ANEURALNETWORKS_MAX_POOL_2D;
if (pool->type() == PoolType_AVEPOOL) {
op = ANEURALNETWORKS_AVERAGE_POOL_2D;
}
return buildOperation(op, inputIdxs, getTensorIdxs(outputs));
}
REGISTER_NNAPI_OP_CREATOR(NNAPIPool, OpType_Pooling)
} // namespace MNN

View File

@ -0,0 +1,26 @@
//
// NNAPIPool.hpp
// MNN
//
// Created by MNN on 2022/09/06.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_NNAPIPOOL_HPP
#define MNN_NNAPIPOOL_HPP
#include "NNAPIBackend.hpp"
#include "NNAPICommonExecution.hpp"
#include "core/ConvolutionCommon.hpp"
namespace MNN {
class NNAPIPool : public NNAPICommonExecution {
public:
NNAPIPool(Backend *b, const Op *op, const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
virtual ~NNAPIPool() = default;
};
} // namespace MNN
#endif // MNN_NNAPIPOOL_HPP

View File

@ -0,0 +1,38 @@
//
// NNAPISoftmax.cpp
// MNN
//
// Created by MNN on 2022/09/06.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "NNAPISoftmax.hpp"
namespace MNN {
NNAPISoftmax::NNAPISoftmax(MNN::Backend *b, const MNN::Op *op, const std::vector<Tensor *> &inputs, const std::vector<MNN::Tensor *> &outputs) : NNAPICommonExecution(b, op) {
}
ErrorCode NNAPISoftmax::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
int axis = mOp->main_as_Axis()->axis();
auto inputIdxs = getTensorIdxs(inputs);
// NNAPI Softmax inputs: [input, beta, axis]
float beta = 1.0;
inputIdxs.push_back(buildScalar(beta));
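    // When every non-axis dimension is 1, the result doesn't depend on which axis is
    // normalized, so the optional axis operand can be omitted and NNAPI's default
    // (the last dimension) gives the same result.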
bool needAxis = false;
auto dims = inputs[0]->shape();
for (int i = 0; i < dims.size(); i++) {
if (i != axis && dims[i] > 1) {
needAxis = true;
break;
}
}
if (needAxis) {
inputIdxs.push_back(buildScalar(axis));
}
return buildOperation(ANEURALNETWORKS_SOFTMAX, inputIdxs, getTensorIdxs(outputs));
}
REGISTER_NNAPI_OP_CREATOR(NNAPISoftmax, OpType_Softmax)
} // namespace MNN

View File

@ -0,0 +1,26 @@
//
// NNAPISoftmax.hpp
// MNN
//
// Created by MNN on 2022/09/06.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_NNAPISOFTMAX_HPP
#define MNN_NNAPISOFTMAX_HPP
#include "NNAPIBackend.hpp"
#include "NNAPICommonExecution.hpp"
#include "core/ConvolutionCommon.hpp"
namespace MNN {
class NNAPISoftmax : public NNAPICommonExecution {
public:
NNAPISoftmax(Backend *b, const Op *op, const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
virtual ~NNAPISoftmax() = default;
};
} // namespace MNN
#endif // MNN_NNAPISOFTMAX_HPP

View File

@ -0,0 +1,65 @@
//
// NNAPIUnary.cpp
// MNN
//
// Created by MNN on 2022/09/05.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "NNAPIUnary.hpp"
namespace MNN {
NNAPIUnary::NNAPIUnary(MNN::Backend *b, const MNN::Op *op, const std::vector<Tensor *> &inputs, const std::vector<MNN::Tensor *> &outputs) : NNAPICommonExecution(b, op) {
}
ErrorCode NNAPIUnary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(inputs.size() == 1 && outputs.size() == 1);
std::map<UnaryOpOperation, int> unary_map {
{UnaryOpOperation_ABS, ANEURALNETWORKS_ABS},
{UnaryOpOperation_EXP, ANEURALNETWORKS_EXP},
{UnaryOpOperation_SQRT, ANEURALNETWORKS_SQRT},
{UnaryOpOperation_RSQRT, ANEURALNETWORKS_RSQRT},
{UnaryOpOperation_LOG, ANEURALNETWORKS_LOG},
{UnaryOpOperation_RECIPROCAL, -1},
{UnaryOpOperation_SIN, ANEURALNETWORKS_SIN},
{UnaryOpOperation_ASIN, -1},
{UnaryOpOperation_SINH, -1},
{UnaryOpOperation_ASINH, -1},
{UnaryOpOperation_COS, -1},
{UnaryOpOperation_ACOS, -1},
{UnaryOpOperation_COSH, -1},
{UnaryOpOperation_ACOSH, -1},
{UnaryOpOperation_TAN, -1},
{UnaryOpOperation_ATAN, -1},
{UnaryOpOperation_TANH, ANEURALNETWORKS_TANH},
{UnaryOpOperation_ATANH, -1},
{UnaryOpOperation_ERF, -1},
{UnaryOpOperation_CEIL, -1},
{UnaryOpOperation_FLOOR, ANEURALNETWORKS_FLOOR},
{UnaryOpOperation_ROUND, -1},
{UnaryOpOperation_SIGN, -1},
{UnaryOpOperation_SIGMOID, -1},
{UnaryOpOperation_LOG1P, -1},
{UnaryOpOperation_SQUARE, -1},
{UnaryOpOperation_NEG, ANEURALNETWORKS_NEG},
{UnaryOpOperation_HARDSWISH, -1},
{UnaryOpOperation_GELU, -1},
{UnaryOpOperation_GELU_STANDARD, -1},
{UnaryOpOperation_EXPM1, -1},
{UnaryOpOperation_ERFC, -1},
{UnaryOpOperation_BNLL, -1},
{UnaryOpOperation_ERFINV, -1}
};
auto opType = mOp->main_as_UnaryOp()->opType();
auto iter = unary_map.find(opType);
if (iter == unary_map.end() || iter->second < 0) {
MNN_ERROR("NNAPI Unary not support %s\n", MNN::EnumNameUnaryOpOperation(opType));
return NOT_SUPPORT;
}
return buildOperation(iter->second, getTensorIdxs(inputs), getTensorIdxs(outputs));
}
REGISTER_NNAPI_OP_CREATOR(NNAPIUnary, OpType_UnaryOp)
} // namespace MNN

View File

@ -0,0 +1,25 @@
//
// NNAPIUnary.hpp
// MNN
//
// Created by MNN on 2022/09/05.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_NNAPIUNARY_HPP
#define MNN_NNAPIUNARY_HPP
#include "NNAPIBackend.hpp"
#include "NNAPICommonExecution.hpp"
namespace MNN {
class NNAPIUnary : public NNAPICommonExecution {
public:
NNAPIUnary(Backend *b, const Op *op, const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
virtual ~NNAPIUnary() = default;
};
} // namespace MNN
#endif // MNN_NNAPIUNARY_HPP

View File

@ -43,6 +43,10 @@ ErrorCode BinaryBufExecution::onResize(const std::vector<Tensor *> &inputs, cons
int shape[4] = {outputShape[0], outputShape[1], outputShape[2], UP_DIV(outputShape[3], 4)};
int fullCount[2] = {1, 1};
int activationType = 0;
if(mOp->type() == OpType_BinaryOp) {
activationType = mOp->main_as_BinaryOp()->activationType();
}
auto &unit = mUnits[0];
unit.kernel = runTime->buildKernel("binary_buf", "binary_buf", mBuildOptions);
mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
@ -60,6 +64,7 @@ ErrorCode BinaryBufExecution::onResize(const std::vector<Tensor *> &inputs, cons
unit.kernel.setArg(index++, openCLBuffer(output));
unit.kernel.setArg(index++, shape);
unit.kernel.setArg(index++, fullCount);
unit.kernel.setArg(index++, activationType);
std::string name = "binary_buf";
mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), name, unit.kernel).first;
@ -80,6 +85,7 @@ ErrorCode BinaryBufExecution::onResize(const std::vector<Tensor *> &inputs, cons
unit.kernel.setArg(index++, openCLBuffer(output));
unit.kernel.setArg(index++, shape);
unit.kernel.setArg(index++, fullCount);
unit.kernel.setArg(index++, activationType);
unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]};
unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]};

View File

@ -7,7 +7,8 @@ __kernel void binary(__private int global_dim0, __private int global_dim1,
__read_only image2d_t input0, __read_only image2d_t input1,
__write_only image2d_t output,
__private const int4 shape,//[N,H,W,C4]
__private const int2 isFull) {
__private const int2 isFull,
__private const int activationType) {
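    // activationType is set by the host from BinaryOp::activationType: 0 = none, 1 = fused ReLU.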
int2 pos = (int2)(get_global_id(0), get_global_id(1));//WC4, NH
FLOAT4 in0, in1;
@ -27,6 +28,9 @@ __kernel void binary(__private int global_dim0, __private int global_dim1,
}
FLOAT4 out = CONVERT_FLOAT4(OPERATOR);
if(activationType == 1) {
out = fmax(out, (FLOAT4)0);
}
WI_F(output, pos, out);
}
}

View File

@ -5,7 +5,8 @@
__kernel void binary_buf(__private int global_dim0, __private int global_dim1,
__global FLOAT* input0, __global FLOAT* input1, __global FLOAT* output,
__private const int4 shape,//[N,H,W,C4]
__private const int2 isFull) {
__private const int2 isFull,
__private const int activationType) {
int2 pos = (int2)(get_global_id(0), get_global_id(1));//NC4, HW
if (pos.x < global_dim0 && pos.y < global_dim1) {
@ -19,6 +20,9 @@ __kernel void binary_buf(__private int global_dim0, __private int global_dim1,
in1 = (FLOAT4)(in1.x, in1.x, in1.x, in1.x);
}
FLOAT4 out = CONVERT_FLOAT4(OPERATOR);
if(activationType == 1) {
out = fmax(out, (FLOAT4)0);
}
vstore4(out, offset, output);
}
}

File diff suppressed because one or more lines are too long

View File

@ -27,9 +27,8 @@ static string swapComputeIn0In1(const string& computeOrigin) {
return compute;
}
EltwiseExecution::EltwiseExecution(const std::vector<Tensor *> &inputs, const std::string &compute, const MNN::Op *op, Backend *backend,
float operatorData, bool broadCast)
: CommonExecution(backend), mCompute(compute), mBroadCast(broadCast), mOperatorData(operatorData) {
EltwiseExecution::EltwiseExecution(const std::vector<Tensor *> &inputs, const std::string &compute, const MNN::Op *op, Backend *backend)
: CommonExecution(backend), mCompute(compute) {
mBuildOptions.emplace("-DOPERATOR=" + compute);
mOp = op;
@ -56,6 +55,10 @@ ErrorCode EltwiseExecution::onResize(const std::vector<Tensor *> &inputs, const
auto runTime = ((OpenCLBackend *)backend())->getOpenCLRuntime();
int shape[4] = {outputShape[0], outputShape[1], outputShape[2], UP_DIV(outputShape[3], 4)};
int fullCount[2] = {1, 1};
int activationType = 0;
if(mOp->type() == OpType_BinaryOp) {
activationType = mOp->main_as_BinaryOp()->activationType();
}
auto &unit = mUnits[0];
unit.kernel = runTime->buildKernel("binary", "binary", mBuildOptions);
@ -76,6 +79,7 @@ ErrorCode EltwiseExecution::onResize(const std::vector<Tensor *> &inputs, const
unit.kernel.setArg(index++, openCLImage(output));
unit.kernel.setArg(index++, shape);
unit.kernel.setArg(index++, fullCount);
unit.kernel.setArg(index++, activationType);
std::string name = "binary";
mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), name, unit.kernel).first;
@ -127,6 +131,7 @@ ErrorCode EltwiseExecution::onResize(const std::vector<Tensor *> &inputs, const
unit.kernel.setArg(index++, openCLImage(output));
unit.kernel.setArg(index++, shape);
unit.kernel.setArg(index++, fullCount);
unit.kernel.setArg(index++, activationType);
if(i == 0) {
std::string name = "binary";

View File

@ -16,7 +16,7 @@ namespace OpenCL {
class EltwiseExecution : public CommonExecution {
public:
EltwiseExecution(const std::vector<Tensor *> &inputs, const std::string &compute, const MNN::Op *op, Backend *backend, float operatorData = 0.0001f, bool broadCast = false);
EltwiseExecution(const std::vector<Tensor *> &inputs, const std::string &compute, const MNN::Op *op, Backend *backend);
virtual ~EltwiseExecution() = default;
uint32_t realSize(const Tensor* tensor);

Some files were not shown because too many files have changed in this diff