mirror of https://github.com/alibaba/MNN.git
[Sync] Sync Internal 2.1.2
This commit is contained in:
parent ce9c57e6b5
commit db53f951e6
@@ -186,6 +186,13 @@ option(MNN_AVX512 "Enable AVX512" OFF)
option(MNN_CUDA "Enable CUDA" OFF)
option(MNN_TENSORRT "Enable TensorRT" OFF)
option(MNN_COREML "Enable CoreML" OFF)
option(MNN_NNAPI "Enable NNAPI" OFF)

+option(MNN_CUDA_PROFILE "Enable CUDA profile" OFF)
+
+if (NOT MNN_CUDA OR NOT CMAKE_SYSTEM_NAME MATCHES "^Linux")
+    set(MNN_CUDA_PROFILE OFF)
+endif()
+
if (MNN_USE_THREAD_POOL)
    message(STATUS "Use Threadpool, forbid openmp")

@@ -223,12 +230,14 @@ message(STATUS "\tARM82: ${MNN_ARM82}")
message(STATUS "\toneDNN: ${MNN_ONEDNN}")
message(STATUS "\tTensorRT: ${MNN_TENSORRT}")
message(STATUS "\tCoreML: ${MNN_COREML}")
message(STATUS "\tNNAPI: ${MNN_NNAPI}")
message(STATUS "\tCUDA: ${MNN_CUDA}")
message(STATUS "\tOpenMP: ${MNN_OPENMP}")
message(STATUS "\tBF16: ${MNN_SUPPORT_BF16}")
message(STATUS "\tThreadPool: ${MNN_USE_THREAD_POOL}")
message(STATUS "\tHidden: ${MNN_HIDDEN}")
message(STATUS "\tBuild Path: ${CMAKE_CURRENT_BINARY_DIR}")
+message(STATUS "\tCUDA PROFILE: ${MNN_CUDA_PROFILE}")

if(CMAKE_SYSTEM_NAME MATCHES "^Android" OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
    add_definitions(-fPIC)

@@ -480,6 +489,13 @@ IF(MNN_COREML)
    list(APPEND MNN_EXTRA_DEPENDS ${VIDEO})
ENDIF()

+# NNAPI
+IF(MNN_NNAPI)
+    add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/nnapi/)
+    list(APPEND MNN_DEPS MNN_NNAPI)
+    list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_NNAPI>)
+ENDIF()
+
# Vulkan
IF(MNN_VULKAN)
    add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/vulkan/)
@@ -1,12 +1,31 @@
set (EXTRA_INCLUDE "")
set (EXTRA_LIBS "")
set (EXTRA_OPTIONS "")

+if(MNN_CUDA_PROFILE)
+    set(CUDA_MIN_VERSION "9.0")
+    find_package(CUDA ${CUDA_MIN_VERSION})
+    MESSAGE("benchmark message:CUDA_INCLUDE_DIRS:${CUDA_INCLUDE_DIRS}")
+    link_directories(${CUDA_INCLUDE_DIRS}/../lib/
+                     ${CUDA_INCLUDE_DIRS}/../lib64/)
+    set(EXTRA_INCLUDE ${CUDA_INCLUDE_DIRS})
+    set(EXTRA_LIBS -lnvToolsExt)
+    set(EXTRA_OPTIONS -DMNN_CUDA_PROFILE)
+endif()

add_executable(benchmark.out ${CMAKE_CURRENT_LIST_DIR}/benchmark.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tools/cpp/revertMNNModel.cpp)
-target_include_directories(benchmark.out PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tools/cpp/ ${CMAKE_CURRENT_SOURCE_DIR}/tools/)
-target_link_libraries(benchmark.out ${MNN_DEPS})
+target_include_directories(benchmark.out PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tools/cpp/ ${CMAKE_CURRENT_SOURCE_DIR}/tools/ ${EXTRA_INCLUDE})
+target_compile_options(benchmark.out PRIVATE ${EXTRA_OPTIONS})
+target_link_libraries(benchmark.out ${MNN_DEPS} ${EXTRA_LIBS})

file(GLOB_RECURSE SRC_FILES ${CMAKE_CURRENT_LIST_DIR}/exprModels/*.cpp)
add_executable(benchmarkExprModels.out ${CMAKE_CURRENT_LIST_DIR}/benchmarkExprModels.cpp ${SRC_FILES})
target_include_directories(benchmarkExprModels.out PRIVATE "${CMAKE_CURRENT_LIST_DIR}/exprModels" ${CMAKE_CURRENT_SOURCE_DIR}/)
target_link_libraries(benchmarkExprModels.out ${MNN_DEPS})

if (MSVC AND NOT MNN_BUILD_SHARED_LIBS)
    foreach (DEPEND ${MNN_DEPS})
        target_link_options(benchmark.out PRIVATE /WHOLEARCHIVE:$<TARGET_FILE:${DEPEND}>)
@@ -141,6 +141,7 @@ std::vector<float> doBench(Model& model, int loop, int warmup = 10, int forward
    // std::vector<int> dims{1, 3, 224, 224};
    // net->resizeTensor(input, dims);
    // net->resizeSession(session);
    net->releaseModel();

    const MNN::Backend* inBackend = net->getBackend(session, input);

@@ -162,12 +163,9 @@ std::vector<float> doBench(Model& model, int loop, int warmup = 10, int forward
    for (int round = 0; round < loop; round++) {
        auto timeBegin = getTimeInUs();

        void* host = input->map(MNN::Tensor::MAP_TENSOR_WRITE, input->getDimensionType());
        input->unmap(MNN::Tensor::MAP_TENSOR_WRITE, input->getDimensionType(), host);

        net->runSession(session);

        host = outputTensor->map(MNN::Tensor::MAP_TENSOR_READ, outputTensor->getDimensionType());
        outputTensor->unmap(MNN::Tensor::MAP_TENSOR_READ, outputTensor->getDimensionType(), host);
        auto timeEnd = getTimeInUs();
@@ -0,0 +1,3 @@
+.wy-nav-content {
+    max-width: 75% !important;
+}
@@ -42,8 +42,10 @@ MNN uses CMake to build the project; the CMake macro definitions are listed below:
| MNN_ONEDNN | Whether to use `oneDNN`; defaults to `OFF` |
| MNN_AVX512 | Whether to build the `avx512` backend; defaults to `OFF` |
| MNN_CUDA | Whether to build the `Cuda` backend; defaults to `OFF` |
+| MNN_CUDA_PROFILE | Whether to enable the CUDA profiling tool; defaults to `OFF` |
| MNN_TENSORRT | Whether to build the `TensorRT` backend; defaults to `OFF` |
| MNN_COREML | Whether to build the `CoreML` backend; defaults to `OFF` |
+| MNN_NNAPI | Whether to build the `NNAPI` backend; defaults to `OFF` |
| MNN_BUILD_BENCHMARK | Whether to build the MNN benchmarks; defaults to `OFF` |
| MNN_BUILD_TEST | Whether to build the MNN unit tests; defaults to `OFF` |
| MNN_BUILD_FOR_ANDROID_COMMAND | Whether to build for `Android` from the command line; defaults to `OFF` |
@@ -67,5 +67,8 @@ html_theme = 'sphinx_rtd_theme'
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
+html_css_files = [
+    'style.css',
+]

needs_sphinx = '4.2.0'
@@ -187,7 +187,7 @@ ErrorCode convert(const uint8_t* source, int iw, int ih, int stride, void* dest,
- `oh` output height
- `outputBpp` if 0, it is set according to `config.destFormat`; defaults to 0
- `outputStride` if 0, it is set to `ow * outputBpp`; defaults to 0
-- `type` only supports halide_type_of and halide_type_of; defaults to halide_type_of<float>
+- `type` supports `halide_type_of<uint8_t>` and `halide_type_of<float>`; defaults to `halide_type_of<float>`

Returns: the result code
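For orientation, here is a minimal C++ sketch of how this `convert` path is typically driven through `MNN::CV::ImageProcess`. The image size, formats, and buffer contents below are illustrative assumptions, not part of this commit:

```cpp
#include <MNN/ImageProcess.hpp>
#include <MNN/Tensor.hpp>
#include <memory>
#include <vector>

int main() {
    using namespace MNN::CV;
    // Configure an RGBA -> BGR conversion; destFormat drives outputBpp when it is 0.
    ImageProcess::Config config;
    config.sourceFormat = RGBA;
    config.destFormat   = BGR;
    std::unique_ptr<ImageProcess> process(ImageProcess::create(config));

    // Hypothetical 4-channel input image; stride 0 means iw * 4 bytes per row.
    const int iw = 8, ih = 8;
    std::vector<uint8_t> pixels(iw * ih * 4, 255);

    // Destination tensor: 1x3xHxW float, matching halide_type_of<float>.
    std::unique_ptr<MNN::Tensor> dest(MNN::Tensor::create<float>({1, 3, ih, iw}));
    process->convert(pixels.data(), iw, ih, 0, dest.get());
    return 0;
}
```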
@@ -2,7 +2,7 @@
## Concepts
The `Module` API can be used for both model training and model inference
- For training, users can subclass `Module` and add their own implementation;
-- Unlike `Session`, inference does not require an explicit resize from the user and supports control flow, so models that contain `if``while` must be run with `Module`
+- Unlike `Session`, inference does not require an explicit resize from the user and supports control flow, so models that contain `if` or `while` must be run with `Module`
### Related data structures
- `Module` the core class of the Module API, an abstract class that represents a model; loading a model creates a concrete subclass of it
- `Executor` holds several `RuntimeManager`s and provides memory-management interfaces; each `Executor` must run in a single-threaded environment. A global `Executor` is provided by default; create your own when concurrent execution is needed.
@@ -10,7 +10,7 @@
- `VARP` the inputs and outputs of a `Module`, and also the basic data structure of the [Expr API](expr.md)

## Workflow

Create Executor (optional) -> create Module -> create input VARPs -> run inference with Module::forward -> consume the output VARPs -> destroy the Module -> destroy the Executor (optional)
### Creating an Executor
`Executor` lets the user configure the inference backend, thread count, and similar properties, and provides performance statistics, per-op execution callbacks, and memory reclamation. A global Executor is provided, so it can be used directly without creating or holding one.
```cpp
@@ -58,7 +58,10 @@ struct Info {
const Info* getInfo() const;
```
### Running inference
-Call `onForward` to run inference
+Call `onForward` to run inference.
+
+**Note: a `VARP` returned by `onForward` becomes invalid once the `Module` is destroyed**

```cpp
std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs);
```
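A short, self-contained sketch of the workflow described above (create Module -> build input VARPs -> `onForward` -> read outputs). The file name `model.mnn` and the tensor names are hypothetical placeholders, not values from this commit:

```cpp
#include <MNN/expr/Module.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include <memory>
#include <vector>

using namespace MNN::Express;

int main() {
    // Load the model; the input/output names must match the converted model.
    std::shared_ptr<Module> module(Module::load({"input"}, {"output"}, "model.mnn"));

    // Build a 1x3x224x224 float input VARP and fill it.
    VARP input = _Input({1, 3, 224, 224}, NCHW);
    float* ptr = input->writeMap<float>();
    for (int i = 0; i < 1 * 3 * 224 * 224; ++i) {
        ptr[i] = 0.0f;
    }

    // Run inference; read the outputs before the Module is destroyed.
    std::vector<VARP> outputs = module->onForward({input});
    const float* result = outputs[0]->readMap<float>();
    (void)result;
    return 0;
}
```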
@@ -1,5 +1,14 @@
# Using the Python API

+## Installation
+The MNN Python API can be [built from source](../compile/pymnn.md), or installed as a prebuilt wheel via `pip`; `pip` usage is as follows:
+```bash
+# public release
+pip install MNN==$version
+# internal release
+pip install -i https://artifacts.antgroup-inc.cn/simple/ MNN-Internal==$version
+```
+
## Overview
On top of C++, MNN adds a Python extension, which consists of two parts:
- MNN: covers inference, training, image processing, and numerical computing
@@ -545,6 +545,8 @@ const std::map<std::string, Tensor*>& getSessionOutputAll(const Session* session)

When there is only one output tensor, NULL can be passed to `getSessionOutput` to fetch it.

+**Note: a `Tensor` obtained from `getSessionOutput` becomes invalid once the `Session` is destroyed**
+
### Copying data
**Users who are not familiar with the MNN source code must fetch outputs this way!!!**
An NCHW example (for models converted from Caffe / TorchScript / Onnx):
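The NCHW sample itself is truncated in this diff; the documented copy pattern looks roughly like the sketch below, where the interpreter and session are assumed to already exist:

```cpp
#include <MNN/Interpreter.hpp>
#include <MNN/Tensor.hpp>
#include <memory>

void readOutput(MNN::Interpreter* net, MNN::Session* session) {
    // Single-output model: NULL fetches the only output tensor.
    MNN::Tensor* deviceTensor = net->getSessionOutput(session, nullptr);

    // Create a host tensor with NCHW (CAFFE) layout and copy into it.
    std::unique_ptr<MNN::Tensor> hostTensor(new MNN::Tensor(deviceTensor, MNN::Tensor::CAFFE));
    deviceTensor->copyToHostTensor(hostTensor.get());

    // The copied data stays valid even after the session is released.
    const float* data = hostTensor->host<float>();
    (void)data;
}
```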
@@ -62,7 +62,7 @@ Tensor is the basic data structure of the MNN V2 API and the most basic data wrapper class

### `Tensor(shape, dtype, value_list, dimension)`
Creates a Tensor with the given shape, data type, data, and data layout; the data is copied from `value_list`.
-It can convert data in `list`, `tuple`, `bytes`, `ndarray`, and `PyCapsule` formats into a `Tensor`
+It can convert data in `list`, `tuple`, `bytes`, `ndarray`, `PyCapsule`, and `int pointer` formats into a `Tensor`

*Note: `value_list` supports `ndarray` only when PYMNN_NUMPY_USABLE is enabled; it is disabled by default on mobile*

@@ -71,7 +71,7 @@ Tensor is the basic data structure of the MNN V2 API and the most basic data wrapper class
Parameters:
- `shape:tuple` the Tensor shape
- `dtype:MNN.Halide_Type_*` the Tensor data type
-- `value_list:ndarray/tuple/list/bytes/PyCapsule` the data
+- `value_list:ndarray/tuple/list/bytes/PyCapsule/int_addr` the data
- `dimension:MNN.Tensor_DimensionType_*` the data layout

---
@@ -176,13 +176,22 @@ Tensor is the basic data structure of the MNN V2 API and the most basic data wrapper class
### `Example`

```python
import numpy as _np
import MNN
import MNN.numpy as np
data = _np.array([1., 2., 3.], dtype=_np.float32)
# create Tensors
# from a tuple: the arguments are shape, data type, data, and data layout
t1 = MNN.Tensor((1, 3), MNN.Halide_Type_Float, (1., 2., 3.), MNN.Tensor_DimensionType_Caffe)
# from a Var
t2 = MNN.Tensor(np.array([1., 2., 3.])) # equivalent to t1
# from an ndarray
t3 = MNN.Tensor([1, 3], MNN.Halide_Type_Float, data, MNN.Tensor_DimensionType_Caffe)
# from bytes
t4 = MNN.Tensor([1, 3], MNN.Halide_Type_Float, data.tobytes(), MNN.Tensor_DimensionType_Caffe)
# from an int memory address; faster than passing the ndarray directly, but the ndarray memory must be contiguous
t5 = MNN.Tensor([1, 3], MNN.Halide_Type_Float, data.__array_interface__['data'][0], MNN.Tensor_DimensionType_Caffe)

print(t1.getShape()) # (1, 3)
print(t1.getDataType()) # <capsule object NULL at 0x7fe01e74ff30>
print(t1.getDimensionType()) # 1
@@ -18,12 +18,12 @@ expr is MNN's expression module; it contains a series of expression functions for building M
---
### `const(value_list, shape, data_format, dtype)`
Creates a `Const`-typed `Var` from the input data; this is the most basic function for creating a `Var`.
-It can convert data in `list`, `tuple`, `bytes`, `ndarray`, and `PyCapsule` formats into a `Var`
+It can convert data in `list`, `tuple`, `bytes`, `ndarray`, `PyCapsule`, and `int pointer` formats into a `Var`

*Note: `value_list` supports `ndarray` only when PYMNN_NUMPY_USABLE is enabled; it is disabled by default on mobile*

Parameters:
-- `value_list:ndarray/list/tuple/bytes/PyCapsule` the input data
+- `value_list:ndarray/list/tuple/bytes/PyCapsule/int_addr` the input data
- `shape:[int]` the shape of the `Var` to build
- `data_format:data_format` the data layout; see [data_format](Var.html#data-format)
- `dtype:dtype` the data type; see [dtype](Var.html#dtype)

@@ -44,6 +44,8 @@ array([2, 3, 4], dtype=int32)
array([97, 98, 99], dtype=uint8)
>>> expr.const(MNN.Tensor([2, 3]).getData(), [2], expr.NCHW, expr.int) # PyCapsule
array([2, 3], dtype=int32)
+>>> expr.const(np.arange(4.0).astype(np.float32).__array_interface__['data'][0], [4], expr.NCHW, expr.float) # int_addr; the ndarray memory must be contiguous
+array([0., 1., 2., 3.], dtype=float32)
```
---
### `set_thread_number(numberThread)`
@@ -1336,6 +1338,94 @@ array(0)
array([0, 1])
```

---
### `eltwise_prod(x, y, coeff)`
Performs element-wise multiplication on the input variables

Parameters:
- `x:Var_like` input variable
- `y:Var_like` input variable
- `coeff:[float]` coefficients; currently only `[1.,0.]` or `[]/[0.]` are supported

Returns: `x*y`; when `coeff=[1.,0.]`, returns `x`

Return type: `Var`

Example:

```python
>>> expr.eltwise_prod([1., 2., 3.], [2., 2., 2.], [])
array([2., 4., 6.], dtype=float32)
>>> expr.eltwise_prod([1., 2., 3.], [2., 2., 2.], [1., 0.])
array([1., 2., 3.], dtype=float32)
```

---
### `eltwise_sum(x, y, coeff)`
Performs element-wise addition on the input variables

Parameters:
- `x:Var_like` input variable
- `y:Var_like` input variable
- `coeff:[float]` coefficients; currently only `[1.,0.]` or `[]/[0.]` are supported

Returns: `x+y`; when `coeff=[1.,0.]`, returns `x`

Return type: `Var`

Example:

```python
>>> expr.eltwise_sum([1., 2., 3.], [2., 2., 2.], [])
array([3., 4., 5.], dtype=float32)
>>> expr.eltwise_sum([1., 2., 3.], [2., 2., 2.], [1., 0.])
array([1., 2., 3.], dtype=float32)
```

---
### `eltwise_sub(x, y, coeff)`
Performs element-wise subtraction on the input variables

Parameters:
- `x:Var_like` input variable
- `y:Var_like` input variable
- `coeff:[float]` coefficients; currently only `[1.,0.]` or `[]/[0.]` are supported

Returns: `x-y`; when `coeff=[1.,0.]`, returns `x`

Return type: `Var`

Example:

```python
>>> expr.eltwise_sub([1., 2., 3.], [2., 2., 2.], [])
array([-1., 0., 1.], dtype=float32)
>>> expr.eltwise_sub([1., 2., 3.], [2., 2., 2.], [1., 0.])
array([1., 2., 3.], dtype=float32)
```

---
### `eltwise_max(x, y, coeff)`
Performs an element-wise comparison on the input variables and takes the maximum

Parameters:
- `x:Var_like` input variable
- `y:Var_like` input variable
- `coeff:[float]` coefficients; currently only `[1.,0.]` or `[]/[0.]` are supported

Returns: `max(x,y)`; when `coeff=[1.,0.]`, returns `x`

Return type: `Var`

Example:

```python
>>> expr.eltwise_max([1., 2., 3.], [2., 2., 2.], [])
array([2., 2., 3.], dtype=float32)
>>> expr.eltwise_max([1., 2., 3.], [2., 2., 2.], [1., 0.])
array([1., 2., 3.], dtype=float32)
```

---
### `cast(x, dtype=_F.float)`
Returns the input converted to the given dtype
@@ -1,3 +1,17 @@
+set (EXTRA_LIBS "")
+if(MNN_CUDA_PROFILE)
+    set(CUDA_MIN_VERSION "9.0")
+    find_package(CUDA ${CUDA_MIN_VERSION})
+
+    MESSAGE("cuda dir is:${CUDA_INCLUDE_DIRS}")
+    include_directories(
+        ${CUDA_INCLUDE_DIRS}
+    )
+    set(EXTRA_LIBS ${CUDA_INCLUDE_DIRS}/../lib/libnvToolsExt.so)
+endif()
+
file(GLOB_RECURSE MNN_EXPR_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.*")
option(MNN_EXPR_ENABLE_PROFILER "Support profile Expr's op cost" OFF)
option(MNN_EXPR_SHAPE_EAGER "Force compute Expr's shape directly cost" OFF)

@@ -17,7 +31,7 @@ IF(MNN_SEP_BUILD)
else()
    add_library(MNN_Express SHARED ${MNN_EXPR_SRCS})
endif()
-target_link_libraries(MNN_Express MNN)
+target_link_libraries(MNN_Express MNN ${EXTRA_LIBS})
install(TARGETS MNN_Express
    LIBRARY DESTINATION lib
    ARCHIVE DESTINATION lib

@@ -25,3 +39,5 @@ IF(MNN_SEP_BUILD)
ELSE()
    add_library(MNN_Express OBJECT ${MNN_EXPR_SRCS})
ENDIF()
@@ -201,8 +201,9 @@ std::shared_ptr<Executor> Executor::newExecutor(MNNForwardType type,
    info.type = type;
    info.numThread = numberThread;
    info.user = const_cast<BackendConfig*>(&config);
-    std::shared_ptr<Runtime> bn(creator->onCreate(info));
-    return std::shared_ptr<Executor>(new Executor(bn, type, numberThread));
+    std::shared_ptr<Runtime> runtime(creator->onCreate(info));
+    auto executor = new Executor(runtime, type, numberThread);
+    return std::shared_ptr<Executor>(executor);
}

RuntimeInfo Executor::getRuntime() {
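For context, this is the factory that module.md tells users to call when they need a private executor. A hedged usage sketch follows; the thread count and precision are arbitrary illustrative choices:

```cpp
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>
#include <MNN/MNNForwardType.h>

void runWithPrivateExecutor() {
    MNN::BackendConfig config;
    config.precision = MNN::BackendConfig::Precision_Low;

    // Each executor must be used from a single thread.
    auto executor = MNN::Express::Executor::newExecutor(MNN_FORWARD_CPU, config, 4);

    // Make it the current executor for this scope.
    MNN::Express::ExecutorScope scope(executor);
    // ... create Modules / VARPs and run inference here ...
}
```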
@@ -638,6 +638,7 @@ void* Variable::readInternal(bool forShape) {
    return Executor::mapOutput(cache.get(), mFrom->mInside->mCacheOffset + mFromIndex, mFrom->mInside->mOutputTensors[mFromIndex]);
}

void Variable::informDirty() {
    std::vector<Expr*> visited;
    mFrom->visitOutputs([&visited](EXPRP expr, int index) {

@@ -8,6 +8,7 @@
#include "IfModule.hpp"
+#include "MNN_generated.h"

namespace MNN {
namespace Express {
static int _findPos(const std::vector<std::string>& names, const std::string& key) {

@@ -21,6 +22,8 @@ static int _findPos(const std::vector<std::string>& names, const std::string& key)
std::vector<Express::VARP> IfModule::onForward(const std::vector<Express::VARP>& inputs) {
    std::vector<Express::VARP> outputs(mOutputFromElse.size());
+    MNN_ASSERT(mOutputFromThen.size() == mOutputFromElse.size());

    if (inputs[0]->readMap<int>()[0] > 0) {
        std::vector<Express::VARP> subInputs(mInputForThen.size());
        for (auto& p : mInputForThen) {

@@ -54,12 +57,14 @@ IfModule* IfModule::create(const Op* op, const std::map<std::string, SubGraph>&
    if (nullptr != op->name()) {
        module->setName(op->name()->str());
    }

    /** Compute map index
     std::vector<std::pair<int, int>> mInputForThen;

     // First mElse' index, Second: inputs's index
     std::vector<std::pair<int, int>> mInputForElse;

     std::vector<int> mOutputFromThen;
     std::vector<int> mOutputFromElse;
     */

@@ -98,7 +103,7 @@ IfModule* IfModule::create(const Op* op, const std::map<std::string, SubGraph>&
    for (int i=0; i<output->size(); ++i) {
        auto data = output->GetAs<StringVec>(i);
        MNN_ASSERT(data->data()->size() == 2);

        auto thenPos = _findPos(thenG.outputs, data->data()->GetAsString(0)->str());
        MNN_ASSERT(thenPos >= 0);
        auto elsePos = _findPos(elseG.outputs, data->data()->GetAsString(1)->str());
@@ -9,9 +9,9 @@
#include <MNN/expr/Module.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include <MNN/expr/ExecutorScope.hpp>

#include "PipelineModule.hpp"
#include "core/FileLoader.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "MNN_generated.h"
#include "Utils.hpp"
#include "RuntimeAttr.hpp"

@@ -202,15 +202,24 @@ public:
        }
    }
#endif // MNN_INTERNAL_ENABLED
+        if (nullptr == mInfo->runTimeManager.get()) {
+            mRuntime = Executor::getRuntime().second;
+        } else {
+            mRuntime = mInfo->runTimeManager->getInside()->mRuntime.first.begin()->second;
+        }
    }
    virtual ~ NetModule(){}

    virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override {
#ifdef MNN_INTERNAL_ENABLED
        Timer _time;
        auto glo = ExecutorScope::Current();
        glo->getDebugTools()->flops = 0.0f;
#endif
+        CPURuntime* runtime = static_cast<CPURuntime*>(mRuntime.get());
+        runtime->clearReuseCopyTensorMap();
        auto outputs = mModule->onForward(inputs);
#ifdef MNN_INTERNAL_ENABLED
        do {

@@ -237,10 +246,18 @@ public:
        } while(false);
#endif
        return outputs;

    }
+    void setRuntime(std::shared_ptr<Runtime> runtime) {
+        mRuntime = runtime;
+    }

    virtual Module* clone(CloneContext* ctx) const override {
        std::shared_ptr<Module> submodule(mModule->clone(ctx));
        NetModule* module(new NetModule(submodule, mInfo, nullptr, 0, 0.0f));
+        module->setRuntime(Executor::getRuntime().second);

#ifdef MNN_INTERNAL_ENABLED
        module->mLogInfo = mLogInfo;
#endif

@@ -249,9 +266,11 @@ public:
    const Module::Info* info() const {
        return mInfo.get();
    }

private:
    std::shared_ptr<Module> mModule;
    std::shared_ptr<Module::Info> mInfo;
+    std::shared_ptr<Runtime> mRuntime = nullptr;
#ifdef MNN_INTERNAL_ENABLED
    std::map<std::string, std::string> mLogInfo;
#endif
@@ -250,6 +250,7 @@ StaticModule::StaticModule(const void* buffer, size_t length, const std::vector<
    if (!res) {
        return;
    }
+    mResource->mUseContentInputs = scheduleInfo.needInputContentForShape;
    if (mResource->mUseContentInputs) {
        mResource->mModes.inputMode = Interpreter::Session_Input_User;

@@ -262,6 +263,9 @@ StaticModule::StaticModule(const void* buffer, size_t length, const std::vector<
    mInputTensors.resize(inputs.size());
    for (int i = 0; i < inputs.size(); ++i) {
        mInputTensors[i] = mSession->getInput(inputs[i].c_str());
+#ifdef LOG_VERBOSE
+        MNN_PRINT("init Staticmodule %d th input ptr:%p, hostPtr:%p, name:%s\n", i, mInputTensors[i], mInputTensors[i]->host<void>(), inputs[i].c_str());
+#endif
    }
    mOutputTensors.resize(mResource->mOutputFromTensor.size());
    for (int i = 0; i < mResource->mOutputFromTensor.size(); ++i) {

@@ -274,6 +278,7 @@ StaticModule::~StaticModule() {
    mBackupResourceBackend = nullptr;
}
std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VARP>& inputs) {
+    AUTOTIME;
    std::vector<Express::VARP> outputs(mResource->mOutputNumbers);
    for (auto& iter : mResource->mOutputFromInput) {

@@ -315,6 +320,18 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VARP>& inputs) {
        }
        mInputTensors[i]->buffer().host = inputTensor->buffer().host;
        mInputTensors[i]->buffer().device = inputTensor->buffer().device;
+
+        if (mResource->mUseContentInputs) {
+            if (nullptr == mInputTensors[i]->buffer().host && 0 != mInputTensors[i]->buffer().device ) {
+                auto exprInfo = inputs[i]->expr();
+                auto inside = exprInfo.first->inside();
+                auto srcPtr = inputs[i]->readMap<void>();
+                mInputTensors[i]->buffer().host = inside->mHostTensor->buffer().host;
+            }
+        }
    }
    if (mResource->mUseContentInputs) {
        mSession->setNeedResize();

@@ -345,7 +362,7 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VARP>& inputs) {
        continue;
    }
    auto exprInfo = inputs[i]->expr();
    auto inside = exprInfo.first->inside();
    auto inputTensor = inside->mOutputTensors[exprInfo.second];
    if (nullptr != inside->mCache) {
        inputTensor = Executor::getOutput(inside->mCache.get(), inside->mCacheOffset);

@@ -353,6 +370,19 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VARP>& inputs) {
        mInputTensors[i]->copyFromHostTensor(inputTensor);
    }
}

+#ifdef LOG_VERBOSE
+    for (auto& inputTensor : mInputTensors) {
+        MNN_PRINT("static module, before run, input ptr:%p, hostPtr:%p, shape:", inputTensor, inputTensor->host<void>());
+        inputTensor->printShape();
+        MNN_PRINT("\n");
+        auto shape = inputTensor->shape();
+    }
+    MNN_PRINT("staticmodule before run\n");
+#endif

    ErrorCode code;
    if (mResource->mModes.callBackMode == Interpreter::Session_Debug) {
        auto globalExecutor = ExecutorScope::Current();

@@ -372,6 +402,8 @@ std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VARP>& inputs) {
    auto tensor = Tensor::clone(mOutputTensors[i]);
    outputs[mResource->mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(tensor, true));
}

#ifdef MNN_INTERNAL_ENABLED
    auto glo = ExecutorScope::Current();
    float flops = 0.0f;
@@ -94,7 +94,7 @@ protected:
typedef std::function<bool(const std::vector<Tensor*>&, const std::string& /*opName*/)> TensorCallBack;
typedef std::function<bool(const std::vector<Tensor*>&, const OperatorInfo*)> TensorCallBackWithInfo;
-typedef std::pair<std::map<MNNForwardType, std::shared_ptr<Runtime>>, std::shared_ptr<Runtime>> RuntimeInfo;
+typedef std::pair< std::map<MNNForwardType, std::shared_ptr<Runtime>>, std::shared_ptr<Runtime>> RuntimeInfo;

/**
 * @brief get mnn version info.

@@ -328,7 +328,7 @@ public:
    /** Backends in session in M, int*, length >= 1 + number of configs when create session */
    BACKENDS = 2,

    /** Resize Info, int*, 0: ready to execute, 1: need malloc, 2: need resize */
    RESIZE_STATUS = 3,

@@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 1
-#define MNN_VERSION_PATCH 1
+#define MNN_VERSION_PATCH 2
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */
@@ -43,7 +43,7 @@ public:
    /** string handle type */
    HANDLE_STRING = 1
};

/** Tensor map type : Read or Write*/
enum MapType {
    /** map Tensor for writing data*/

@@ -131,7 +131,7 @@ public:
 * @param deepCopy whether create new content and copy, currently only support deepCopy = false
 */
static Tensor* clone(const Tensor* src, bool deepCopy = false);

/**
 * @brief delete tensor.
 * @param src tensor

@@ -279,7 +279,7 @@ public:
 * @brief print tensor data. for DEBUG use only.
 */
void print() const;

/**
 *@brief print tensor shape
 */

@@ -147,7 +147,7 @@ public:
static void save(const std::vector<VARP>& vars, const char* fileName);
static std::vector<int8_t> save(const std::vector<VARP>& vars);
static void save(const std::vector<VARP>& vars, NetT* dest);

// Pack a few Variable to compute in one pipeline
static void prepareCompute(const std::vector<VARP>& vars, bool forceCPU = false);
static void compute(const std::vector<VARP>& vars, bool forceCPU = false);

@@ -158,6 +158,7 @@ public:
    mFrom = expr;
    mFromIndex = index;
}

private:
    Variable(EXPRP expr, int index) {
        mFrom = expr;
@@ -736,6 +736,9 @@
C4F906B327688C3A0026B847 /* NMSModule.hpp in Headers */ = {isa = PBXBuildFile; fileRef = C4F906B127688C3A0026B847 /* NMSModule.hpp */; };
C4F906B427688C3A0026B847 /* NMSModule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C4F906B227688C3A0026B847 /* NMSModule.cpp */; };
C4FB6CB22769DF0800963B07 /* GeometryCumSum.cpp in Sources */ = {isa = PBXBuildFile; fileRef = C4FB6CB12769DF0800963B07 /* GeometryCumSum.cpp */; };
+CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; };
+CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */; };
+CE9AFED728E54E3300566949 /* CPUInterp3D.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */; };
CEDB20EB2846D07100AE9DC4 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = CEDB20EA2846D07100AE9DC4 /* AppDelegate.m */; };
CEDB20F42846D07100AE9DC4 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = CEDB20F22846D07100AE9DC4 /* Main.storyboard */; };
CEDB20F62846D07200AE9DC4 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = CEDB20F52846D07200AE9DC4 /* Assets.xcassets */; };

@@ -1520,6 +1523,9 @@
C4F906B127688C3A0026B847 /* NMSModule.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = NMSModule.hpp; sourceTree = "<group>"; };
C4F906B227688C3A0026B847 /* NMSModule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = NMSModule.cpp; sourceTree = "<group>"; };
C4FB6CB12769DF0800963B07 /* GeometryCumSum.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = GeometryCumSum.cpp; sourceTree = "<group>"; };
+CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = "<group>"; };
+CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUInterp3D.cpp; sourceTree = "<group>"; };
+CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUInterp3D.hpp; sourceTree = "<group>"; };
CEDB20E72846D07100AE9DC4 /* demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = demo.app; sourceTree = BUILT_PRODUCTS_DIR; };
CEDB20E92846D07100AE9DC4 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
CEDB20EA2846D07100AE9DC4 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };

@@ -1818,6 +1824,8 @@
48887410215B639D0079B12E /* cpu */ = {
    isa = PBXGroup;
    children = (
+        CE9AFED428E54E3300566949 /* CPUInterp3D.cpp */,
+        CE9AFED528E54E3300566949 /* CPUInterp3D.hpp */,
        4DCF538B2892B16300B5B393 /* CPUHistogram.cpp */,
        4DCF538C2892B16400B5B393 /* CPUHistogram.hpp */,
        4DF87C512887D3F20003E2D4 /* CPUSvd.hpp */,

@@ -2642,6 +2650,7 @@
EBB38EC621E748B9005F76D7 /* shape */ = {
    isa = PBXGroup;
    children = (
+        CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */,
        4DCF538F2892B17000B5B393 /* ShapeHistogram.cpp */,
        4DF87C4D2887D39B0003E2D4 /* ShapeSvd.cpp */,
        48925F362744AC2A00919B37 /* ShapeROIAlign.cpp */,

@@ -2871,6 +2880,7 @@
92FF03BD23AA0B5A00AC97F6 /* Int8FunctionsOpt.h in Headers */,
92FF036623AA0B5A00AC97F6 /* CPUDetectionOutput.hpp in Headers */,
92FF04BC23AA0BFB00AC97F6 /* NonCopyable.hpp in Headers */,
+CE9AFED728E54E3300566949 /* CPUInterp3D.hpp in Headers */,
48FA474B23AA127B00172C3B /* Utils.hpp in Headers */,
4A224A1427D0C56E000A9260 /* ConvolutionWinogradBridge.hpp in Headers */,
4D9A935926255BDA00F9B43C /* DataStructures.pb-c.h in Headers */,

@@ -3422,6 +3432,7 @@
4D0C80E52862FC4700C7CAD6 /* CoreMLRaster.metal in Sources */,
92FF044123AA0B7100AC97F6 /* ShapeMoments.cpp in Sources */,
4D9A936026255BDA00F9B43C /* Model.pb-c.c in Sources */,
+CE9AFED628E54E3300566949 /* CPUInterp3D.cpp in Sources */,
92FF03AB23AA0B5A00AC97F6 /* ConvolutionInt8Executor.cpp in Sources */,
C4F906B427688C3A0026B847 /* NMSModule.cpp in Sources */,
48FA474523AA127B00172C3B /* Executor.cpp in Sources */,

@@ -3483,6 +3494,7 @@
4896D37925FE2A6B00717702 /* MNNPackedMatMulFP16.S in Sources */,
92FF02B623AA0B5A00AC97F6 /* CPUUnary.cpp in Sources */,
92FF032723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */,
+CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */,
92FF02CA23AA0B5A00AC97F6 /* MNNUnPackC4.S in Sources */,
48925F372744AC2A00919B37 /* ShapeROIAlign.cpp in Sources */,
92FF02E723AA0B5A00AC97F6 /* MNNDeconvRunForUnitDepthWise.S in Sources */,
@@ -48,7 +48,7 @@ def build_deps():
    extra_opts += ' -DMNN_INTERNAL=ON ' if IS_INTERNAL_BUILD else ' '
    extra_opts += ' -DMNN_BUILD_TORCH=ON ' if IS_BUILD_TORCH else ' '
    os.system('cmake ' + extra_opts +
-        '-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Debug \
+        '-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release \
        -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \
        -DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF .. && make MNN MNNTrain MNNConvert -j4')
else:
@@ -1508,7 +1508,7 @@ static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) {
        break;
    case 4:
        parse_res = PyArg_ParseTuple(args, "OOOl", &shape, &dataType, &data, &dimensionType)
-                    && isInts(shape) && isVals(data);
+                    && isInts(shape) && (isVals(data) || isInt(data));
        break;
    default:
        parse_res = false;

@@ -1518,7 +1518,7 @@ static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) {
        "\t0. (Var)\n"
        "\t1. (Tensor/Var, DimensionType)\n"
        "\t2. ([int], DataType, DimensionType)\n"
-        "\t3. ([int], DataType, tuple/ndarray, DimensionType)\n");
+        "\t3. ([int], DataType, ndarray/list/tuple/bytes/PyCapsule/int_addr, DimensionType)\n");
    return -1;
}
#ifdef PYMNN_EXPR_API

@@ -1589,7 +1589,7 @@ static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) {
        dataSize *= i;
    }
    void *pData = NULL;
-    if (data && !PyCapsule_CheckExact(data)) {
+    if (data && !PyCapsule_CheckExact(data) && !isInt(data)) {
        if (PyBytes_Check(data)) {
            int64_t total_len = PyBytes_Size(data);
            if (dataSize * itemsize != total_len) {

@@ -1626,15 +1626,20 @@ static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) {
        }
    } else {
        // no data input, set all zeros
-        // pycapsule input, copy data
+        // pycapsule/int_addr input, copy data
        pData = malloc(dataSize * itemsize);
-        if (data && PyCapsule_CheckExact(data)) {
-            auto src = PyCapsule_GetPointer(data, NULL);
-            if (src == nullptr) {
-                PyMNN_ERROR_LOG("PyMNNTensor_init: PyCapsule pointer is null.");
+        if (data) {
+            void* srcPtr = nullptr;
+            if (PyCapsule_CheckExact(data)) {
+                srcPtr = PyCapsule_GetPointer(data, NULL);
+            } else {
+                srcPtr = PyLong_AsVoidPtr(data);
+            }
+            if (srcPtr == nullptr) {
+                PyMNN_ERROR_LOG("PyMNNTensor_init: PyCapsule/int_addr pointer is null.");
                return -1;
            }
-            memcpy(pData, src, dataSize * itemsize);
+            memcpy(pData, srcPtr, dataSize * itemsize);
        } else {
            memset(pData, 0, dataSize * itemsize);
        }

@@ -1243,10 +1243,10 @@ static PyObject* PyMNNExpr_const(PyObject *self, PyObject *args, PyObject *kwarg
    PyObject *value, *shapes, *format = nullptr /* NCHW */, *type = nullptr /* DType_FLOAT */;
    static char *kwlist[] = { "value_list", "shape", "data_format", "dtype", NULL };
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|OO", kwlist, &value, &shapes, &format, &type)) {
-        PyMNN_ERROR("const require args: (ndarray/list/tuple/bytes/PyCapsule, [ints], |data_format, dtype)");
+        PyMNN_ERROR("const require args: (ndarray/list/tuple/bytes/PyCapsule/int_addr, [ints], |data_format, dtype)");
    }
-    if (!isVals(value) || !isInts(shapes) || (format != nullptr && !isdata_format(format)) || (type != nullptr && !isdtype(type))) {
-        PyMNN_ERROR("const require args: (ndarray/list/tuple/bytes/PyCapsule, [ints], |data_format, dtype)");
+    if ((!isVals(value) && !isInt(value)) || !isInts(shapes) || (format != nullptr && !isdata_format(format)) || (type != nullptr && !isdtype(type))) {
+        PyMNN_ERROR("const require args: (ndarray/list/tuple/bytes/PyCapsule/int_addr, [ints], |data_format, dtype)");
    }
    auto data_format = (format == nullptr ? NCHW : toEnum<Dimensionformat>(format));
    auto dtype = (type == nullptr ? DType_FLOAT : toEnum<DType>(type));

@@ -1268,6 +1268,8 @@ static PyObject* PyMNNExpr_const(PyObject *self, PyObject *args, PyObject *kwarg
    bool need_free = false;
    if (PyCapsule_CheckExact(value)) {
        data = PyCapsule_GetPointer(value, NULL);
+    } else if (isInt(value)) {
+        data = PyLong_AsVoidPtr(value);
    } else if (PyBytes_Check(value)) {
        int64_t bytesize = PyBytes_Size(value);
        data = toPtr(value, DType_UINT8, bytesize);
@@ -77,6 +77,8 @@ class UnitTest(unittest.TestCase):
        self.assertEqualArray(x.getNumpyData(), data)
        x = MNN.Tensor([2, 2], MNN.Halide_Type_Float, data.tobytes(), MNN.Tensor_DimensionType_Tensorflow)
        self.assertEqualArray(x.getNumpyData(), data)
+        x = MNN.Tensor([2, 2], MNN.Halide_Type_Float, data.__array_interface__['data'][0], MNN.Tensor_DimensionType_Tensorflow)
+        self.assertEqualArray(x.getNumpyData(), data)
        x = MNN.Tensor([2, 2], MNN.Halide_Type_Float, mp.array([[1., 2.], [3., 4.]]).ptr, MNN.Tensor_DimensionType_Tensorflow)
        self.assertEqualArray(x.getNumpyData(), data)
    def test_image_process(self):

@@ -274,6 +276,8 @@ class UnitTest(unittest.TestCase):
        self.assertEqualVar(expr.const(list_data, [2, 2]), data)
        self.assertEqualVar(expr.const(tuple_data, [2, 2]), data)
        self.assertEqualVar(expr.const(data, [2, 2]), data)
        self.assertEqualVar(expr.const(data.tobytes(), [2, 2]), data)
+        self.assertEqualVar(expr.const(data.__array_interface__['data'][0], [2, 2]), data)
        x = MNN.Tensor([2, 2], MNN.Halide_Type_Float, (1., 2., 3., 4.), MNN.Tensor_DimensionType_Tensorflow)
        self.assertEqualVar(expr.const(x.getHost(), [2, 2]), data)
    def test_conv2d(self):
@@ -586,9 +586,11 @@ struct BinaryOpT : public flatbuffers::NativeTable {
    typedef BinaryOp TableType;
    int32_t opType;
    DataType T;
+    int32_t activationType;
    BinaryOpT()
        : opType(0),
-          T(DataType_DT_FLOAT) {
+          T(DataType_DT_FLOAT),
+          activationType(0) {
    }
};

@@ -603,10 +605,14 @@ struct BinaryOp FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
    DataType T() const {
        return static_cast<DataType>(GetField<int32_t>(6, 1));
    }
+    int32_t activationType() const {
+        return GetField<int32_t>(8, 0);
+    }
    bool Verify(flatbuffers::Verifier &verifier) const {
        return VerifyTableStart(verifier) &&
               VerifyField<int32_t>(verifier, 4) &&
               VerifyField<int32_t>(verifier, 6) &&
+               VerifyField<int32_t>(verifier, 8) &&
               verifier.EndTable();
    }
    BinaryOpT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;

@@ -623,6 +629,9 @@ struct BinaryOpBuilder {
    void add_T(DataType T) {
        fbb_.AddElement<int32_t>(6, static_cast<int32_t>(T), 1);
    }
+    void add_activationType(int32_t activationType) {
+        fbb_.AddElement<int32_t>(8, activationType, 0);
+    }
    explicit BinaryOpBuilder(flatbuffers::FlatBufferBuilder &_fbb)
        : fbb_(_fbb) {
        start_ = fbb_.StartTable();

@@ -638,8 +647,10 @@ struct BinaryOpBuilder {
inline flatbuffers::Offset<BinaryOp> CreateBinaryOp(
    flatbuffers::FlatBufferBuilder &_fbb,
    int32_t opType = 0,
-    DataType T = DataType_DT_FLOAT) {
+    DataType T = DataType_DT_FLOAT,
+    int32_t activationType = 0) {
    BinaryOpBuilder builder_(_fbb);
+    builder_.add_activationType(activationType);
    builder_.add_T(T);
    builder_.add_opType(opType);
    return builder_.Finish();

@@ -3481,6 +3492,7 @@ inline void BinaryOp::UnPackTo(BinaryOpT *_o, const flatbuffers::resolver_function_t *_resolver) const {
    (void)_resolver;
    { auto _e = opType(); _o->opType = _e; };
    { auto _e = T(); _o->T = _e; };
+    { auto _e = activationType(); _o->activationType = _e; };
}

inline flatbuffers::Offset<BinaryOp> BinaryOp::Pack(flatbuffers::FlatBufferBuilder &_fbb, const BinaryOpT* _o, const flatbuffers::rehasher_function_t *_rehasher) {

@@ -3493,10 +3505,12 @@ inline flatbuffers::Offset<BinaryOp> CreateBinaryOp(flatbuffers::FlatBufferBuilder
    struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const BinaryOpT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
    auto _opType = _o->opType;
    auto _T = _o->T;
+    auto _activationType = _o->activationType;
    return MNN::CreateBinaryOp(
        _fbb,
        _opType,
-        _T);
+        _T,
+        _activationType);
}

inline PackParamT *PackParam::UnPack(const flatbuffers::resolver_function_t *_resolver) const {

@@ -4932,17 +4946,19 @@ inline const flatbuffers::TypeTable *PadValueModeTypeTable() {
inline const flatbuffers::TypeTable *BinaryOpTypeTable() {
    static const flatbuffers::TypeCode type_codes[] = {
        { flatbuffers::ET_INT, 0, -1 },
-        { flatbuffers::ET_INT, 0, 0 }
+        { flatbuffers::ET_INT, 0, 0 },
+        { flatbuffers::ET_INT, 0, -1 }
    };
    static const flatbuffers::TypeFunction type_refs[] = {
        DataTypeTypeTable
    };
    static const char * const names[] = {
        "opType",
-        "T"
+        "T",
+        "activationType"
    };
    static const flatbuffers::TypeTable tt = {
-        flatbuffers::ST_TABLE, 2, type_codes, type_refs, nullptr, names
+        flatbuffers::ST_TABLE, 3, type_codes, type_refs, nullptr, names
    };
    return &tt;
}
@@ -35,6 +35,9 @@ enum BinaryOpOperation : byte {
table BinaryOp {
    opType:int;
    T:DataType=DT_FLOAT;
+    // 0 -> No Activation
+    // 1 -> Relu
+    activationType:int=0;
}

table PackParam {
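A small C++ sketch of how the regenerated builder API above carries the new field; `BinaryOpOperation_ADD` is just an example opType from the same schema, not something this commit pins down:

```cpp
#include "MNN_generated.h"
#include "flatbuffers/flatbuffers.h"

flatbuffers::Offset<MNN::BinaryOp> buildReluFusedAdd(flatbuffers::FlatBufferBuilder& fbb) {
    // activationType = 1 requests a fused Relu after the binary op.
    return MNN::CreateBinaryOp(fbb,
                               MNN::BinaryOpOperation_ADD, // opType
                               MNN::DataType_DT_FLOAT,     // T
                               1 /* activationType: 1 -> Relu */);
}
```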
@@ -182,12 +182,43 @@ ErrorCode CoreMLBinary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
        MNN_ERROR("NPU Binary not support %s\n", MNN::EnumNameBinaryOpOperation(binaryType));
        break;
    }
-    if (oneInput) {
-        setLayerInputsAndOutputs(mLayer_, {mCoreMLBackend->getTensorName(input)}, {mCoreMLBackend->getTensorName(outputs[0])});
+    std::string binartInputName;
+    if(oneInput) {
+        binartInputName = mCoreMLBackend->getTensorName(input);
    } else {
-        setLayerInputsAndOutputs(mLayer_, {mCoreMLBackend->getTensorName(inputs[0]), mCoreMLBackend->getTensorName(inputs[1])}, {mCoreMLBackend->getTensorName(outputs[0])});
+        binartInputName = mCoreMLBackend->getTensorName(inputs[0]);
    }
+    std::string binaryOutputName = mCoreMLBackend->getTensorName(outputs[0]);
+    int activationType = 0;
+    if(mOp->type() == OpType_BinaryOp) {
+        activationType = mOp->main_as_BinaryOp()->activationType();
+    }
+    if (activationType == 1) {
+        binaryOutputName = binartInputName + "-" + binaryOutputName + "-Relu";
+    }
+
+    if (oneInput) {
+        setLayerInputsAndOutputs(mLayer_, {mCoreMLBackend->getTensorName(input)}, {binaryOutputName});
+    } else {
+        setLayerInputsAndOutputs(mLayer_, {mCoreMLBackend->getTensorName(inputs[0]), mCoreMLBackend->getTensorName(inputs[1])}, {binaryOutputName});
+    }
    mCoreMLBackend->addLayer(mLayer_);
+
+    if (activationType == 1) {
+        auto reluLayer = mCoreMLBackend->create<CoreML__Specification__NeuralNetworkLayer>();
+        core_ml__specification__neural_network_layer__init(reluLayer);
+        mCoreMLBackend->setLayerName(reluLayer, "BinaryRelu");
+        reluLayer->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_ACTIVATION;
+        reluLayer->activation = mCoreMLBackend->create<CoreML__Specification__ActivationParams>();
+        core_ml__specification__activation_params__init(reluLayer->activation);
+        reluLayer->activation->nonlinearity_type_case = CORE_ML__SPECIFICATION__ACTIVATION_PARAMS__NONLINEARITY_TYPE_RE_LU;
+        reluLayer->activation->relu = mCoreMLBackend->create<CoreML__Specification__ActivationReLU>();
+        core_ml__specification__activation_re_lu__init(reluLayer->activation->relu);
+        setLayerInputsAndOutputs(reluLayer, {binaryOutputName}, {mCoreMLBackend->getTensorName(outputs[0])});
+        mCoreMLBackend->addLayer(reluLayer);
+    }
+
    return NO_ERROR;
}
@@ -82,6 +82,9 @@ CPURuntime::CPURuntime(const Backend::Info& info) {
        ThreadPool::active();
    }
#endif
+#ifdef LOG_VERBOSE
+    MNN_PRINT("create CPURuntime:%p\n", this);
+#endif
}
CPURuntime:: ~ CPURuntime() {
#ifdef MNN_USE_THREAD_POOL

@@ -96,6 +99,9 @@ float CPURuntime::onGetMemoryInMB() {
    return staticMemoryInMB;
}

Backend* CPURuntime::onCreate(const BackendConfig* config) const {
    auto precision = mPrecision;
    size_t flags = mFlags;

@@ -103,6 +109,10 @@ Backend* CPURuntime::onCreate(const BackendConfig* config) const {
        precision = config->precision;
        flags = config->flags;
    }
+#ifdef LOG_VERBOSE
+    MNN_PRINT("cpu backend was created by runtime:%p\n", this);
+#endif
+
#ifdef MNN_USE_ARMV82
    auto core = MNNGetCoreFunctions();
    if (core->supportFp16arith && precision == BackendConfig::Precision_Low) {

@@ -122,6 +132,7 @@ Backend* CPURuntime::onCreate(const BackendConfig* config) const {
        return new AVX2Backend(this, flags);
    }
#endif
+
    return new CPUBackend(this, precision, MNN_FORWARD_CPU, flags);
}

@@ -148,6 +159,21 @@ void CPURuntime::onGabageCollect(int level) {
    mStaticAllocator->release(false);
}

+ReuseCopyTensorMap& CPURuntime::getReuseCopyTensorMap() {
+    return mReuseCopyTensorMap;
+}
+
+void CPURuntime::clearReuseCopyTensorMap() {
+    for (auto& iter : mReuseCopyTensorMap) {
+        Tensor* tensor = std::get<2>(iter.second);
+        if (TensorUtils::getDescribe(tensor)->useCount > 0) {
+            TensorUtils::getDescribe(tensor)->useCount--;
+        }
+    }
+    mReuseCopyTensorMap.clear();
+}
+
void CPURuntime::onConcurrencyBegin() const {
#ifdef MNN_USE_THREAD_POOL
    if (mThreadNumber > 1 && mPower != BackendConfig::Power_High) {

@@ -189,7 +215,10 @@ bool CPUBackend::addCreator(OpType t, Creator* c) {
}

CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode precision, MNNForwardType type, size_t flags) : Backend(type) {
-    mRuntime = runtime;
+#ifdef LOG_VERBOSE
+    MNN_PRINT("cpu backend create\n");
+#endif
+    mRuntime = const_cast<CPURuntime*>(runtime);
    std::shared_ptr<BufferAllocator::Allocator> defaultAlloc(BufferAllocator::Allocator::createRecurse(runtime->mStaticAllocator.get()));
    mDynamicAllocator.reset(new BufferAllocator(defaultAlloc));
    mStaticAllocator = runtime->mStaticAllocator;

@@ -485,6 +514,9 @@ Execution* CPUBackend::onCreate(const std::vector<Tensor*>& inputs, const std::v
    }
    return exe;
}
+const Runtime* CPUBackend::getRuntime() {
+    return mRuntime;
+}

bool CPUBackend::onClearBuffer() {
    mCache->reset();

@@ -540,6 +572,16 @@ void CPUBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
        wrapTensor->setType(dstType);
    }
    wrapTensor->buffer().host = (uint8_t*)MNNMemoryAllocAlign(getTensorSize(wrapTensor.get()) * wrapTensor->getType().bytes(), MNN_MEMORY_ALIGN_DEFAULT);
+
+#ifdef LOG_VERBOSE
+    MNN_PRINT("CPU backend copy tensor ptr:%p -> ptr:%p hostPtr:%p -> %p, format %d -> %d, dims: [",
+              srcTensor, dstTensor, srcTensor->host<void>(), dstTensor->host<void>(), TensorUtils::getDescribe(srcTensor)->dimensionFormat, TensorUtils::getDescribe(dstTensor)->dimensionFormat);
+    for (int i=0; i<srcTensor->dimensions(); ++i) {
+        MNN_PRINT("%d ", srcTensor->length(i));
+    }
+    MNN_PRINT("]\n");
+#endif
+
    TensorUtils::getDescribe(wrapTensor.get())->memoryType = Tensor::InsideDescribe::MEMORY_HOST;
    auto code = CPUCastCreator::cast(srcTensor, wrapTensor.get(), this, convertType);
    if (NO_ERROR != code) {

@@ -556,6 +598,15 @@ void CPUBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
    }
}

+ReuseCopyTensorMap& CPUBackend::getReuseCopyTensorMap() {
+    return mRuntime->getReuseCopyTensorMap();
+}
+
+void CPUBackend::clearReuseCopyTensorMap() {
+    mRuntime->clearReuseCopyTensorMap();
+}
+
class CPURuntimeCreator : public RuntimeCreator {
public:
    virtual Runtime* onCreate(const Backend::Info& info) const override {
@@ -29,6 +29,9 @@ public:
    virtual CompilerType onGetCompilerType() const override {
        return Compiler_Loop;
    }
+    MNN_PUBLIC ReuseCopyTensorMap& getReuseCopyTensorMap();
+
+    MNN_PUBLIC void clearReuseCopyTensorMap();
    void onConcurrencyBegin() const;
    void onConcurrencyEnd() const;

@@ -39,6 +42,7 @@ private:
    BackendConfig::MemoryMode mMemory;
    BackendConfig::PowerMode mPower;
    BackendConfig::PrecisionMode mPrecision;
+    ReuseCopyTensorMap mReuseCopyTensorMap;

    // Backend features
    // CPU features

@@ -71,7 +75,6 @@ public:
    const CoreFunctions* functions() const {
        return mCoreFunctions;
    }
    // Return element size for Tensor, consider pack
    int getTensorSize(const Tensor* tensor, bool multiBytes = false) const;
    const CoreInt8Functions* int8Functions() const {

@@ -106,12 +109,22 @@ public:
    CPUResizeCache* getCache() const {
        return mCache;
    }
+    virtual const Runtime* getRuntime() override;
+
+    ReuseCopyTensorMap& getReuseCopyTensorMap();
+
+    void clearReuseCopyTensorMap();

#ifdef MNN_USE_THREAD_POOL
    inline int taskIndex() const {return mRuntime->mTaskIndex;}
#endif
    static void initCreatorMap();
    static int getBytes(const Backend* backend, const Tensor* output);
    static DataType getDataType(const Tensor* tensor);

protected:
    MemObj* allocBuffer(int size, Tensor* dest, StorageType storageType);
    const CoreFunctions* mCoreFunctions;

@@ -119,7 +132,7 @@ protected:
private:
    std::shared_ptr<BufferAllocator> mStaticAllocator;
    std::shared_ptr<BufferAllocator> mDynamicAllocator;
-    const CPURuntime* mRuntime;
+    CPURuntime* mRuntime;
    BackendConfig::PrecisionMode mPrecisionMode;
    static std::map<OpType, CPUBackend::Creator*>* gCreator;
    std::map<const Tensor*, const Tensor*> mCachedCastTensor;
@@ -33,6 +33,11 @@ ErrorCode CPUBinary::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
        mTotalSize = input0DataCount;
    }
    MNN_ASSERT(mTotalSize == outputs[0]->elementSize());

+    if(mActivationType == 1 && outputs[0]->getType().code == halide_type_float) {
+        mActivationExe.reset(new CPURelu(backend(), 0.0));
+        mActivationExe->onResize(outputs, outputs);
+    }
    return NO_ERROR;
}

@@ -59,7 +64,9 @@ ErrorCode CPUBinary::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto schedule = ((CPUBackend*)backend())->multiThreadDivide(mTotalSize);
    auto input0Ptr = input->host<uint8_t>();
    auto input1Ptr = input1->host<uint8_t>();
-    auto outputPtr = output->host<uint8_t>();
+    auto outputPtr = outputs[0]->host<uint8_t>();

    int inpBytes = input->getType().bytes();
    int outBytes = output->getType().bytes();
    if (halide_type_float == input->getType().code) {

@@ -85,9 +92,20 @@ ErrorCode CPUBinary::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
            }
            auto out = outputPtr + start * outBytes;
            mProc(out, inp0, inp1, realSize, mNeedBroadcastIndex);
+            // Fused Relu for integer outputs: clamp negatives in place.
+            if(mActivationType == 1 && output->getType().code == halide_type_int) {
+                for(int i=0; i<realSize; i++) {
+                    auto val = ((int32_t *)out)[i];
+                    auto res = val > 0 ? val : 0;
+                    ((int32_t *)out)[i] = res;
+                }
+            }
        }
    }
    MNN_CONCURRENCY_END();

+    // Fused Relu for float outputs runs as a separate CPURelu pass.
+    if(mActivationType == 1 && output->getType().code == halide_type_float) {
+        mActivationExe->onExecute(outputs, outputs);
+    }
    return NO_ERROR;
}

@@ -215,13 +233,13 @@ public:
        if (nullptr == func) {
            return nullptr;
        }
-        return new CPUBinary(backend, func);
+        return new CPUBinary(backend, func, op->main_as_BinaryOp()->activationType());
    } else if (dataType.code == halide_type_float) {
        auto func = core->MNNSelectBinaryFunctionForFloat(type);
        if (nullptr == func) {
            return nullptr;
        }
-        return new CPUBinary(backend, func);
+        return new CPUBinary(backend, func, op->main_as_BinaryOp()->activationType());
    }
}
MNN_ERROR("CpuBinary: unsupported data type (bits: %d, code: %d)\n",
@@ -10,12 +10,14 @@
#define CPUBinary_hpp

#include "core/Execution.hpp"
+#include "backend/cpu/CPURelu.hpp"
#include "compute/CommonOptFunction.h"
namespace MNN {
class CPUBinary : public Execution {
public:
-    CPUBinary(Backend *b, MNNBinaryExecute proc) : Execution(b) {
+    CPUBinary(Backend *b, MNNBinaryExecute proc, int activationType) : Execution(b) {
        mProc = proc;
+        mActivationType = activationType;
    }
    virtual ~CPUBinary() = default;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

@@ -26,6 +28,8 @@ private:
    MNNBinaryExecute mProc;
    int mNeedBroadcastIndex = -1;
    int mTotalSize;
+    int mActivationType = 0;
+    std::shared_ptr<Execution> mActivationExe;
};
} // namespace MNN
#endif /* CPUBinary_hpp */
@@ -29,24 +29,13 @@ ErrorCode CPUHistogram::histogram(Tensor* input, Tensor* output) {
    auto iptr = input->host<T>() + mChannel;
    auto optr = output->host<float>();
    memset(optr, 0, mBinNum * sizeof(float));
-    auto numberThread = ((CPUBackend*)backend())->threadNumber();
-    int sizeDivide = mSize / numberThread;
-    MNN_CONCURRENCY_BEGIN(tId, numberThread) {
-        int number = sizeDivide;
-        if (tId == numberThread - 1) {
-            number = mSize - tId * sizeDivide;
-        }
-        auto src = iptr + tId * sizeDivide * mStride * sizeof(T);
-        auto dst = optr + tId * sizeDivide * mStride * sizeof(float);
-        for (int i = 0; i < number; i++) {
-            T val = src[i * mStride];
-            if (val >= mMin && val <= mMax) {
-                const int bin = (int)(val * mAlpha - mBeta);
-                dst[std::min(bin, mBinNum -1)]++;
-            }
+    for (int i = 0; i < mSize; i++) {
+        T val = iptr[i * mStride];
+        if (val >= mMin && val <= mMax) {
+            const int bin = (int)(val * mAlpha - mBeta);
+            optr[std::min(bin, mBinNum -1)]++;
        }
    }
-    MNN_CONCURRENCY_END();
    return NO_ERROR;
}
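The binning arithmetic `bin = (int)(val * mAlpha - mBeta)` is easier to read standalone. The sketch below assumes `mAlpha = binNum / (max - min)` and `mBeta = min * mAlpha`, one common way to precompute the scale; the derivation of `mAlpha`/`mBeta` is not shown in this hunk, so treat those two lines as an assumption:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const float minV = 0.f, maxV = 10.f;
    const int binNum = 5;
    // Assumed precomputation; mirrors bin = (int)(val * mAlpha - mBeta) above.
    const float alpha = binNum / (maxV - minV);
    const float beta  = minV * alpha;

    std::vector<float> values = {0.5f, 3.2f, 9.9f, 10.f, 4.4f};
    std::vector<float> hist(binNum, 0.f);
    for (float val : values) {
        if (val >= minV && val <= maxV) {
            const int bin = (int)(val * alpha - beta);
            hist[std::min(bin, binNum - 1)]++; // clamp val == max into the last bin
        }
    }
    for (int i = 0; i < binNum; ++i) {
        std::printf("bin %d: %.0f\n", i, hist[i]);
    }
    return 0;
}
```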
@@ -54,6 +54,7 @@ extern void ___CPURasterFactory__OpType_While__();
extern void ___CPUConvolutionDepthwiseCreator__OpType_ConvolutionDepthwise__();
extern void ___CPURangeCreator__OpType_Range__();
extern void ___CPUTFQuantizedConv2DCreator__OpType_TfQuantizedConv2D__();
+extern void ___CPUInterp3DCreator__OpType_Interp3D__();
extern void ___CPUQuantizedAvgPoolCreator__OpType_QuantizedAvgPool__();
extern void ___ConvolutionFactory__OpType_Convolution__();
extern void ___CPUConvInt8Creator__OpType_ConvInt8__();

@@ -65,7 +66,6 @@ extern void ___CPUSetDiff1DCreator__OpType_SetDiff1D__();
extern void ___CPUEltwiseInt8Creator__OpType_EltwiseInt8__();
extern void ___CPUSvdCreator__OpType_Svd__();
extern void ___CPULayerNormCreator__OpType_LayerNorm__();
-extern void ___CPUInterp3DCreator__OpType_Interp3D__();

void registerCPUOps() {
    ___CPUCropAndResizeCreator__OpType_CropAndResize__();

@@ -122,6 +122,7 @@ ___CPURasterFactory__OpType_While__();
___CPUConvolutionDepthwiseCreator__OpType_ConvolutionDepthwise__();
___CPURangeCreator__OpType_Range__();
___CPUTFQuantizedConv2DCreator__OpType_TfQuantizedConv2D__();
+___CPUInterp3DCreator__OpType_Interp3D__();
___CPUQuantizedAvgPoolCreator__OpType_QuantizedAvgPool__();
___ConvolutionFactory__OpType_Convolution__();
___CPUConvInt8Creator__OpType_ConvInt8__();

@@ -133,6 +134,5 @@ ___CPUSetDiff1DCreator__OpType_SetDiff1D__();
___CPUEltwiseInt8Creator__OpType_EltwiseInt8__();
___CPUSvdCreator__OpType_Svd__();
___CPULayerNormCreator__OpType_LayerNorm__();
-___CPUInterp3DCreator__OpType_Interp3D__();
}
}
@@ -21,9 +21,7 @@ template <typename T>
|
|||
ErrorCode CPURange<T>::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
|
||||
const T start = inputs[0]->host<T>()[0];
|
||||
const T delta = inputs[2]->host<T>()[0];
|
||||
|
||||
int32_t outputSize = outputs[0]->buffer().dim[0].extent;
|
||||
|
||||
auto flat = outputs[0]->host<T>();
|
||||
T val = start;
|
||||
for (int32_t i = 0; i < outputSize; ++i) {
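
For context, a Range op fills its output with an arithmetic progression; a minimal sketch of the computation (inputs[1], not read here, presumably carries the limit that already fixed outputSize):

#include <cstdint>

// Sketch of Range semantics: out[i] = start + i * delta.
template <typename T>
void rangeFill(T start, T delta, T* out, int32_t outputSize) {
    T val = start;
    for (int32_t i = 0; i < outputSize; ++i) {
        out[i] = val;
        val += delta;
    }
}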
|
||||
|
|
|
|||
|
|
@@ -57,7 +57,7 @@ bool AVX2Functions::init(int cpuFlags) {
|
|||
_AVX_ExtraInitFMA(coreFunction);
|
||||
}
|
||||
// For ImageProcess Functions
|
||||
_SSE_ImageProcessInit(coreFunction);
|
||||
_SSE_ImageProcessInit(coreFunction, cpuFlags);
|
||||
#ifdef MNN_AVX512
|
||||
if ((cpuFlags & libyuv::kCpuHasAVX512VNNI)
|
||||
|| (cpuFlags & libyuv::kCpuHasAVX512VL)
|
||||
|
|
|
|||
|
|
@@ -69,7 +69,7 @@ void MNNFunctionInit() {
|
|||
void MNNInt8FunctionInit() {
|
||||
auto cpuFlags = libyuv::InitCpuFlags();
|
||||
auto core = MNN::MNNGetInt8CoreFunctions();
|
||||
if (cpuFlags & libyuv::kCpuHasSSSE3) {
|
||||
if (cpuFlags & libyuv::kCpuHasSSE41) {
|
||||
core->MNNFloat2Int8 = _SSE_MNNFloat2Int8;
|
||||
core->MNNInt8ScaleToFloat = _SSE_MNNInt8ScaleToFloat;
|
||||
core->Int8GemmKernel = _SSE_MNNGemmInt8AddBiasScale_16x4_Unit;
|
||||
|
|
|
|||
|
|
@@ -68,4 +68,4 @@ void _SSE_MNNReluInt8(int8_t* dst, const int8_t* src, size_t size);
|
|||
void _SSE_MNNSoftmax(float* dest, const float* source, size_t size);
|
||||
void _SSE_ExtraInit(void* functions);
|
||||
void _SSE_MNNNorm(float *dst, const float *src, const float *gamma, const float *beta, float epsilon, size_t size);
|
||||
void _SSE_ImageProcessInit(void* functions);
|
||||
void _SSE_ImageProcessInit(void* functions, int cpuFlags);
|
||||
|
|
|
|||
|
|
@@ -13,6 +13,7 @@
|
|||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
// require SSE 4.1
|
||||
void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step,
|
||||
size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst) {
|
||||
const auto dst_step_tmp = dst_step / sizeof(int8_t);
|
||||
|
|
@@ -231,6 +232,7 @@ void _SSE_MNNReluInt8(int8_t* dst, const int8_t* src, size_t size) {
|
|||
}
|
||||
}
|
||||
|
||||
// require SSE 4.1
|
||||
void _SSE_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minV, ssize_t maxV, ssize_t zeroPoint) {
|
||||
__m128i zero = _mm_set1_epi32(0);
|
||||
__m128 minValue = _mm_set1_ps(minV);
|
||||
|
|
@@ -325,6 +327,7 @@ void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale,
|
|||
}
|
||||
}
|
||||
|
||||
// require SSE 4.1
|
||||
void _SSE_MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dstO, const int8_t* srcO, const int8_t* weightO, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step) {
|
||||
auto dst = dstO;
|
||||
auto src = (const int16_t*)srcO;
|
||||
|
|
|
|||
|
|
@@ -9,6 +9,7 @@
|
|||
#include <algorithm>
|
||||
#include "FunctionSummary.hpp"
|
||||
#include "core/Macro.h"
|
||||
#include "backend/cpu/x86_x64/cpu_id.h"
|
||||
|
||||
#define MNN_SSE_YUV_INIT \
|
||||
countUnit -= 1;\
|
||||
|
|
@@ -245,6 +246,7 @@ void _SSE_MNNNV21ToBGR(const unsigned char* source, unsigned char* dest, size_t
|
|||
}
|
||||
}
|
||||
|
||||
// require SSE 4.1
|
||||
void _SSE_MNNC1ToFloatC1(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count) {
|
||||
int remain = 0;
|
||||
int countC16 = count / 16;
|
||||
|
|
@@ -275,6 +277,7 @@ void _SSE_MNNC1ToFloatC1(const unsigned char* source, float* dest, const float*
|
|||
}
|
||||
}
|
||||
|
||||
// require SSE 4.1
|
||||
void _SSE_MNNC3ToFloatC3(const unsigned char* source, float* dest, const float* mean, const float* normal,
|
||||
size_t count) {
|
||||
int remain = 0;
|
||||
|
|
@@ -316,6 +319,7 @@ void _SSE_MNNC3ToFloatC3(const unsigned char* source, float* dest, const float*
|
|||
}
|
||||
}
|
||||
|
||||
// require SSE 4.1
|
||||
void _SSE_MNNC1ToFloatRGBA(const unsigned char* source, float* dest, const float* mean, const float* normal,
|
||||
size_t count) {
|
||||
::memset(dest, 0, 4 * sizeof(float) * count);
|
||||
|
|
@@ -382,6 +386,7 @@ void _SSE_MNNC1ToFloatRGBA(const unsigned char* source, float* dest, const float
|
|||
}
|
||||
}
|
||||
|
||||
// require SSE 4.1
|
||||
void _SSE_MNNC3ToFloatRGBA(const unsigned char* source, float* dest, const float* mean, const float* normal, size_t count) {
|
||||
int remain = 0;
|
||||
int countC4 = count / 4;
|
||||
|
|
@@ -424,14 +429,16 @@ void _SSE_MNNC3ToFloatRGBA(const unsigned char* source, float* dest, const float
|
|||
}
|
||||
}
|
||||
|
||||
void _SSE_ImageProcessInit(void* functions) {
|
||||
void _SSE_ImageProcessInit(void* functions, int cpuFlags) {
|
||||
auto coreFunction = static_cast<MNN::CoreFunctions*>(functions);
|
||||
coreFunction->MNNRGBAToBGRA = _SSE_MNNRGBAToBGRA;
|
||||
coreFunction->MNNNV21ToRGBA = _SSE_MNNNV21ToRGBA;
|
||||
coreFunction->MNNNV21ToRGB = _SSE_MNNNV21ToRGB;
|
||||
coreFunction->MNNNV21ToBGRA = _SSE_MNNNV21ToBGRA;
|
||||
coreFunction->MNNNV21ToBGR = _SSE_MNNNV21ToBGR;
|
||||
coreFunction->MNNC1ToFloatC1 = _SSE_MNNC1ToFloatC1;
|
||||
coreFunction->MNNC3ToFloatC3 = _SSE_MNNC3ToFloatC3;
|
||||
coreFunction->MNNC3ToFloatRGBA = _SSE_MNNC3ToFloatRGBA;
|
||||
if (cpuFlags & libyuv::kCpuHasSSE41) {
|
||||
coreFunction->MNNC1ToFloatC1 = _SSE_MNNC1ToFloatC1;
|
||||
coreFunction->MNNC3ToFloatC3 = _SSE_MNNC3ToFloatC3;
|
||||
coreFunction->MNNC3ToFloatRGBA = _SSE_MNNC3ToFloatRGBA;
|
||||
}
|
||||
}
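
The change above leaves the portable converters registered and overwrites the C1/C3-to-float entries only when the CPU reports SSE4.1. The same guarded-override pattern in miniature (the flag value and function names below are stand-ins):

// Install a generic implementation first, then replace it only when the
// required CPU feature is present -- the pattern used above with kCpuHasSSE41.
constexpr int kHasSSE41 = 1 << 5;   // stand-in; libyuv defines the real flag

struct ImageFuncs { void (*c1ToFloat)(const unsigned char*, float*, int); };

void genericC1ToFloat(const unsigned char*, float*, int) { /* portable path */ }
void sse41C1ToFloat(const unsigned char*, float*, int)   { /* SSE4.1 path  */ }

void initImageFuncs(ImageFuncs* f, int cpuFlags) {
    f->c1ToFloat = genericC1ToFloat;
    if (cpuFlags & kHasSSE41) {
        f->c1ToFloat = sse41C1ToFloat;
    }
}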
|
||||
|
|
|
|||
|
|
@@ -27,7 +27,7 @@ void _SSE_MNNExpC8(float* dest, const float* source, const float* offset, const
|
|||
auto p7 = _mm_set1_ps(parameters[7]);
|
||||
auto xMax = _mm_set1_ps(87);
|
||||
auto xMin = _mm_set1_ps(-87);
|
||||
auto basic = _mm_set1_epi32(1 << 23);
|
||||
// auto basic = _mm_set1_epi32(1 << 23);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
auto x = _mm_mul_ps(_mm_loadu_ps(source + i * 4), A);
|
||||
x = _mm_max_ps(x, xMin);
|
||||
|
|
@@ -36,7 +36,8 @@ void _SSE_MNNExpC8(float* dest, const float* source, const float* offset, const
|
|||
auto divInt = _mm_cvtps_epi32(div);
|
||||
div = _mm_cvtepi32_ps(divInt);
|
||||
auto div2 = _mm_add_epi32(divInt, _mm_set1_epi32(127));
|
||||
div2 = _mm_mullo_epi32(div2, basic);
|
||||
// div2 = _mm_mullo_epi32(div2, basic);
|
||||
div2 = _mm_slli_epi32(div2, 23);
|
||||
auto expBasic = _mm_castsi128_ps(div2);
|
||||
auto xReamin = _mm_sub_ps(x, _mm_mul_ps(div, p0));
|
||||
auto t = xReamin;
|
||||
|
|
@@ -89,7 +90,7 @@ void _SSE_MNNSoftmax(float* dest, const float* source, size_t size) {
|
|||
auto p7 = _mm_set1_ps(0.008333333333333333);
|
||||
auto xMax = _mm_set1_ps(87);
|
||||
auto xMin = _mm_set1_ps(-87);
|
||||
auto basic = _mm_set1_epi32(1 << 23);
|
||||
// auto basic = _mm_set1_epi32(1 << 23);
|
||||
for (int i = 0; i < count; ++i) {
|
||||
auto x = _mm_sub_ps(_mm_loadu_ps(source + i * 4), _mm_set1_ps(maxValue));
|
||||
x = _mm_max_ps(x, xMin);
|
||||
|
|
@@ -98,7 +99,8 @@ void _SSE_MNNSoftmax(float* dest, const float* source, size_t size) {
|
|||
auto divInt = _mm_cvtps_epi32(div);
|
||||
div = _mm_cvtepi32_ps(divInt);
|
||||
auto div2 = _mm_add_epi32(divInt, _mm_set1_epi32(127));
|
||||
div2 = _mm_mullo_epi32(div2, basic);
|
||||
// div2 = _mm_mullo_epi32(div2, basic);
|
||||
div2 = _mm_slli_epi32(div2, 23);
|
||||
auto expBasic = _mm_castsi128_ps(div2);
|
||||
auto xReamin = _mm_sub_ps(x, _mm_mul_ps(div, p0));
|
||||
auto t = xReamin;
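
Both exp hunks replace _mm_mullo_epi32(div2, basic), with basic = 1 << 23, by _mm_slli_epi32(div2, 23); either way the biased exponent div + 127 lands in bits 30..23 of the float, producing 2^div exactly (the shift just avoids a 32-bit multiply). A scalar illustration of the trick:

#include <cstdint>
#include <cstring>

// Build 2^n by writing the biased exponent (n + 127) into the IEEE-754
// exponent field -- the scalar equivalent of _mm_slli_epi32(div2, 23).
static float pow2i(int n) {
    const uint32_t bits = static_cast<uint32_t>(n + 127) << 23;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;   // pow2i(0) == 1.0f, pow2i(3) == 8.0f, pow2i(-2) == 0.25f
}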
|
||||
|
|
|
|||
|
|
@@ -1,8 +1,14 @@
|
|||
set(CUDA_MIN_VERSION "9.0")
|
||||
find_package(CUDA ${CUDA_MIN_VERSION})
|
||||
|
||||
set (EXTRA_LIBS "")
|
||||
if(MNN_CUDA_PROFILE)
|
||||
set(EXTRA_LIBS -lnvToolsExt)
|
||||
endif()
|
||||
|
||||
|
||||
if(CUDA_FOUND)
|
||||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_FORCE_INLINES -Wno-deprecated-gpu-targets -w")
|
||||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D_FORCE_INLINES -Wno-deprecated-gpu-targets -w ${EXTRA_LIBS}")
|
||||
if(CMAKE_BUILD_TYPE MATCHES Debug)
|
||||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O0")
|
||||
else()
|
||||
|
|
@@ -15,6 +21,7 @@ if(CUDA_FOUND)
|
|||
include(${CMAKE_CURRENT_SOURCE_DIR}/SelectCudaComputeArch.cmake)
|
||||
CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS ${CUDA_ARCHS})
|
||||
|
||||
|
||||
IF ((CUDA_VERSION VERSION_GREATER "9.0") OR (CUDA_VERSION VERSION_EQUAL "9.0"))
|
||||
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
|
||||
ENDIF()
|
||||
|
|
@@ -34,13 +41,20 @@ else()
|
|||
endif()
|
||||
|
||||
file(GLOB_RECURSE MNN_CUDA_SRC ${CMAKE_CURRENT_LIST_DIR}/core/* ${CMAKE_CURRENT_SOURCE_DIR}/execution/*)
|
||||
message(STATUS "message ${CUDA_NVCC_FLAGS} !!!!!!!!!!!")
|
||||
message(STATUS "message ${CUDA_NVCC_FLAGS} !!!!!!!!!!! ${CUDA_INCLUDE_DIRS}")
|
||||
|
||||
|
||||
if(WIN32)
|
||||
cuda_add_library(MNN_CUDA STATIC Register.cpp ${MNN_CUDA_SRC})
|
||||
set(MNN_CUDA_LIBS MNN_CUDA ${CUDA_LIBRARIES} PARENT_SCOPE)
|
||||
else()
|
||||
|
||||
cuda_add_library(MNN_Cuda_Main SHARED ${MNN_CUDA_SRC})
|
||||
if(MNN_CUDA_PROFILE)
|
||||
target_compile_options(MNN_Cuda_Main PRIVATE -DMNN_CUDA_PROFILE)
|
||||
target_link_libraries(MNN_Cuda_Main ${CUDA_INCLUDE_DIRS}/../lib/libnvToolsExt.so)
|
||||
endif()
|
||||
|
||||
set(MNN_CUDA_LIBS MNN_Cuda_Main PARENT_SCOPE)
|
||||
add_library(MNN_CUDA OBJECT Register.cpp)
|
||||
endif()
|
||||
|
|
@@ -52,3 +66,5 @@ include_directories(
|
|||
${CMAKE_CURRENT_SOURCE_DIR}/../../../3rd_party/cutlass/include
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@@ -18,6 +18,8 @@
|
|||
#include "execution/Transpose.cuh"
|
||||
#include "execution/MNNCUDADefine.hpp"
|
||||
|
||||
#include "CUDATools.hpp"
|
||||
|
||||
// #define MNN_CUDA_COPY_DEBUG
|
||||
|
||||
namespace MNN {
|
||||
|
|
@@ -47,6 +49,9 @@ private:
|
|||
CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, BackendConfig::PowerMode power) {
|
||||
// TODO: Search CUDA Device info and use best one
|
||||
mCUDARuntime.reset(new CUDARuntime(-1));
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("create cuda runtime:%p\n", mCUDARuntime.get());
|
||||
#endif
|
||||
if (mCUDARuntime.get()) {
|
||||
if (mCUDARuntime->isCreateError() == true) {
|
||||
mIsCreateError = true;
|
||||
|
|
@@ -66,6 +71,9 @@ float CUDARuntimeWrapper::onGetMemoryInMB() {
|
|||
}
|
||||
|
||||
Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config) const {
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("cudaruntime:%p, create CUDABackend\n", this);
|
||||
#endif
|
||||
auto mode = mDefaultPrecision;
|
||||
if (nullptr != config) {
|
||||
mode = config->precision;
|
||||
|
|
@@ -78,9 +86,13 @@ void CUDARuntimeWrapper::onGabageCollect(int level) {
|
|||
mBufferPool->release(false);
|
||||
}
|
||||
|
||||
|
||||
CUDABackend::CUDABackend(std::shared_ptr<BufferAllocator> st,
|
||||
std::shared_ptr<CUDARuntime> rt, bool useFp16AsFp32)
|
||||
: Backend(MNN_FORWARD_CUDA) {
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("cuda backend create\n");
|
||||
#endif
|
||||
mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(st.get())));
|
||||
mStaticBufferPool = st;
|
||||
mCUDARuntime = rt;
|
||||
|
|
@@ -97,6 +109,9 @@ CUDARuntime* CUDABackend::getCUDARuntime() {
|
|||
MNN_ASSERT(nullptr != mCUDARuntime.get());
|
||||
return mCUDARuntime.get();
|
||||
}
|
||||
const Runtime* CUDABackend::getRuntime() {
|
||||
return (const Runtime*)mCUDARuntime.get();
|
||||
}
|
||||
bool CUDABackend::useFp16() const {
|
||||
return mUseFp16AsFp32;
|
||||
}
|
||||
|
|
@@ -128,6 +143,7 @@ CPUResizeCache* CUDABackend::getCache() {
|
|||
}
|
||||
|
||||
Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType storageType) {
|
||||
// MNN_PRINT("onAcquire CUDA memory for tensor:%p\n", nativeTensor);
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("Start CUDABackend::onAcquireBuffer !\n");
|
||||
#endif
|
||||
|
|
@@ -181,17 +197,16 @@ size_t CUDABackend::realSize(const Tensor* tensor) {
|
|||
|
||||
Execution* CUDABackend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||
const MNN::Op* op) {
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("Start CUDABackend::onCreate \n");
|
||||
#endif
|
||||
// #ifdef LOG_VERBOSE
|
||||
// MNN_PRINT("Start CUDABackend::onCreate useFp16:%d\n", useFp16());
|
||||
// #endif
|
||||
auto creators = gCreator();
|
||||
auto iter = creators->find(op->type());
|
||||
|
||||
if (iter == creators->end()) {
|
||||
if (nullptr != op->name()) {
|
||||
MNN_PRINT("Don't support type %s, %s\n", EnumNameOpType(op->type()), op->name()->c_str());
|
||||
MNN_PRINT("CUDABackend Don't support type %s, %s\n", EnumNameOpType(op->type()), op->name()->c_str());
|
||||
} else {
|
||||
MNN_PRINT("Don't support type %s\n", EnumNameOpType(op->type()));
|
||||
MNN_PRINT("CUDABackend Don't support type %s\n", EnumNameOpType(op->type()));
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
|
@@ -199,9 +214,9 @@ Execution* CUDABackend::onCreate(const std::vector<Tensor*>& inputs, const std::
|
|||
auto exe = iter->second->onCreate(inputs, outputs, op, this);
|
||||
if (NULL == exe) {
|
||||
if (nullptr != op->name()) {
|
||||
MNN_PRINT("The Creator Don't support type %s, %s\n", EnumNameOpType(op->type()), op->name()->c_str());
|
||||
MNN_PRINT("CUDABackend The Creator Don't support type %s, %s\n", EnumNameOpType(op->type()), op->name()->c_str());
|
||||
} else {
|
||||
MNN_PRINT("The Creator Don't support type %s\n", EnumNameOpType(op->type()));
|
||||
MNN_PRINT("CUDABackend The Creator Don't support type %s\n", EnumNameOpType(op->type()));
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
|
@@ -222,6 +237,7 @@ void CUDABackend::onExecuteBegin() const {
|
|||
|
||||
void CUDABackend::onExecuteEnd() const {
|
||||
}
|
||||
|
||||
static void _computeStride(MNN_DATA_FORMAT srcDimensionFormat, int* srcStride, int batch, int plane, int channel, int srcPack) {
|
||||
if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
|
||||
srcStride[0] = plane * srcPack;
|
||||
|
|
@@ -283,9 +299,11 @@ static PackInfo _computePackInfo(MNN_DATA_FORMAT srcDimensionFormat, int batch,
|
|||
}
|
||||
|
||||
void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
|
||||
|
||||
|
||||
auto srcDimensionFormat = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
|
||||
auto dstDimensionFormat = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
|
||||
auto srcIndex = TensorUtils::getDescribe(srcTensor)->index;
|
||||
auto dstIndex = TensorUtils::getDescribe(dstTensor)->index;
|
||||
auto srcDevice = srcTensor->deviceId() != 0;
|
||||
auto dstDevice = dstTensor->deviceId() != 0;
|
||||
MNN_ASSERT(srcDevice || dstDevice);
|
||||
|
|
@@ -293,18 +311,6 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
|
|||
std::pair<void*, int> tempSrcStorage;
|
||||
auto bytes = getBytes(srcTensor);
|
||||
auto type = srcTensor->getType();
|
||||
#ifdef MNN_CUDA_COPY_DEBUG
|
||||
checkKernelErrors;
|
||||
MNN_PRINT("CUDA Bn copy: %d -> %d, format %d -> %d, dims: [", srcDevice, dstDevice, srcDimensionFormat, dstDimensionFormat);
|
||||
for (int i=0; i<srcTensor->dimensions(); ++i) {
|
||||
MNN_PRINT("%d ", srcTensor->length(i));
|
||||
if(srcDevice && !dstDevice) {
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
MNN_PRINT("], ");
|
||||
MNN_PRINT("addr:%p %p\n", srcTensor->deviceId(), dstTensor->deviceId());
|
||||
#endif
|
||||
|
||||
//printf("%d-%d\n", srcTensor->dimensions(), dstTensor->dimensions());
|
||||
bool directCopy = (srcDimensionFormat == dstDimensionFormat && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) || srcTensor->dimensions() <= 1;
|
||||
|
|
@@ -315,17 +321,40 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef MNN_CUDA_COPY_DEBUG
|
||||
checkKernelErrors;
|
||||
MNN_PRINT("CUDA Bn copy tensor ptr:%p -> ptr:%p deviceId:%d -> %d, hostPtr:%p -> %p, graphIndex: %d -> %d, format %d -> %d, directCopy: %d, dims: [",
|
||||
srcTensor, dstTensor, srcTensor->deviceId(), dstTensor->deviceId(), srcTensor->host<void>(), dstTensor->host<void>(), srcIndex, dstIndex, srcDimensionFormat, dstDimensionFormat, directCopy);
|
||||
|
||||
for (int i=0; i<srcTensor->dimensions(); ++i) {
|
||||
MNN_PRINT("%d ", srcTensor->length(i));
|
||||
if(srcDevice && !dstDevice) {
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
MNN_PRINT("], ");
|
||||
MNN_PRINT("addr:%p %p\n", srcTensor->deviceId(), dstTensor->deviceId());
|
||||
#endif
|
||||
|
||||
|
||||
if (directCopy) {
|
||||
auto gpuSize = realSize(srcTensor) * getBytes(srcTensor);
|
||||
if (srcDevice && dstDevice) {
|
||||
NVTX_PUSH("DtoD");
|
||||
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), gpuSize,
|
||||
MNNMemcpyDeviceToDevice, true);
|
||||
NVTX_POP();
|
||||
} else if (srcDevice && (!dstDevice)) {
|
||||
NVTX_PUSH("DtoH");
|
||||
mCUDARuntime->memcpy((void*)(dstTensor->host<void>()), (void*)(srcTensor->deviceId()), gpuSize,
|
||||
MNNMemcpyDeviceToHost, true);
|
||||
NVTX_POP();
|
||||
} else if ((!srcDevice) && (dstDevice)) {
|
||||
NVTX_PUSH("HtoD");
|
||||
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->host<void>()), gpuSize,
|
||||
MNNMemcpyHostToDevice, true);
|
||||
NVTX_POP();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
|
@@ -348,6 +377,7 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
|
|||
dstPtr = (uint8_t*)dstTensor->deviceId();
|
||||
}
|
||||
|
||||
NVTX_PUSH("copy convert");
|
||||
// Format convert
|
||||
int batch, plane, channel;
|
||||
_computeBCA(batch, plane, channel, srcDimensionFormat, srcTensor);
|
||||
|
|
@@ -368,8 +398,9 @@ void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor)
|
|||
auto cpuSize = dstTensor->size();
|
||||
mCUDARuntime->memcpy(dstTensor->host<void>(), dstPtr, cpuSize, MNNMemcpyDeviceToHost,
|
||||
true);
|
||||
mStaticBufferPool->free(tempDstStorage);
|
||||
mStaticBufferPool->free(tempDstStorage);
|
||||
}
|
||||
NVTX_POP();
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@@ -36,7 +36,7 @@ public:
|
|||
|
||||
private:
|
||||
std::shared_ptr<BufferAllocator> mBufferPool;
|
||||
std::shared_ptr<CUDARuntime> mCUDARuntime;
|
||||
std::shared_ptr<CUDARuntime> mCUDARuntime;
|
||||
bool mIsCreateError{false};
|
||||
BackendConfig::PrecisionMode mDefaultPrecision;
|
||||
};
|
||||
|
|
@@ -47,6 +47,7 @@ public:
|
|||
~CUDABackend();
|
||||
|
||||
CUDARuntime *getCUDARuntime();
|
||||
virtual const Runtime* getRuntime() override;
|
||||
virtual Backend::MemObj* onAcquire(const Tensor *nativeTensor, StorageType storageType) override;
|
||||
virtual bool onClearBuffer() override;
|
||||
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,167 @@
|
|||
// CUDATools.hpp
|
||||
// MNN
|
||||
//
|
||||
//  Created by MNN on 2022/09/05.
|
||||
// Copyright © 2022, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef CUDATools_hpp
#define CUDATools_hpp
|
||||
|
||||
|
||||
/*
|
||||
|
||||
Use nvprof by enabling the macro 'MNN_CUDA_PROFILE':
|
||||
cmake .. -DMNN_CUDA_PROFILE=ON -DMNN_CUDA=ON
|
||||
|
||||
*/
|
||||
#ifdef MNN_CUDA_PROFILE
|
||||
#include "nvToolsExt.h"
|
||||
#define NVTX_PUSH(...) nvtxRangePushA(__VA_ARGS__)
|
||||
#define NVTX_POP(...) nvtxRangePop(__VA_ARGS__)
|
||||
#else
|
||||
#define NVTX_PUSH(...)
|
||||
#define NVTX_POP(...)
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef MNN_CUDA_PROFILE
|
||||
#include <stdio.h>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <cxxabi.h>
|
||||
#include <execinfo.h>
|
||||
|
||||
// Stream-style log helper; the Trace variant below appends a stack trace.
|
||||
class MNNLogMessage {
|
||||
public:
|
||||
MNNLogMessage(const char* file, int line, const char* function)
|
||||
:
|
||||
log_stream_(std::cout)
|
||||
{
|
||||
const char* pos = file;
|
||||
const char* onlyName = file;
|
||||
while (*pos != 0) {
|
||||
if (*pos == '/') {
|
||||
onlyName = pos + 1;
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
log_stream_ << onlyName << ":"
|
||||
<< line << ":"
|
||||
<< function << ": ";
|
||||
}
|
||||
~MNNLogMessage() {
|
||||
log_stream_ << '\n';
|
||||
}
|
||||
std::ostream& stream() {
|
||||
return log_stream_;
|
||||
}
|
||||
std::string GetCachedString() {
|
||||
return "";
|
||||
}
|
||||
|
||||
protected:
|
||||
std::ostream& log_stream_;
|
||||
|
||||
private:
|
||||
MNNLogMessage(const MNNLogMessage&);
|
||||
void operator=(const MNNLogMessage&);
|
||||
};
|
||||
#if !defined(MNN_BUILD_FOR_ANDROID)
|
||||
inline std::string
|
||||
Demangle(char const* msg_str, std::ostringstream& os) {
|
||||
using std::string;
|
||||
string msg(msg_str);
|
||||
size_t symbol_start = string::npos;
|
||||
size_t symbol_end = string::npos;
|
||||
if (((symbol_start = msg.find("_Z")) != string::npos) &&
|
||||
(symbol_end = msg.find_first_of(" +", symbol_start))) {
|
||||
string left_of_symbol(msg, 0, symbol_start);
|
||||
string symbol(msg, symbol_start, symbol_end - symbol_start);
|
||||
string right_of_symbol(msg, symbol_end);
|
||||
|
||||
int status = 0;
|
||||
size_t length = string::npos;
|
||||
std::unique_ptr<char, void (*)(void* __ptr)> demangled_symbol = {
|
||||
abi::__cxa_demangle(symbol.c_str(), 0, &length, &status), &std::free};
|
||||
if (demangled_symbol && status == 0 && length > 0) {
|
||||
string symbol_str(demangled_symbol.get());
|
||||
os << left_of_symbol << symbol_str << right_of_symbol;
|
||||
return os.str();
|
||||
}
|
||||
}
|
||||
return string(msg_str);
|
||||
}
|
||||
|
||||
// By default skip the first two frames because
|
||||
// they belong to the logging machinery itself (e.g. ~TraceMNNLogMessage)
|
||||
inline std::string
|
||||
StackTrace(size_t start_frame = 2, const size_t stack_size = 12) {
|
||||
using std::string;
|
||||
std::ostringstream stacktrace_os;
|
||||
std::vector<void*> stack(stack_size);
|
||||
int nframes = backtrace(stack.data(), static_cast<int>(stack_size));
|
||||
stacktrace_os << "Stack trace:\n";
|
||||
char** msgs = backtrace_symbols(stack.data(), nframes);
|
||||
if (msgs != nullptr) {
|
||||
for (int frameno = start_frame; frameno < nframes; ++frameno) {
|
||||
stacktrace_os << " [bt] (" << frameno - start_frame << ") ";
|
||||
string msg = Demangle(msgs[frameno], stacktrace_os);
|
||||
stacktrace_os << "\n";
|
||||
}
|
||||
}
|
||||
free(msgs);
|
||||
string stack_trace = stacktrace_os.str();
|
||||
return stack_trace;
|
||||
}
|
||||
|
||||
|
||||
class TraceMNNLogMessage : public MNNLogMessage {
|
||||
public:
|
||||
TraceMNNLogMessage(const char* file, int line, const char* function) : MNNLogMessage(file, line, function) {}
|
||||
~TraceMNNLogMessage() {
|
||||
log_stream_ << "\n" << StackTrace(1);
|
||||
}
|
||||
};
|
||||
|
||||
#define MNN_LOG_TRACE TraceMNNLogMessage(__FILE__, __LINE__, __FUNCTION__)
|
||||
|
||||
#else
|
||||
#define MNN_LOG_TRACE MNNLogMessage(__FILE__, __LINE__, __FUNCTION__)
|
||||
#endif
|
||||
|
||||
#define MNN_LOG_INFO MNNLogMessage(__FILE__, __LINE__, __FUNCTION__)
|
||||
#define MNN_LOG(severity) MNN_LOG_##severity.stream()
|
||||
#define MNN_LOG_IF(severity, condition) \
|
||||
!(condition) ? (void)0 : MNNLogMessageVoidify() & MNN_LOG(severity)
|
||||
|
||||
// deal with release/debug
|
||||
// debug mode
|
||||
#if !defined(NDEBUG)
|
||||
#define MNN_DLOG(severity) MNN_LOG(severity)
|
||||
#define MNN_DLOG_IF(severity, condition) MNN_LOG_IF(severity, (condition))
|
||||
|
||||
// release mode: all MNN_DLOG() code compiles away to nothing.
|
||||
#else
|
||||
#define MNN_DLOG(severity) MNN_LOG_IF(severity, false)
|
||||
#define MNN_DLOG_IF(severity, condition) MNN_LOG_IF(severity, false)
|
||||
|
||||
#endif
|
||||
|
||||
// This class is used to explicitly ignore values in the conditional
|
||||
// logging macros. This avoids compiler warnings like "value computed
|
||||
// is not used" and "statement has no effect".
|
||||
class MNNLogMessageVoidify {
|
||||
public:
|
||||
MNNLogMessageVoidify() {}
|
||||
// This has to be an operator with a precedence lower than << but
|
||||
// higher than "?:". See its usage.
|
||||
void operator&(std::ostream&) {}
|
||||
};
|
||||
|
||||
#endif
|
||||
#endif
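
With CUDATools.hpp in place, host-side regions can be bracketed so they appear as named ranges in nvprof/Nsight. A minimal usage sketch, assuming the header above is included and MNN_CUDA_PROFILE is defined at build time (with it undefined, both macros expand to nothing):

// Name a profiling range around a host-side region.
void profiledRegion() {
    NVTX_PUSH("HtoD copy");
    // ... cudaMemcpy / kernel launches ...
    NVTX_POP();
    MNN_LOG(INFO) << "copy finished";   // stream-style logging from above
}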
|
||||
|
||||
|
||||
|
|
@@ -15,14 +15,9 @@
|
|||
#include <utility>
|
||||
#include <vector>
|
||||
#include "core/Macro.h"
|
||||
// #define MNN_CUDA_USE_BLAS
|
||||
|
||||
//#define MNN_OPEN_TIME_TRACE
|
||||
#include <MNN/AutoTime.hpp>
|
||||
#define STR_HELPER(x) #x
|
||||
#define STR(x) STR_HELPER(x)
|
||||
|
||||
#undef STR
|
||||
#undef STR_HELPER
|
||||
|
||||
namespace MNN {
|
||||
|
||||
|
|
@@ -32,7 +27,7 @@ bool CUDARuntime::isCreateError() const {
|
|||
|
||||
CUDARuntime::CUDARuntime(int device_id) {
|
||||
#ifdef LOG_VERBOSE
|
||||
MNN_PRINT("start CUDARuntime !\n");
|
||||
MNN_PRINT("start CUDARuntime id:%d\n", device_id);
|
||||
#endif
|
||||
int version;
|
||||
cuda_check(cudaRuntimeGetVersion(&version));
|
||||
|
|
@@ -40,6 +35,7 @@ CUDARuntime::CUDARuntime(int device_id) {
|
|||
if (id < 0) {
|
||||
cuda_check(cudaGetDevice(&id));
|
||||
}
|
||||
// printf("use GPU device id:%d\n", id);
|
||||
// id = selectDeviceMaxFreeMemory();
|
||||
// cuda_check(cudaSetDevice(id));
|
||||
|
||||
|
|
@@ -68,23 +64,24 @@ int CUDARuntime::selectDeviceMaxFreeMemory() {
|
|||
cudaDeviceProp deviceProp;
|
||||
int deviceCount;
|
||||
cuda_check(cudaGetDeviceCount(&deviceCount));
|
||||
|
||||
|
||||
// Check id:0 card info
|
||||
int id = 0;
|
||||
cuda_check(cudaSetDevice(0));
|
||||
size_t total_size, free_size_max;
|
||||
cuda_check(cudaMemGetInfo(&free_size_max, &total_size));
|
||||
//printf("card:0, free:%zu, total:%zu\n", free_size_max, total_size);
|
||||
size_t total_size = 0, free_size_max = 0;
|
||||
cudaError_t memStatus = cudaMemGetInfo(&free_size_max, &total_size);
|
||||
cuda_check(memStatus);
|
||||
// printf("card:0, free:%zu, total:%zu, memStatusSuccess:%d\n", free_size_max, total_size, memStatus == cudaSuccess);
|
||||
|
||||
for(int i = 1; i < deviceCount; i++) {
|
||||
cuda_check(cudaSetDevice(i));
|
||||
size_t free_size;
|
||||
cuda_check(cudaMemGetInfo(&free_size, &total_size));
|
||||
cuda_check(cudaMemGetInfo(&free_size, &total_size));
|
||||
if(free_size > free_size_max) {
|
||||
free_size_max = free_size;
|
||||
id = i;
|
||||
}
|
||||
//printf("card:%d, free:%zu, total:%zu\n", i, free_size, total_size);
|
||||
}
|
||||
// printf("card:%d, free:%zu, total:%zu\n", i, free_size, total_size);
|
||||
}
|
||||
return id;
|
||||
}
|
||||
|
|
@@ -102,7 +99,7 @@ size_t CUDARuntime::blocks_num(const size_t total_threads) {
|
|||
// } else {
|
||||
// mThreadPerBlock = 128;
|
||||
// }
|
||||
|
||||
|
||||
mThreadPerBlock = 128;
|
||||
return (total_threads + mThreadPerBlock - 1) / mThreadPerBlock;
|
||||
}
|
||||
|
|
@@ -170,4 +167,7 @@ void CUDARuntime::memset(void *dst, int value, size_t size_in_bytes) {
|
|||
cuda_check(cudaMemset(dst, value, size_in_bytes));
|
||||
checkKernelErrors;
|
||||
}
|
||||
|
||||
|
||||
|
||||
} // namespace MNN
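
selectDeviceMaxFreeMemory above keeps whichever device cudaMemGetInfo reports as having the most free bytes. A standalone equivalent for reference:

#include <cuda_runtime.h>

// Query each visible device and keep the one with the largest free-memory
// figure, mirroring the selection loop above.
int selectDeviceMaxFree() {
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    int best = 0;
    size_t bestFree = 0;
    for (int i = 0; i < deviceCount; ++i) {
        cudaSetDevice(i);
        size_t freeBytes = 0, totalBytes = 0;
        if (cudaMemGetInfo(&freeBytes, &totalBytes) == cudaSuccess
            && freeBytes > bestFree) {
            bestFree = freeBytes;
            best = i;
        }
    }
    return best;
}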
|
||||
|
|
|
|||
|
|
@@ -1,4 +1,5 @@
|
|||
#include "ArgMaxExecution.hpp"
|
||||
#include "ArgMinExecution.hpp"
|
||||
#include "core/TensorUtils.hpp"
|
||||
|
||||
namespace MNN {
|
||||
|
|
@@ -14,9 +15,9 @@ __global__ void ARGMAX(const int count, const int outside, const int inside, con
|
|||
int* outPtr = output + inside * o;
|
||||
const T* inpPtr = input + inside * dim * o;
|
||||
int index = 0;
|
||||
T maxValue = inpPtr[n+0*inside];
|
||||
T maxValue = inpPtr[n + 0 * inside];
|
||||
for(int j=1; j<dim; j++) {
|
||||
T value = inpPtr[n+j*inside];
|
||||
T value = inpPtr[n + j * inside];
|
||||
if(maxValue < value) {
|
||||
index = j;
|
||||
maxValue = value;
|
||||
|
|
@@ -24,6 +25,7 @@ __global__ void ARGMAX(const int count, const int outside, const int inside, con
|
|||
}
|
||||
outPtr[n] = index;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
ArgMaxExecution::ArgMaxExecution(const Op* op, Backend *backend) : Execution(backend) {
|
||||
|
|
@@ -60,10 +62,10 @@ ErrorCode ArgMaxExecution::onExecute(const std::vector<Tensor *> &inputs, const
|
|||
|
||||
auto input = (void *)inputs[0]->deviceId();
|
||||
auto output = (void *)outputs[0]->deviceId();
|
||||
|
||||
int count = mOutside * mInside;
|
||||
int block_num = runtime->blocks_num(count);
|
||||
int thread_num = runtime->threads_num();
|
||||
|
||||
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);
|
||||
|
||||
if(bytes == 4) {
|
||||
|
|
@@ -73,7 +75,7 @@ ErrorCode ArgMaxExecution::onExecute(const std::vector<Tensor *> &inputs, const
|
|||
ARGMAX<<<block_num, thread_num>>>(count, mOutside, mInside, mDim, (const half*)input,(int *)output);
|
||||
checkKernelErrors;
|
||||
}
|
||||
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
class ArgMaxCreator : public CUDABackend::Creator {
|
||||
|
|
@@ -84,10 +86,17 @@ public:
|
|||
if (TensorUtils::getDescribe(input)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
|
||||
return nullptr;
|
||||
}
|
||||
return new ArgMaxExecution(op, backend);
|
||||
if (op->type() == OpType_ArgMax) {
|
||||
return new ArgMaxExecution(op, backend);
|
||||
} else {
|
||||
return new ArgMinExecution(op, backend);
|
||||
}
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
static CUDACreatorRegister<ArgMaxCreator> __init(OpType_ArgMax);
|
||||
static CUDACreatorRegister<ArgMaxCreator> __init_op2(OpType_ArgMin);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,90 @@
|
|||
//
|
||||
// ArgMinExecution.cpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2022/06/29.
|
||||
// Copyright © 2018 - 2022, Alibaba Group Holding Limited
|
||||
//
|
||||
#include "ArgMinExecution.hpp"
|
||||
#include "core/TensorUtils.hpp"
|
||||
#include <MNN/AutoTime.hpp>
|
||||
|
||||
namespace MNN {
|
||||
namespace CUDA {
|
||||
|
||||
template <typename T>
|
||||
__global__ void ARGMIN(const int count, const int outside, const int inside, const int dim,
|
||||
const T *input, int *output) {
|
||||
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
|
||||
const int o = i / inside;
|
||||
const int n = i % inside;
|
||||
|
||||
int* outPtr = output + inside * o;
|
||||
const T* inpPtr = input + inside * dim * o;
|
||||
int index = 0;
|
||||
T minValue = inpPtr[n + 0 * inside];
|
||||
for(int j=1; j<dim; j++) {
|
||||
T value = inpPtr[n + j * inside];
|
||||
if(minValue > value) {
|
||||
index = j;
|
||||
minValue = value;
|
||||
}
|
||||
}
|
||||
outPtr[n] = index;
|
||||
}
|
||||
return;
|
||||
}
|
||||
ArgMinExecution::ArgMinExecution(const Op* op, Backend *backend) : Execution(backend) {
|
||||
mOp = op;
|
||||
mAxis = mOp->main_as_ArgMax()->axis();
|
||||
}
|
||||
|
||||
ArgMinExecution::~ArgMinExecution(){
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
ErrorCode ArgMinExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto input = inputs[0];
|
||||
auto output = outputs[0];
|
||||
|
||||
if (mAxis < 0) {
|
||||
mAxis = input->dimensions() + mAxis;
|
||||
}
|
||||
|
||||
mInside = 1;
|
||||
mOutside = 1;
|
||||
for (int i=0; i<mAxis; ++i) {
|
||||
mOutside *= input->length(i);
|
||||
}
|
||||
for (int i=mAxis+1; i<input->dimensions(); ++i) {
|
||||
mInside *= input->length(i);
|
||||
}
|
||||
mDim = input->length(mAxis);
|
||||
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
ErrorCode ArgMinExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||
auto runtime = static_cast<CUDABackend *>(backend())->getCUDARuntime();
|
||||
|
||||
auto input = (void *)inputs[0]->deviceId();
|
||||
auto output = (void *)outputs[0]->deviceId();
|
||||
int count = mOutside * mInside;
|
||||
int block_num = runtime->blocks_num(count);
|
||||
int thread_num = runtime->threads_num();
|
||||
|
||||
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);
|
||||
|
||||
if(bytes == 4) {
|
||||
ARGMIN<<<block_num, thread_num>>>(count, mOutside, mInside, mDim, (const float*)input,(int *)output);
|
||||
checkKernelErrors;
|
||||
} else {
|
||||
ARGMIN<<<block_num, thread_num>>>(count, mOutside, mInside, mDim, (const half*)input,(int *)output);
|
||||
checkKernelErrors;
|
||||
}
|
||||
return NO_ERROR;
|
||||
}
|
||||
|
||||
}
|
||||
}
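
For each (outside, inside) pair the ARGMIN kernel scans the reduced axis of length dim with stride inside and records the index of the minimum. A host-side reference for checking the CUDA path (hypothetical helper, not part of the commit):

#include <vector>

// Host reference for ARGMIN: out[o * inside + n] is the index of the
// minimum of input[inside * dim * o + n + j * inside] over j in [0, dim).
std::vector<int> argminRef(const std::vector<float>& input,
                           int outside, int inside, int dim) {
    std::vector<int> output(static_cast<size_t>(outside) * inside);
    for (int o = 0; o < outside; ++o) {
        for (int n = 0; n < inside; ++n) {
            const float* inp = input.data() + static_cast<size_t>(inside) * dim * o;
            int index = 0;
            float minValue = inp[n];
            for (int j = 1; j < dim; ++j) {
                const float v = inp[n + j * inside];
                if (v < minValue) { minValue = v; index = j; }
            }
            output[static_cast<size_t>(o) * inside + n] = index;
        }
    }
    return output;
}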
|
||||
|
|
@@ -0,0 +1,33 @@
|
|||
//
|
||||
// ArgMinExecution.hpp
|
||||
// MNN
|
||||
//
|
||||
// Created by MNN on 2022/06/29.
|
||||
// Copyright © 2018 - 2022, Alibaba Group Holding Limited
|
||||
//
|
||||
|
||||
#ifndef ArgMinExecution_hpp
|
||||
#define ArgMinExecution_hpp
|
||||
#include <vector>
|
||||
#include "backend/cuda/core/CUDABackend.hpp"
|
||||
#include "core/Execution.hpp"
|
||||
namespace MNN {
|
||||
namespace CUDA {
|
||||
class ArgMinExecution : public Execution {
|
||||
public:
|
||||
ArgMinExecution(const Op* op, Backend *backend);
|
||||
virtual ~ArgMinExecution();
|
||||
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
const Op* mOp;
|
||||
int mAxis;
|
||||
int mInside;
|
||||
int mOutside;
|
||||
int mDim;
|
||||
};
|
||||
} // namespace CUDA
|
||||
} // namespace MNN
|
||||
|
||||
#endif
|
||||
|
|
@@ -32,8 +32,9 @@ __global__ void LOGICALOR(const T *input0, const T* input1, T *output, size_t co
|
|||
}
|
||||
return;
|
||||
}
|
||||
BinaryExecution::BinaryExecution(int opType, Backend *backend) : Execution(backend) {
|
||||
BinaryExecution::BinaryExecution(int opType, Backend *backend, int activationType) : Execution(backend) {
|
||||
mType = opType;
|
||||
mActivationType = activationType;
|
||||
}
|
||||
BinaryExecution::~BinaryExecution(){
|
||||
// Do nothing
|
||||
|
|
@@ -59,7 +60,7 @@ ErrorCode BinaryExecution::onExecute(const std::vector<Tensor *> &inputs, const
|
|||
auto input0 = (uint8_t*)input0T->deviceId();
|
||||
auto input1 = (uint8_t*)input1T->deviceId();
|
||||
auto output = (uint8_t*)outputT->deviceId();
|
||||
BinaryBlit(output, input0, input1, size, stride0, stride1, stride2, type, runtime, mType);
|
||||
BinaryBlit(output, input0, input1, size, stride0, stride1, stride2, type, runtime, mType, mActivationType);
|
||||
};
|
||||
computeFunction(inputs[0], inputs[1], outputs[0]);
|
||||
for (int i=2; i<inputs.size(); ++i) {
|
||||
|
|
@@ -72,7 +73,8 @@ public:
|
|||
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||
const MNN::Op* op, Backend* backend) const override {
|
||||
if (op->type() == OpType_BinaryOp) {
|
||||
return new BinaryExecution(op->main_as_BinaryOp()->opType(), backend);
|
||||
//MNN_PRINT("binary act:%d\n", op->main_as_BinaryOp()->activationType());
|
||||
return new BinaryExecution(op->main_as_BinaryOp()->opType(), backend, op->main_as_BinaryOp()->activationType());
|
||||
}
|
||||
if (op->type() == OpType_Eltwise) {
|
||||
switch (op->main_as_Eltwise()->type()) {
|
||||
|
|
|
|||
|
|
@@ -15,12 +15,13 @@ namespace MNN {
|
|||
namespace CUDA {
|
||||
class BinaryExecution : public Execution {
|
||||
public:
|
||||
BinaryExecution(int opType, Backend *backend);
|
||||
BinaryExecution(int opType, Backend *backend, int activationType = 0);
|
||||
virtual ~BinaryExecution();
|
||||
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
|
||||
|
||||
private:
|
||||
int mType;
|
||||
int mActivationType;
|
||||
};
|
||||
} // namespace CUDA
|
||||
} // namespace MNN
|
||||
|
|
|
|||
|
|
@@ -8,8 +8,6 @@
|
|||
|
||||
#include "ConvCutlassExecution.hpp"
|
||||
#include "Raster.cuh"
|
||||
#include "MNNCUDADefine.hpp"
|
||||
#include "MNNCUDAFunction.cuh"
|
||||
|
||||
//#define DEBUG
|
||||
|
||||
|
|
@@ -230,8 +228,8 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
|
|||
mGemmInfo.elh[1] = l;
|
||||
mGemmInfo.elh[2] = h;
|
||||
mGemmInfo.elhPad[0] = UP_DIV(e, 8) * 8;
|
||||
mGemmInfo.elhPad[1] = UP_DIV(l, 8) * 8;;
|
||||
mGemmInfo.elhPad[2] = UP_DIV(h, 8) * 8;;
|
||||
mGemmInfo.elhPad[1] = UP_DIV(l, 8) * 8;
|
||||
mGemmInfo.elhPad[2] = UP_DIV(h, 8) * 8;
|
||||
|
||||
//MNN_PRINT("Activate:%d \n", mActivationType);
|
||||
//MNN_PRINT("Im2Col:%d-%d-%d temp size:%zu!!!\n\n",output->width(), ic, mIm2ColParamter.kernelX, (size_t)sizeof(__half) * mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK);
|
||||
|
|
@@ -283,10 +281,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
|
|||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = Gemm_F16_Relu_Sm70::get_workspace_size(arguments);
|
||||
|
||||
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
|
||||
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
|
||||
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
|
||||
pool->free(buffer3);
|
||||
pool->free(bufferWs);
|
||||
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmF16ReluSm70.can_implement(arguments);
|
||||
|
|
@@ -307,10 +305,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
|
|||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = Gemm_F32_Relu_Sm70::get_workspace_size(arguments);
|
||||
|
||||
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
|
||||
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
|
||||
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
|
||||
pool->free(buffer3);
|
||||
pool->free(bufferWs);
|
||||
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmF32ReluSm70.can_implement(arguments);
|
||||
|
|
@@ -335,10 +333,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
|
|||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = Gemm_F16_Relu6_Sm70::get_workspace_size(arguments);
|
||||
|
||||
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
|
||||
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
|
||||
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
|
||||
pool->free(buffer3);
|
||||
pool->free(bufferWs);
|
||||
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmF16Relu6Sm70.can_implement(arguments);
|
||||
|
|
@@ -359,10 +357,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
|
|||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = Gemm_F32_Relu6_Sm70::get_workspace_size(arguments);
|
||||
|
||||
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
|
||||
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
|
||||
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
|
||||
pool->free(buffer3);
|
||||
pool->free(bufferWs);
|
||||
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmF32Relu6Sm70.can_implement(arguments);
|
||||
|
|
@@ -385,10 +383,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
|
|||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = Gemm_F16_Linear_Sm70::get_workspace_size(arguments);
|
||||
|
||||
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
|
||||
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
|
||||
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
|
||||
pool->free(buffer3);
|
||||
pool->free(bufferWs);
|
||||
|
||||
cutlass::Status status = mGemmF16LnSm70.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
|
@@ -406,10 +404,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
|
|||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = Gemm_F32_Linear_Sm70::get_workspace_size(arguments);
|
||||
|
||||
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
|
||||
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
|
||||
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
|
||||
pool->free(buffer3);
|
||||
pool->free(bufferWs);
|
||||
|
||||
cutlass::Status status = mGemmF32LnSm70.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
|
@@ -435,10 +433,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
|
|||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = Gemm_F16_Relu_Sm75::get_workspace_size(arguments);
|
||||
|
||||
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
|
||||
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
|
||||
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
|
||||
pool->free(buffer3);
|
||||
pool->free(bufferWs);
|
||||
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmF16ReluSm75.can_implement(arguments);
|
||||
|
|
@@ -459,10 +457,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
|
|||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = Gemm_F32_Relu_Sm75::get_workspace_size(arguments);
|
||||
|
||||
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
|
||||
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
|
||||
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
|
||||
pool->free(buffer3);
|
||||
pool->free(bufferWs);
|
||||
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmF32ReluSm75.can_implement(arguments);
|
||||
|
|
@@ -487,10 +485,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
|
|||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = Gemm_F16_Relu6_Sm75::get_workspace_size(arguments);
|
||||
|
||||
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
|
||||
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
|
||||
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
|
||||
pool->free(buffer3);
|
||||
pool->free(bufferWs);
|
||||
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmF16Relu6Sm75.can_implement(arguments);
|
||||
|
|
@@ -511,10 +509,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
|
|||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = Gemm_F32_Relu6_Sm75::get_workspace_size(arguments);
|
||||
|
||||
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
|
||||
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
|
||||
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
|
||||
pool->free(buffer3);
|
||||
pool->free(bufferWs);
|
||||
|
||||
// Check the problem size is supported or not
|
||||
cutlass::Status status = mGemmF32Relu6Sm75.can_implement(arguments);
|
||||
|
|
@@ -537,10 +535,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
|
|||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = Gemm_F16_Linear_Sm75::get_workspace_size(arguments);
|
||||
|
||||
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
|
||||
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
|
||||
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
|
||||
pool->free(buffer3);
|
||||
pool->free(bufferWs);
|
||||
|
||||
cutlass::Status status = mGemmF16LnSm75.can_implement(arguments);
|
||||
cutlass_check(status);
|
||||
|
|
@@ -558,10 +556,10 @@ ErrorCode ConvCutlassExecution::onResize(const std::vector<Tensor*> &inputs, con
|
|||
split_k_slices}; // <- k-dimension split factor
|
||||
size_t workspace_size = Gemm_F32_Linear_Sm75::get_workspace_size(arguments);
|
||||
|
||||
auto buffer3 = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)buffer3.first + buffer3.second;
|
||||
auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
|
||||
mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
|
||||
runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
|
||||
pool->free(buffer3);
|
||||
pool->free(bufferWs);
|
||||
|
||||
cutlass::Status status = mGemmF32LnSm75.can_implement(arguments);
|
||||
cutlass_check(status);
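
The hunks above are a mechanical rename of buffer3 to bufferWs; every branch follows the same workspace pattern: size the GEMM workspace, take it from the pool, zero it, and return the slot once the arguments are checked. A condensed sketch with a stand-in pool (MNN's pool keeps the pointer usable until the next allocation, which is why free() can precede use):

#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

// Stand-in for MNN's buffer pool: alloc() hands back a (base, offset) pair,
// free() only marks the slot reusable.
struct Pool {
    std::vector<uint8_t> storage;
    std::pair<void*, size_t> alloc(size_t bytes) {
        const size_t off = storage.size();
        storage.resize(off + bytes);
        return { storage.data(), off };
    }
    void free(std::pair<void*, size_t>) { /* mark reusable */ }
};

uint8_t* prepareWorkspace(Pool& pool, size_t workspaceSize) {
    auto bufferWs = pool.alloc(workspaceSize);
    auto* ws = static_cast<uint8_t*>(bufferWs.first) + bufferWs.second;
    std::memset(ws, 0, workspaceSize);   // runtime->memset in the real code
    pool.free(bufferWs);                 // slot returned, pointer still valid
    return ws;
}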
|
||||
|
|
|
|||
|
|
@@ -11,15 +11,12 @@
|
|||
#include "backend/cuda/core/CUDABackend.hpp"
|
||||
#include "core/Execution.hpp"
|
||||
#include "CutlassGemmParam.hpp"
|
||||
#include "MNNCUDADefine.hpp"
|
||||
#include "MNNCUDAFunction.cuh"
|
||||
|
||||
namespace MNN {
|
||||
namespace CUDA {
|
||||
|
||||
struct CutlassGemmInfo{
|
||||
int elh[3];
|
||||
int elhPad[3];
|
||||
};
|
||||
|
||||
class ConvCutlassExecution : public Execution {
|
||||
public:
|
||||
struct Resource {
|
||||
|
|
|
|||
|
|
@@ -13,7 +13,10 @@ __global__ void CONV_DW(const T* input,
|
|||
const half* kernel,
|
||||
const half* bias,
|
||||
T *output,
|
||||
const constBuffer* uConstant
|
||||
const constBuffer* uConstant,
|
||||
DivModFast d_oc,
|
||||
DivModFast d_ow,
|
||||
DivModFast d_oh
|
||||
) {
|
||||
float maxV = uConstant->maxValue;
|
||||
float minV = uConstant->minValue;
|
||||
|
|
@@ -32,17 +35,18 @@ __global__ void CONV_DW(const T* input,
|
|||
int pw = uConstant->pad[0];
|
||||
int ph = uConstant->pad[1];
|
||||
|
||||
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) {
|
||||
int i = index / c_p;
|
||||
int oz = index % c_p;
|
||||
int ob = i / (ow * oh);
|
||||
int tmp = i % (ow * oh);
|
||||
int oy = tmp / ow;
|
||||
int ox = tmp % ow;
|
||||
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total/2; index += blockDim.x * gridDim.x) {
|
||||
int oz_2, tmp2, oy, ox, tmp1, ob;
|
||||
d_oc.divmod(index, tmp1, oz_2);
|
||||
d_ow.divmod(tmp1, tmp2, ox);
|
||||
d_oh.divmod(tmp2, ob, oy);
|
||||
|
||||
int oz = oz_2 << 1;
|
||||
int ix = ox * sw - pw;
|
||||
int iy = oy * sh - ph;
|
||||
float color = bias[oz];
|
||||
float color0 = bias[oz];
|
||||
float color1 = bias[oz+1];
|
||||
|
||||
int fxSta = max(0, (UP_DIV(-ix, dw)));
|
||||
int fySta = max(0, (UP_DIV(-iy, dh)));
|
||||
int fxEnd = min(kw, UP_DIV(iw - ix, dw));
|
||||
|
|
@@ -52,32 +56,190 @@ __global__ void CONV_DW(const T* input,
|
|||
int sy = fy*dh + iy;
|
||||
for (fx=fxSta; fx<fxEnd; ++fx) {
|
||||
int sx = fx*dw + ix;
|
||||
float inp = input[0
|
||||
+ sx * c_p
|
||||
+ sy * iw * c_p
|
||||
+ ob * iw * ih * c_p
|
||||
+ oz
|
||||
];
|
||||
float ker = kernel[0
|
||||
+ fx
|
||||
+ fy * kw
|
||||
+ oz * kw * kh
|
||||
];
|
||||
color = color + inp * ker;
|
||||
int src_offset = ((ob * ih + sy) * iw + sx) * c_p + oz;
|
||||
float inp0 = input[src_offset];
|
||||
float inp1 = input[src_offset+1];
|
||||
|
||||
float ker0 = kernel[(fy * kw + fx) * c_p + oz];
|
||||
float ker1 = kernel[(fy * kw + fx) * c_p + oz + 1];
|
||||
|
||||
color0 = color0 + inp0 * ker0;
|
||||
color1 = color1 + inp1 * ker1;
|
||||
}
|
||||
}
|
||||
color = max(color, minV);
|
||||
color = min(color, maxV);
|
||||
color0 = max(color0, minV);
|
||||
color0 = min(color0, maxV);
|
||||
|
||||
output[0
|
||||
+ ox * c_p
|
||||
+ oy * ow * c_p
|
||||
+ ob * ow * oh * c_p
|
||||
+ oz
|
||||
] = color;
|
||||
color1 = max(color1, minV);
|
||||
color1 = min(color1, maxV);
|
||||
|
||||
int dst_offset = ((ob * oh + oy) * ow + ox) * c_p + oz;
|
||||
|
||||
output[dst_offset] = color0;
|
||||
output[dst_offset+1] = color1;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void CONV_DW_HALF2_OPT(const half2* input,
|
||||
const half2* kernel,
|
||||
const half2* bias,
|
||||
half2 *output,
|
||||
const constBuffer* uConstant,
|
||||
DivModFast d_oc,
|
||||
DivModFast d_ow,
|
||||
DivModFast d_oh
|
||||
) {
|
||||
float maxV = uConstant->maxValue;
|
||||
float minV = uConstant->minValue;
|
||||
int iw = uConstant->inputSize[0];
|
||||
int ih = uConstant->inputSize[1];
|
||||
int c = uConstant->channel;
|
||||
int c_p = c * PACK_NUMBER / 2;
|
||||
int ow = uConstant->outputSize[0];
|
||||
int oh = uConstant->outputSize[1];
|
||||
int kw = uConstant->kernelSize[0];
|
||||
int kh = uConstant->kernelSize[1];
|
||||
int sw = uConstant->stride[0];
|
||||
int sh = uConstant->stride[1];
|
||||
int pw = uConstant->pad[0];
|
||||
int ph = uConstant->pad[1];
|
||||
|
||||
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total/2; index += blockDim.x * gridDim.x) {
|
||||
int oz_2, tmp2, oy, ox, tmp1, ob;
|
||||
d_oc.divmod(index, tmp1, oz_2);
|
||||
d_ow.divmod(tmp1, tmp2, ox);
|
||||
d_oh.divmod(tmp2, ob, oy);
|
||||
|
||||
int oz = oz_2;
|
||||
int ix = ox * sw - pw;
|
||||
int iy = oy * sh - ph;
|
||||
half2 color = bias[oz];
|
||||
|
||||
int fxSta = max(0, -ix);
|
||||
int fySta = max(0, -iy);
|
||||
int fxEnd = min(kw, iw - ix);
|
||||
int fyEnd = min(kh, ih - iy);
|
||||
int fx, fy, fz;
|
||||
for (fy=fySta; fy<fyEnd; ++fy) {
|
||||
int sy = fy + iy;
|
||||
for (fx=fxSta; fx<fxEnd; ++fx) {
|
||||
int sx = fx + ix;
|
||||
int src_offset = ((ob * ih + sy) * iw + sx) * c_p + oz;
|
||||
half2 inp = input[src_offset];
|
||||
half2 ker = kernel[(fy * kw + fx) * c_p + oz];
|
||||
|
||||
color = __hfma2(inp, ker, color);
|
||||
}
|
||||
}
|
||||
color.x = max(color.x, minV);
|
||||
color.x = min(color.x, maxV);
|
||||
|
||||
color.y = max(color.y, minV);
|
||||
color.y = min(color.y, maxV);
|
||||
|
||||
int dst_offset = ((ob * oh + oy) * ow + ox) * c_p + oz;
|
||||
output[dst_offset] = color;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__ void CONV_DW3x3_HALF2_OPT(const half2* input,
|
||||
const half2* kernel,
|
||||
const half2* bias,
|
||||
half2 *output,
|
||||
const constBuffer* uConstant,
|
||||
DivModFast d_oc,
|
||||
DivModFast d_ow,
|
||||
DivModFast d_oh
|
||||
) {
|
||||
float maxV = uConstant->maxValue;
|
||||
float minV = uConstant->minValue;
|
||||
int iw = uConstant->inputSize[0];
|
||||
int ih = uConstant->inputSize[1];
|
||||
int c = uConstant->channel;
|
||||
int c_p = c * PACK_NUMBER / 2;
|
||||
int ow = uConstant->outputSize[0];
|
||||
int oh = uConstant->outputSize[1];
|
||||
|
||||
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total/4; index += blockDim.x * gridDim.x) {
|
||||
int oz_2, tmp2, oy, ox_2, tmp1, ob;
|
||||
d_oc.divmod(index, tmp1, oz_2);
|
||||
d_ow.divmod(tmp1, tmp2, ox_2);
|
||||
d_oh.divmod(tmp2, ob, oy);
|
||||
|
||||
int oz = oz_2;
|
||||
int ox = ox_2 << 1;
|
||||
int ix = ox - 1;
|
||||
int iy = oy - 1;
|
||||
half2 color0 = bias[oz];
|
||||
half2 color1 = color0;
|
||||
|
||||
half2 zero;
|
||||
zero.x = (half)0.0;
|
||||
zero.y = (half)0.0;
|
||||
|
||||
half2 inp[12];
|
||||
half2 ker[3][3];
|
||||
for(int j=0; j<3; j++) {
|
||||
if(iy < 0 && j==0) {
|
||||
for(int i=0; i<4; i++) {
|
||||
inp[i] = zero;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if(iy+2 > ih-1 && j==2) {
|
||||
for(int i=0; i<4; i++) {
|
||||
inp[8+i] = zero;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
for(int i=0; i<4; i++) {
|
||||
if(ix < 0 && i==0) {
|
||||
for(int j=0; j<3; j++) {
|
||||
inp[4*j+0] = zero;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if(ix+3 > iw-1 && i==3) {
|
||||
for(int j=0; j<3; j++) {
|
||||
inp[4*j+3] = zero;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
int src_offset = ((ob * ih + iy+j) * iw + ix+i) * c_p + oz;
|
||||
inp[4*j+i] = input[src_offset];
|
||||
}
|
||||
}
|
||||
|
||||
for(int j=0; j<3; j++) {
|
||||
for(int i=0; i<3; i++) {
|
||||
ker[j][i] = kernel[(j * 3 + i) * c_p + oz];
|
||||
}
|
||||
}
|
||||
|
||||
for(int j=0; j<3; j++) {
|
||||
for(int i=0; i<3; i++) {
|
||||
color0 = __hfma2(inp[4*j+i], ker[j][i], color0);
|
||||
color1 = __hfma2(inp[4*j+i+1], ker[j][i], color1);
|
||||
}
|
||||
}
|
||||
|
||||
color0.x = max(color0.x, minV);
|
||||
color0.x = min(color0.x, maxV);
|
||||
color0.y = max(color0.y, minV);
|
||||
color0.y = min(color0.y, maxV);
|
||||
|
||||
color1.x = max(color1.x, minV);
|
||||
color1.x = min(color1.x, maxV);
|
||||
color1.y = max(color1.y, minV);
|
||||
color1.y = min(color1.y, maxV);
|
||||
|
||||
int dst_offset = ((ob * oh + oy) * ow + ox) * c_p + oz;
|
||||
output[dst_offset] = color0;
|
||||
output[dst_offset+c_p] = color1;
|
||||
}
|
||||
}
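
All three kernels above recover (ob, oy, ox, oz) from the flat thread index with DivModFast instead of raw / and %. A plain sketch of the decomposition they perform, with ordinary operators in place of MNN's fast-division helper:

// Unflatten index -> (ob, oy, ox, oz_2) exactly as the divmod chain above.
struct Coord { int ob, oy, ox, oz2; };

static Coord unflatten(int index, int ocHalf, int ow, int oh) {
    Coord c;
    const int tmp1 = index / ocHalf;   // d_oc.divmod(index, tmp1, oz_2)
    c.oz2 = index % ocHalf;
    const int tmp2 = tmp1 / ow;        // d_ow.divmod(tmp1, tmp2, ox)
    c.ox  = tmp1 % ow;
    c.ob  = tmp2 / oh;                 // d_oh.divmod(tmp2, ob, oy)
    c.oy  = tmp2 % oh;
    return c;
}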
|
||||
|
||||
__global__ void CONV_DW_OPT(const float* input, const half* kernel, const half* bias, float *output, const constBuffer* uConstant,
|
||||
DivModFast d_oc,
|
||||
|
|
@@ -88,6 +250,8 @@ __global__ void CONV_DW_OPT(const float* input, const half* kernel, const half*
|
|||
float minV = uConstant->minValue;
|
||||
int iw = uConstant->inputSize[0];
|
||||
int ih = uConstant->inputSize[1];
|
||||
int ow = uConstant->outputSize[0];
|
||||
int oh = uConstant->outputSize[1];
|
||||
int kw = uConstant->kernelSize[0];
|
||||
int kh = uConstant->kernelSize[1];
|
||||
int sw = uConstant->stride[0];
|
||||
|
|
@@ -97,15 +261,18 @@ __global__ void CONV_DW_OPT(const float* input, const half* kernel, const half*
|
|||
int c = uConstant->channel;
|
||||
int c_p = c * PACK_NUMBER;
|
||||
|
||||
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) {
|
||||
int oz, tmp2, oy, ox, tmp1, ob;
|
||||
d_oc.divmod(index, tmp1, oz);
|
||||
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total / 2; index += blockDim.x * gridDim.x) {
|
||||
int oz_2, tmp2, oy, ox, tmp1, ob;
|
||||
d_oc.divmod(index, tmp1, oz_2);
|
||||
d_ow.divmod(tmp1, tmp2, ox);
|
||||
d_oh.divmod(tmp2, ob, oy);
|
||||
|
||||
int oz = oz_2 << 1;
|
||||
int ix = ox * sw - pw;
|
||||
int iy = oy * sh - ph;
|
||||
float color = bias[oz];
|
||||
float color0 = bias[oz];
|
||||
float color1 = bias[oz+1];
|
||||
|
||||
int fxSta = max(0, -ix);
|
||||
int fySta = max(0, -iy);
|
||||
int fxEnd = min(kw, iw - ix);
|
||||
|
|
@@ -115,25 +282,28 @@ __global__ void CONV_DW_OPT(const float* input, const half* kernel, const half*
|
|||
int sy = fy + iy;
|
||||
for (fx=fxSta; fx<fxEnd; ++fx) {
|
||||
int sx = fx + ix;
|
||||
float inp = input[0
|
||||
+ sx * c_p
|
||||
+ sy * iw * c_p
|
||||
+ ob * iw * ih * c_p
|
||||
+ oz
|
||||
];
|
||||
float ker = kernel[0
|
||||
+ fx
|
||||
+ fy * kw
|
||||
+ oz * kw * kh
|
||||
];
|
||||
color = color + inp * ker;
|
||||
}
|
||||
}
|
||||
color = max(color, minV);
|
||||
color = min(color, maxV);
|
||||
int src_offset = ((ob * ih + sy) * iw + sx) * c_p + oz;
|
||||
float inp0 = input[src_offset];
|
||||
float inp1 = input[src_offset+1];
|
||||
|
||||
output[index] = color;
|
||||
}
|
||||
float ker0 = kernel[(fy * kw + fx) * c_p + oz];
|
||||
float ker1 = kernel[(fy * kw + fx) * c_p + oz + 1];
|
||||
|
||||
color0 = color0 + inp0 * ker0;
|
||||
color1 = color1 + inp1 * ker1;
|
||||
}
|
||||
}
|
||||
color0 = max(color0, minV);
|
||||
color0 = min(color0, maxV);
|
||||
|
||||
color1 = max(color1, minV);
|
||||
color1 = min(color1, maxV);
|
||||
|
||||
int dst_offset = ((ob * oh + oy) * ow + ox) * c_p + oz;
|
||||
|
||||
output[dst_offset] = color0;
|
||||
output[dst_offset+1] = color1;
|
||||
}
|
||||
}
|
||||
|
||||
static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op* op, Backend* bn) {
|
||||
|
|
@@ -162,21 +332,22 @@ static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op*
|
|||
int weightSize = 0;
|
||||
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
|
||||
ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize);
|
||||
auto tempWeightStorage = pool->alloc(weightSize * sizeof(float));
|
||||
auto tempWeightStorage = pool->alloc(depthC * PACK_NUMBER * kernelY * kernelX * sizeof(float));
|
||||
auto tempWeight = (uint8_t*)tempWeightStorage.first + tempWeightStorage.second;
|
||||
cuda_check(cudaMemset(tempWeight, 0, depthC * PACK_NUMBER * kernelY * kernelX * sizeof(float)));
|
||||
cuda_check(cudaMemcpy(tempWeight, filterDataPtr, weightSize*sizeof(float), cudaMemcpyHostToDevice));
|
||||
reg.size[0] = 1;
|
||||
reg.size[1] = depthC * PACK_NUMBER;
|
||||
reg.size[2] = kernelY * kernelX;
|
||||
reg.size[1] = kernelY * kernelX;
|
||||
reg.size[2] = depthC * PACK_NUMBER;
|
||||
reg.srcStride[0] = 0;
|
||||
reg.srcStride[1] = kernelY * kernelX;
|
||||
reg.srcStride[2] = 1;
|
||||
reg.srcStride[1] = 1;
|
||||
reg.srcStride[2] = kernelY * kernelX;
|
||||
reg.dstStride[0] = 0;
|
||||
reg.dstStride[1] = kernelY * kernelX;
|
||||
reg.dstStride[1] = depthC * PACK_NUMBER;
|
||||
reg.dstStride[2] = 1;
|
||||
offset[0] = 1;
|
||||
offset[1] = depth;
|
||||
offset[2] = kernelY * kernelX;
|
||||
offset[1] = kernelY * kernelX;
|
||||
offset[2] = depth;
|
||||
offset[3] = 0;
|
||||
offset[4] = 1;
|
||||
offset[5] = reg.size[1];
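The new size/stride settings transpose the filter from a [channel, kh*kw] layout to [kh*kw, channel], which is what lets CONV_DW_OPT index the weights as kernel[(fy * kw + fx) * c_p + oz] and read adjacent channels from adjacent addresses. A CPU reference for the same reorder (a sketch under those layout assumptions, not MNN's raster path):

// Transpose a depthwise filter from [c_p, kh*kw] to [kh*kw, c_p] layout.
void reorderDepthwiseWeight(const float* src, float* dst, int c_p, int kh, int kw) {
    for (int k = 0; k < kh * kw; ++k) {
        for (int c = 0; c < c_p; ++c) {
            // for a fixed tap k, adjacent channels land at adjacent addresses
            dst[k * c_p + c] = src[c * kh * kw + k];
        }
    }
}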
@@ -276,7 +447,7 @@ ErrorCode ConvDepthWiseExecution::onResize(const std::vector<Tensor *> &inputs,
    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
    runtime->memcpy((uint8_t*)mConstBuffer.first + mConstBuffer.second, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice);
    mTotalCount = parameters.total;
    //printf("%d-%d-%d-%d, %d-%d-%d-%d-%d\n", parameters.kernelSize[0], parameters.kernelSize[1], parameters.stride[0], parameters.stride[1], parameters.inputSize[0], parameters.inputSize[1], channel, parameters.outputSize[0], parameters.outputSize[1]);
    //MNN_PRINT("%d-%d-%d-%d, %d-%d-%d-%d-%d\n", parameters.kernelSize[0], parameters.kernelSize[1], parameters.stride[0], parameters.stride[1], parameters.inputSize[0], parameters.inputSize[1], channel, parameters.outputSize[0], parameters.outputSize[1]);
    return NO_ERROR;
}

@@ -287,10 +458,31 @@ ErrorCode ConvDepthWiseExecution::onExecute(const std::vector<Tensor *> &inputs,
    int threads_num = ALIMIN(prop.maxThreadsPerBlock/2, limitThreads);
    int block_num = prop.multiProcessorCount;
    auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second;

    DivModFast d_oc(parameters.channel * PACK_NUMBER / 2);
    DivModFast d_ow(parameters.outputSize[0]);
    DivModFast d_oh(parameters.outputSize[1]);

    if (static_cast<CUDABackend*>(backend())->useFp16()) {
        if (inputs.size() == 1) {
            if(parameters.kernelSize[0]==3 && parameters.kernelSize[1]==3 && parameters.stride[0]==1 && parameters.stride[1]==1 && parameters.pad[0]==1 && parameters.pad[1]==1 && parameters.outputSize[0] % 2 ==0) {
                DivModFast d_ow2(parameters.outputSize[0]/2);

                CONV_DW3x3_HALF2_OPT<<<block_num, threads_num>>>((const half2*)inputs[0]->deviceId(), (const half2*)mResource->mFilter,
                    (const half2*)mResource->mBias, (half2*)outputs[0]->deviceId(), (const constBuffer*)(constPtr),
                    d_oc, d_ow2, d_oh);
                checkKernelErrors;
                return NO_ERROR;
            }
            if(parameters.dilate[0] == 1 && parameters.dilate[1] == 1) {
                CONV_DW_HALF2_OPT<<<block_num, threads_num>>>((const half2*)inputs[0]->deviceId(), (const half2*)mResource->mFilter,
                    (const half2*)mResource->mBias, (half2*)outputs[0]->deviceId(), (const constBuffer*)(constPtr),
                    d_oc, d_ow, d_oh);//_HALF_OPT
                checkKernelErrors;
            } else {
                CONV_DW<<<block_num, threads_num>>>((const half*)inputs[0]->deviceId(), (const half*)mResource->mFilter,
                    (const half*)mResource->mBias, (half*)outputs[0]->deviceId(), (const constBuffer*)(constPtr));
                    (const half*)mResource->mBias, (half*)outputs[0]->deviceId(), (const constBuffer*)(constPtr),
                    d_oc, d_ow, d_oh);
                checkKernelErrors;
            }
            return NO_ERROR;
        }
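All dispatch paths now hand DivModFast helpers to the kernels so the flat-index decomposition avoids hardware integer division. A reference for the assumed interface (the production class precomputes a magic multiplier; this sketch uses plain division only to show the contract):

struct DivModFastRef {
    int d;
    explicit DivModFastRef(int divisor) : d(divisor) {}
    // Splits v into v / d and v % d in one call; the real implementation
    // replaces the division with a multiply-high plus shift.
    __host__ __device__ void divmod(int v, int& quot, int& rem) const {
        quot = v / d;
        rem  = v - quot * d;
    }
};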
@@ -298,18 +490,16 @@ ErrorCode ConvDepthWiseExecution::onExecute(const std::vector<Tensor *> &inputs,
    if (inputs.size() == 1) {
        // block_num = runtime->blocks_num(mTotalCount);
        // threads_num = runtime->threads_num();
        if(parameters.dilate[0] == 1 && parameters.dilate[1] == 1) {
            const int area = parameters.outputSize[0] * parameters.outputSize[1];
            DivModFast d_oc(parameters.channel * PACK_NUMBER);
            DivModFast d_ow(parameters.outputSize[0]);
            DivModFast d_oh(parameters.outputSize[1]);

        if(parameters.dilate[0] == 1 && parameters.dilate[1] == 1) {
            CONV_DW_OPT<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter,
                (const half*)mResource->mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr),
                d_oc, d_ow, d_oh);
            checkKernelErrors;
        } else {
            CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter,
                (const half*)mResource->mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr));
                (const half*)mResource->mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr),
                d_oc, d_ow, d_oh);
            checkKernelErrors;
        }
    }
    return NO_ERROR;
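Note that the launch shape for all of these branches comes from the device properties rather than the element count: one block per SM, with the grid-stride loops soaking up whatever remains. A host-side sketch of that heuristic (a hypothetical helper, not an MNN API):

#include <cuda_runtime.h>
#include <algorithm>

// Pick <<<blocks, threads>>> the way onExecute does: one block per SM and a
// thread count capped by both the device limit and a per-kernel cap.
void pickLaunchShape(int limitThreads, int& blocks, int& threads) {
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, 0);
    threads = std::min(prop.maxThreadsPerBlock / 2, limitThreads);
    blocks  = prop.multiProcessorCount;
}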
@@ -13,6 +13,9 @@
#include "MNNCUDADefine.hpp"
#include "MNNCUDAFunction.cuh"

#include "common/MemoryFormater.h"
#include "backend/cuda/core/CUDATools.hpp"

// 16 / sizeof(int4)
namespace MNN {
namespace CUDA {

@@ -100,6 +103,8 @@ ConvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
    auto offsetGpuStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(int) * maxOffsetNumber);
    auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second;

    NVTX_PUSH("cuda_conv_weight");
    // Reorder weight
    {
        auto tempCacheBuffer = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float));

@@ -121,9 +126,13 @@ ConvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
        }
        static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempCacheBuffer);
    }
    NVTX_POP();

    // Copy Bias
    int biasSize = conv->bias()->size();

    NVTX_PUSH("cuda_conv_bias");
    biasTensor.reset(Tensor::createDevice<float>({biasSize}));
    bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC);

@@ -159,6 +168,7 @@ ConvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
    } else {
        FuseRasterBlitCommon((uint8_t*)mBias, (uint8_t*)biasTemp, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime, 4);
    }
    NVTX_POP();
    static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(regionStorage);
    static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(offsetGpuStorage);
    static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempBiasStorage);

@@ -219,6 +229,7 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,
    mIm2ColParamter.srcYStep = input->width() * UNIT;
    mIm2ColParamter.packCUnit = UNIT;

    // hostToDevice
    runtime->memcpy((uint8_t*)mGpuIm2ColParam.first + mGpuIm2ColParam.second, &mIm2ColParamter, sizeof(ConvolutionCommon::Im2ColParameter), MNNMemcpyHostToDevice);

    //MNN_PRINT("conv size:%d-%d, %d-%d-%d, %d-%d-%d\n", mIm2ColParamter.kernelX, mIm2ColParamter.strideX, input->height(), input->width(), input->channel(), output->height(), output->width(), output->channel());

@@ -267,7 +278,7 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,
    auto buffer = pool->alloc((size_t)sizeof(__half) * (size_t)mMatMulParam.elhPack[0] * (size_t)mMatMulParam.elhPack[1] * (size_t)ePack * (size_t)hPack);
    mIm2ColBuffer = (__half*)((uint8_t*)buffer.first + buffer.second);
    pool->free(buffer);

    return NO_ERROR;
}

@@ -289,6 +300,9 @@ ErrorCode ConvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs
    auto gpuIm2Col = (const ConvolutionCommon::Im2ColParameter*)((uint8_t*)mGpuIm2ColParam.first + mGpuIm2ColParam.second);
    auto gpuMatMul = (const MatMulParam*)((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second);

    // MNN_PRINT("onExecute bytes is:%d mUseEPack:%d, mResource->mUseHPack:%d, useFp16:%d, mBlockNum:%d,3 dimPack:{ePack:%d, lPack:%d, hPack:%d}, mMatMulParam.elh:{%d,%d,%d}, elhMultiPack:{%d,%d,%d}\n",
    //     bytes, mUseEPack, mResource->mUseHPack, static_cast<CUDABackend*>(backend())->useFp16(), mBlockNum, ePack, MATMULPACK, hPack, mMatMulParam.elh[0], mMatMulParam.elh[1], mMatMulParam.elh[2], mMatMulParam.elhPack[0], mMatMulParam.elhPack[1], mMatMulParam.elhPack[2]);
    // Im2col in Block
    for(int block_idx = 0; block_idx < mBlockNum; block_idx++) {
        if(mUseEPack) {

@@ -307,9 +321,11 @@ ErrorCode ConvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs
    return NO_ERROR;
}

// #define USE_MNN_CONV

class CUDAConvolutionCreator : public CUDABackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        if (nullptr != op->main_as_Convolution2D()->quanParameter()) {
            auto quan = op->main_as_Convolution2D()->quanParameter();

@@ -321,19 +337,24 @@ public:
            }
        }

#ifdef USE_MNN_CONV

        std::shared_ptr<ConvSingleInputExecution::Resource> resource(new ConvSingleInputExecution::Resource(backend, op));
        return new ConvSingleInputExecution(backend, op, resource);

#else
        auto conv = op->main_as_Convolution2D()->common();
        if(ConvWinogradExecution::isValid(op->main_as_Convolution2D(), inputs[0])) {
        if(ConvWinogradExecution::isValid(op->main_as_Convolution2D())) { // inputs[0] is invalid now.
            //printf("%dx%ds%dd%d\n", conv->kernelX(), conv->kernelY(), conv->strideX(), conv->dilateX());

            std::shared_ptr<ConvWinogradExecution::Resource> resource(new ConvWinogradExecution::Resource(backend, op));
            return new ConvWinogradExecution(backend, op, resource);
        }

        // std::shared_ptr<ConvSingleInputExecution::Resource> resource(new ConvSingleInputExecution::Resource(backend, op));
        // return new ConvSingleInputExecution(backend, op, resource);

        std::shared_ptr<ConvCutlassExecution::Resource> resource(new ConvCutlassExecution::Resource(backend, op));
        return new ConvCutlassExecution(backend, op, resource);
#endif

    }
};
@@ -15,24 +15,26 @@ namespace CUDA {
#define UNIT 2
__global__ void WinoWeightReorder(const float* GgGt,
    half* GgGt_trans,
    const int outside,
    const int block,
    const int co_pack,
    const int ci_pack,
    const int unitCi,
    const int unitCo
    ) {
    const int maxCount = outside * unitCi * unitCo;
    const int maxCount = block * co_pack * ci_pack;
    for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += gridDim.x * blockDim.x) {
        size_t outside_idx = index / (unitCi*unitCo);
        size_t tmp = index % (unitCi*unitCo);
        size_t ci_idx = tmp / unitCo;
        size_t co_idx = tmp % unitCo;
        size_t tmp = index / ci_pack;
        size_t ci_idx = index % ci_pack;

        // [4x4, Cop, Cip, unitCi, unitCo] -->> [4x4, Cop, Cip, unitCo, unitCi]
        size_t dst_idx = outside_idx * (unitCi*unitCo) + co_idx * unitCi + ci_idx;
        *(GgGt_trans + dst_idx) = *(GgGt + index);
        size_t block_idx = tmp / co_pack;
        size_t co_idx = tmp % co_pack;
        // [4x4, Cop, Cip, unitCi, unitCo] -->> [4x4, Cop*unitCo, Cip*unitCi]
        size_t src_idx = block_idx * (co_pack*ci_pack) + (co_idx/unitCo) * (ci_pack*unitCo) + (ci_idx/unitCi) * (unitCi*unitCo) + (ci_idx%unitCi) * unitCo + (co_idx%unitCo);
        *(GgGt_trans + index) = *(GgGt + src_idx);
    }
}
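The new kernel gathers rather than scatters: each output slot computes the tiled source index it needs, turning the [block][coU][ciU][unitCi][unitCo] tiles into a plain [block][co_pack][ci_pack] matrix for the batched GEMM. A CPU reference for the same mapping (a sketch assuming unitCi == unitCo, as in the PACK_NUMBER call site above):

// Flatten tiled Winograd weights into [block][coPack][ciPack] row-major form.
void winoWeightReorderRef(const float* src, float* dst, int block,
                          int coPack, int ciPack, int unit) {
    for (int b = 0; b < block; ++b)
        for (int co = 0; co < coPack; ++co)
            for (int ci = 0; ci < ciPack; ++ci) {
                int srcIdx = b * coPack * ciPack
                           + (co / unit) * (ciPack * unit)   // coU tile
                           + (ci / unit) * (unit * unit)     // ciU tile
                           + (ci % unit) * unit + (co % unit);
                dst[(b * coPack + co) * ciPack + ci] = src[srcIdx];
            }
}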
bool ConvWinogradExecution::isValid(const Convolution2D* conv, const Tensor* input) {
bool ConvWinogradExecution::isValid(const Convolution2D* conv) {
    //return false;
    if(conv->common()->strideX() != 1 || conv->common()->strideY() != 1) {
        return false;

@@ -74,7 +76,7 @@ ConvWinogradExecution::Resource::Resource(Backend* backend, const MNN::Op* op) {
    std::shared_ptr<Tensor> srcWeight(Tensor::create<float>({mKernelInfo.kernelN, mKernelInfo.kernelC, mKernelInfo.kernelY, mKernelInfo.kernelX},
        (void *)filterDataPtr, Tensor::CAFFE));

    auto dstWeight = generator.allocTransformWeight(srcWeight.get(), MATMULPACK, MATMULPACK);
    auto dstWeight = generator.allocTransformWeight(srcWeight.get(), PACK_NUMBER, PACK_NUMBER);
    generator.transformWeight(dstWeight.get(), srcWeight.get());
    auto dstWeightSize = dstWeight->elementSize();

@@ -90,18 +92,18 @@ ConvWinogradExecution::Resource::Resource(Backend* backend, const MNN::Op* op) {
        int cores = prop.multiProcessorCount;
        int threadNumbers = prop.maxThreadsPerBlock;

        int coDiv = UP_DIV(mKernelInfo.kernelN, MATMULPACK);
        int ciDiv = UP_DIV(mKernelInfo.kernelC, MATMULPACK);
        int coPack = UP_DIV(mKernelInfo.kernelN, PACK_NUMBER) * PACK_NUMBER;
        int ciPack = UP_DIV(mKernelInfo.kernelC, PACK_NUMBER) * PACK_NUMBER;

        WinoWeightReorder<<<cores, threadNumbers>>>((float*)cacheWeight, (half*)mFilter,
            (UNIT+kernel-1) * (UNIT+kernel-1) * coDiv * ciDiv, MATMULPACK, MATMULPACK);
            (UNIT+kernel-1) * (UNIT+kernel-1), coPack, ciPack, PACK_NUMBER, PACK_NUMBER);

        static_cast<CUDABackend*>(backend)->getStaticBufferPool()->free(tempCacheBuffer);
    }

    // Copy Bias
    int biasSize = conv->bias()->size();
    int alignSize = UP_DIV(biasSize, MATMULPACK) * MATMULPACK;
    int alignSize = UP_DIV(biasSize, PACK_NUMBER) * PACK_NUMBER;
    biasTensor.reset(Tensor::createDevice<uint32_t>({alignSize}));
    backend->onAcquireBuffer(biasTensor.get(), Backend::STATIC);

@@ -148,23 +150,23 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector<Tensor*> &inputs, c
    mPadX = std::get<0>(pads);
    mPadY = std::get<1>(pads);
    int ic = input->channel();
    int icDiv = UP_DIV(ic, MATMULPACK);
    int icDiv = UP_DIV(ic, PACK_NUMBER);
    auto bytes = static_cast<CUDABackend*>(backend())->getBytes(input);

    auto wUnit = UP_DIV(output->width(), UNIT);
    auto hUnit = UP_DIV(output->height(), UNIT);

    int e = wUnit * hUnit * output->batch();
    int l = icDiv * MATMULPACK;
    int l = ic;
    int h = output->channel();
    mMatMulParam.elh[0] = e;
    mMatMulParam.elh[1] = l;
    mMatMulParam.elh[2] = h;

    int ePack = MATMULPACK;
    int hPack = MATMULPACK;
    int ePack = PACK_NUMBER;
    int hPack = PACK_NUMBER;
    mMatMulParam.elhPack[0] = UP_DIV(e, ePack);
    mMatMulParam.elhPack[1] = UP_DIV(l, MATMULPACK);
    mMatMulParam.elhPack[1] = UP_DIV(l, PACK_NUMBER);
    mMatMulParam.elhPack[2] = UP_DIV(h, hPack);
    // mMatMulParam.cStride[0] = mIm2ColParamter.ow * mIm2ColParamter.oh * h;
    // mMatMulParam.cStride[1] = 1;

@@ -185,14 +187,87 @@ ErrorCode ConvWinogradExecution::onResize(const std::vector<Tensor*> &inputs, c
    int block = UNIT + convCommon->kernelY() - 1;
    mBlock2 = block * block;
    auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
    auto buffer = pool->alloc((size_t)sizeof(__half) * mBlock2 * mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * (size_t)ePack * (size_t)MATMULPACK);
    mBtdB_Buffer = (__half*)((uint8_t*)buffer.first + buffer.second);
    auto bufferData = pool->alloc((size_t)sizeof(__half) * mBlock2 * mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * (size_t)ePack * (size_t)PACK_NUMBER);
    mBtdB_Buffer = (__half*)((uint8_t*)bufferData.first + bufferData.second);

    auto buffer2 = pool->alloc(bytes * mBlock2 * mMatMulParam.elh[0] * mMatMulParam.elhPack[2] * (size_t)hPack);
    mMatmul_Buffer = (void*)((uint8_t*)buffer2.first + buffer2.second);
    auto bufferMatmul = pool->alloc(bytes * mBlock2 * mMatMulParam.elh[0] * mMatMulParam.elhPack[2] * (size_t)hPack);
    mMatmul_Buffer = (void*)((uint8_t*)bufferMatmul.first + bufferMatmul.second);

    pool->free(buffer);
    pool->free(buffer2);
    pool->free(bufferData);
    pool->free(bufferMatmul);

    mGemmInfo.elh[0] = e;
    mGemmInfo.elh[1] = l;
    mGemmInfo.elh[2] = h;
    mGemmInfo.elhPad[0] = UP_DIV(e, 8) * 8;
    mGemmInfo.elhPad[1] = UP_DIV(l, 8) * 8;
    mGemmInfo.elhPad[2] = UP_DIV(h, 8) * 8;

    ElementComputeEpilogue alpha = ElementComputeEpilogue(1);
    ElementComputeEpilogue beta = ElementComputeEpilogue(0);

    // Split K dimension into 1 partition
    cutlass::gemm::GemmCoord problem_size(mGemmInfo.elh[0], mGemmInfo.elhPad[2], mGemmInfo.elhPad[1]);// m n k

    //MNN_PRINT("Winograd BatchGemm batch:%d, MNK:%d-%d-%d\n", mBlock2, mGemmInfo.elh[0], mGemmInfo.elhPad[2], mGemmInfo.elhPad[1]);
    if(bytes == 2) {
        typename GemmBatched_F16_Linear_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
            {(ElementInputA *)mBtdB_Buffer, mGemmInfo.elhPad[1]}, // Ptr + ldm
            (int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]), // batch_stride_A
            {(ElementInputB *)mResource->mFilter, mGemmInfo.elhPad[1]}, // Ptr + ldm
            (int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elhPad[2]), // batch_stride_B
            {(ElementOutput_F16 *)mResource->mBias, 0}, // Ptr + ldm if ldm = 0, vector,
            (int64_t)(0), // batch_stride_bias
            {(ElementOutput_F16 *)mMatmul_Buffer, mGemmInfo.elhPad[2]}, // Ptr + ldm
            (int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[2]), // batch_stride_C
            {alpha, beta}, // <- tuple of alpha and beta
            mBlock2}; // batch_count

        size_t workspace_size = GemmBatched_F16_Linear_Sm75::get_workspace_size(arguments);

        auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
        mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
        runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
        pool->free(bufferWs);

        // Check the problem size is supported or not
        cutlass::Status status = mGemmBatchedF16LnSm75.can_implement(arguments);
        cutlass_check(status);

        // Initialize CUTLASS kernel with arguments and workspace pointer
        status = mGemmBatchedF16LnSm75.initialize(arguments, (uint8_t *)mWorkspace);
        cutlass_check(status);
    } else {

        typename GemmBatched_F32_Linear_Sm75::Arguments arguments{problem_size, // <- problem size of matrix multiplication
            {(ElementInputA *)mBtdB_Buffer, mGemmInfo.elhPad[1]}, // Ptr + ldm
            (int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[1]), // batch_stride_A
            {(ElementInputB *)mResource->mFilter, mGemmInfo.elhPad[1]}, // Ptr + ldm
            (int64_t)(mGemmInfo.elhPad[1] * mGemmInfo.elhPad[2]), // batch_stride_B
            {(ElementOutput_F32 *)mResource->mBias, 0}, // Ptr + ldm if ldm = 0, vector,
            (int64_t)(0), // batch_stride_bias
            {(ElementOutput_F32 *)mMatmul_Buffer, mGemmInfo.elhPad[2]}, // Ptr + ldm
            (int64_t)(mGemmInfo.elh[0] * mGemmInfo.elhPad[2]), // batch_stride_C
            {alpha, beta}, // <- tuple of alpha and beta
            mBlock2}; // batch_count

        size_t workspace_size = GemmBatched_F32_Linear_Sm75::get_workspace_size(arguments);

        auto bufferWs = pool->alloc(workspace_size * sizeof(uint8_t));
        mWorkspace = (uint8_t*)bufferWs.first + bufferWs.second;
        runtime->memset(mWorkspace, 0, workspace_size * sizeof(uint8_t));
        pool->free(bufferWs);

        // Check the problem size is supported or not
        cutlass::Status status = mGemmBatchedF32LnSm75.can_implement(arguments);
        cutlass_check(status);

        // Initialize CUTLASS kernel with arguments and workspace pointer
        status = mGemmBatchedF32LnSm75.initialize(arguments, (uint8_t *)mWorkspace);
        cutlass_check(status);
    }

    return NO_ERROR;
}
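Both branches repeat the same four-step CUTLASS device-GEMM protocol: size the workspace, check the problem is supported, bind arguments plus workspace, then launch later in onExecute. A generic sketch of that sequence (workspace allocation elided; Gemm stands for either batched specialization):

#include "cutlass/cutlass.h"

// Wrapper mirroring the sequence above; a sketch, not an MNN utility.
template <typename Gemm>
cutlass::Status runGemm(Gemm& gemm, typename Gemm::Arguments const& args,
                        uint8_t* workspace) {
    cutlass::Status status = Gemm::can_implement(args);
    if (status != cutlass::Status::kSuccess) {
        return status; // problem shape unsupported by this tile configuration
    }
    status = gemm.initialize(args, workspace);
    if (status != cutlass::Status::kSuccess) {
        return status;
    }
    return gemm(); // enqueues the batched kernel on the current stream
}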
@@ -205,8 +280,8 @@ ErrorCode ConvWinogradExecution::onExecute(const std::vector<Tensor*> &inputs, c
    int threadNumbers = prop.maxThreadsPerBlock / 2;
    auto gpuMatMul = (const MatMulParam*)((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second);

    int coDiv = UP_DIV(mResource->mKernelInfo.kernelN, MATMULPACK);
    int ciDiv = UP_DIV(mResource->mKernelInfo.kernelC, MATMULPACK);
    int co_pack = UP_DIV(mResource->mKernelInfo.kernelN, PACK_NUMBER) * PACK_NUMBER;
    int ci_pack = UP_DIV(mResource->mKernelInfo.kernelC, PACK_NUMBER) * PACK_NUMBER;

    auto bytes = static_cast<CUDABackend*>(backend())->getBytes(input);
    const void *input_addr = (const void*)input->deviceId();

@@ -215,16 +290,23 @@ ErrorCode ConvWinogradExecution::onExecute(const std::vector<Tensor*> &inputs, c
    void *output_addr = (void*)output->deviceId();

    const int kernel = 3;
    const int wUnit = UP_DIV(input->width(), UNIT);
    const int hUnit = UP_DIV(input->height(), UNIT);
    DivModFast lD(ci_pack);
    DivModFast hD(co_pack);
    DivModFast whD(wUnit * hUnit);
    DivModFast wD(wUnit);

    if(bytes == 4) {
        WinoInputTrans<<<cores, threadNumbers>>>((const float*)input_addr, (half*)mBtdB_Buffer, UNIT,
            (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ciDiv, output->batch(), UP_DIV(input->width(), UNIT),
            UP_DIV(input->height(), UNIT), MATMULPACK, MATMULPACK,
            (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack,
            mMatMulParam.elh[0] * ci_pack, lD, whD, wD,
            mPadX, mPadY, input->width(), input->height());
        checkKernelErrors;
    } else {
        WinoInputTrans<<<cores, threadNumbers>>>((const half*)input_addr, (half*)mBtdB_Buffer, UNIT,
            (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ciDiv, output->batch(), UP_DIV(input->width(), UNIT),
            UP_DIV(input->height(), UNIT), MATMULPACK, MATMULPACK,
            (UNIT+kernel-1)*(UNIT+kernel-1), input->channel(), ci_pack,
            mMatMulParam.elh[0] * ci_pack, lD, whD, wD,
            mPadX, mPadY, input->width(), input->height());
        checkKernelErrors;
    }

@@ -235,41 +317,27 @@ ErrorCode ConvWinogradExecution::onExecute(const std::vector<Tensor*> &inputs, c

    int iBlock = 0;
    if (4 == bytes) {
        cudaFuncSetAttribute(GemmPackedMulti<float>, cudaFuncAttributeMaxDynamicSharedMemorySize, prop.sharedMemPerMultiprocessor);
        GemmPackedMulti<<<cores, threads_num, basicMemory>>>(gpuMatMul, iBlock, mBlock2, (float*)mMatmul_Buffer, mBtdB_Buffer, (const __half*)mGgGt_Buffer);
        checkKernelErrors;
        cutlass::Status status = mGemmBatchedF32LnSm75();
        cutlass_check(status);
    } else {
        //MNN_PRINT("%d - %d, %d- %d\n", cpuParam->elhPack[0], cpuParam->elhPack[2], cpuParam->elh[0], cpuParam->elh[2]);
        cudaFuncSetAttribute(GemmPackedMulti<half>, cudaFuncAttributeMaxDynamicSharedMemorySize, prop.sharedMemPerMultiprocessor);
        GemmPackedMulti<<<cores, threads_num, basicMemory>>>(gpuMatMul, iBlock, mBlock2, (half*)mMatmul_Buffer, mBtdB_Buffer, (const __half*)mGgGt_Buffer);
        checkKernelErrors;
        cutlass::Status status = mGemmBatchedF16LnSm75();
        cutlass_check(status);
    }

    if (4 == bytes) {
        WinoTrans2Output<<<cores, threadNumbers>>>((const float*)mMatmul_Buffer, (const float*)bias_addr, (float*)output_addr,
            gpuMatMul, UNIT,
            mBlock2, output->channel(), ciDiv, output->batch(), UP_DIV(output->width(), UNIT),
            UP_DIV(output->height(), UNIT), MATMULPACK, MATMULPACK,
            gpuMatMul, UNIT, mBlock2, output->channel(), co_pack,
            mMatMulParam.elh[0] * co_pack, hD, whD, wD,
            output->width(), output->height());
        checkKernelErrors;
    } else {
        WinoTrans2Output<<<cores, threadNumbers>>>((const half*)mMatmul_Buffer, (const float*)bias_addr, (half*)output_addr,
            gpuMatMul, UNIT,
            mBlock2, output->channel(), ciDiv, output->batch(), UP_DIV(output->width(), UNIT),
            UP_DIV(output->height(), UNIT), MATMULPACK, MATMULPACK,
            gpuMatMul, UNIT, mBlock2, output->channel(), co_pack,
            mMatMulParam.elh[0] * co_pack, hD, whD, wD,
            output->width(), output->height());
        checkKernelErrors;
    }

    // if(output->width() == 56 && output->channel() == 64 && input->channel() == 64) {
    //     cudaDeviceSynchronize();
    //     float bias_[mMatMulParam.elhPack[2] * 16];
    //     runtime->memcpy((void*)bias_, bias_addr, mMatMulParam.elhPack[2] * 16*sizeof(float), MNNMemcpyDeviceToHost);
    //     for(int i=0; i<mMatMulParam.elhPack[2] * 16; i++) {
    //         printf("%d-%f\n", i, bias_[i]);
    //     }
    // }
    return NO_ERROR;
}
@@ -11,6 +11,9 @@

#include "ConvSingleInputExecution.hpp"
#include "TensorCoreGemmPacked.cuh"
#include "CutlassGemmParam.hpp"
#include "MNNCUDADefine.hpp"
#include "MNNCUDAFunction.cuh"

namespace MNN {
namespace CUDA {

@@ -18,7 +21,7 @@ namespace CUDA {
class ConvWinogradExecution : public Execution {
public:
    struct Resource;
    static bool isValid(const Convolution2D* conv, const Tensor* input);
    static bool isValid(const Convolution2D* conv);
    ConvWinogradExecution(Backend* backend, const MNN::Op* op, std::shared_ptr<Resource> res);
    virtual ~ConvWinogradExecution();

@@ -45,6 +48,13 @@ private:
    void* mMatmul_Buffer;
    MatMulParam mMatMulParam;
    std::pair<void*, int> mGpuMatMulParam;
    GemmBatched_F16_Linear_Sm75 mGemmBatchedF16LnSm75;
    GemmBatched_F32_Linear_Sm75 mGemmBatchedF32LnSm75;

    std::shared_ptr<Tensor> workspaceTensor;
    uint8_t* mWorkspace;

    CutlassGemmInfo mGemmInfo;

    int mPadX;
    int mPadY;
@@ -11,10 +11,17 @@
#include "cutlass/epilogue/thread/linear_combination_relu.h"
#include "cutlass/epilogue/thread/linear_combination_relu6.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/gemm/device/gemm_array.h"
#include "cutlass/gemm/device/gemm_batched.h"

namespace MNN {
namespace CUDA {

struct CutlassGemmInfo{
    int elh[3];
    int elhPad[3];
};

using ElementAccumulator = float;                  // <- data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations
using ElementInputA = cutlass::half_t;             // <- data type of elements in input matrix A

@@ -37,9 +44,9 @@ using SmArch75 = cutlass::arch::Sm75;

// This code section describes the tile size a thread block will compute
using ShapeMMAThreadBlock =
    cutlass::gemm::GemmShape<64, 64, 32>;  // <- threadblock tile M = 64, N = 64, K = 32
    cutlass::gemm::GemmShape<64, 64, 64>;  // <- threadblock tile M = 64, N = 64, K = 64
// This code section describes tile size a warp will compute
using ShapeMMAWarp = cutlass::gemm::GemmShape<32, 32, 32>;  // <- warp tile M = 32, N = 32, K = 32
using ShapeMMAWarp = cutlass::gemm::GemmShape<32, 32, 64>;  // <- warp tile M = 32, N = 32, K = 64
// This code section describes the size of MMA op
using ShapeMMAOp1688 = cutlass::gemm::GemmShape<16, 8, 8>;  // <- MMA Op tile M = 16, N = 8, K = 8
using ShapeMMAOp884 = cutlass::gemm::GemmShape<8, 8, 4>;    // <- MMA Op tile M = 8, N = 8, K = 4

@@ -300,6 +307,45 @@ using Gemm_F32_Relu6_Sm75 = cutlass::gemm::device::Gemm<ElementInputA,
    SwizzleThreadBlock,
    NumStages>;

// This code section describes how threadblocks are scheduled on GPU
using BatchedSwizzleThreadBlock = cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle;

using ShapeBatchMMAThreadBlock =
    cutlass::gemm::GemmShape<64, 64, 64>;  // <- threadblock tile M = 64, N = 64, K = 64
// This code section describes tile size a warp will compute
using ShapeBatchMMAWarp = cutlass::gemm::GemmShape<16, 64, 64>;  // <- warp tile M = 16, N = 64, K = 64

using GemmBatched_F16_Linear_Sm75 = cutlass::gemm::device::GemmBatched<ElementInputA,
    LayoutInputA,
    ElementInputB,
    LayoutInputB,
    ElementOutput_F16,
    LayoutOutput,
    ElementAccumulator,
    MMAOp,
    SmArch75,
    ShapeBatchMMAThreadBlock,
    ShapeBatchMMAWarp,
    ShapeMMAOp1688,
    EpilogueOp_F16_Linear,
    BatchedSwizzleThreadBlock,
    NumStages>;

using GemmBatched_F32_Linear_Sm75 = cutlass::gemm::device::GemmBatched<ElementInputA,
    LayoutInputA,
    ElementInputB,
    LayoutInputB,
    ElementOutput_F32,
    LayoutOutput,
    ElementAccumulator,
    MMAOp,
    SmArch75,
    ShapeBatchMMAThreadBlock,
    ShapeBatchMMAWarp,
    ShapeMMAOp1688,
    EpilogueOp_F32_Linear,
    BatchedSwizzleThreadBlock,
    NumStages>;

} // namespace CUDA
} // namespace MNN
@@ -536,7 +536,7 @@ __global__ void Binary##Name(\
    int sizeZ, int sizeY, int sizeX,\
    int strideZ, int strideY, int strideX,\
    int strideZ1, int strideY1, int strideX1,\
    int dstStrideZ, int dstStrideY, int dstStrideX\
    int dstStrideZ, int dstStrideY, int dstStrideX, int activationType\
    ) { \
    int count = sizeZ * sizeY * sizeX;\
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\

@@ -550,7 +550,11 @@ __global__ void Binary##Name(\
        int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
        TIn x = input0[srcOffset];\
        TIn y = input1[srcOffset1];\
        output[dstOffset] = (TOut)Func;\
        TOut val = (TOut)(Func);\
        if(activationType == 1) {\
            val = (val < (TOut)0 ? (TOut)0 : val);\
        }\
        output[dstOffset] = val;\
    }\
}\
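Every binary kernel variant in this file now takes a runtime activationType and folds the activation into the same pass instead of launching a second kernel over the output. Stripped of the macro machinery, the per-element epilogue reduces to the following (a sketch: addition stands in for the macro's Func parameter, and activationType == 1 meaning ReLU is read off the code above):

__device__ float binaryWithActivation(float x, float y, int activationType) {
    float val = x + y;                  // stand-in for the expanded Func expression
    if (activationType == 1) {
        val = val < 0.0f ? 0.0f : val;  // fused ReLU, saving a second memory pass
    }
    return val;
}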
@@ -561,7 +565,7 @@ __global__ void BinaryMid##Name(\
    int sizeZ, int sizeY, int sizeX,\
    int strideZ, int strideY, int strideX,\
    int strideZ1, int strideY1, int strideX1,\
    int dstStrideZ, int dstStrideY, int dstStrideX\
    int dstStrideZ, int dstStrideY, int dstStrideX, int activationType\
    ) { \
    int count = sizeZ * sizeY * sizeX;\
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\

@@ -575,7 +579,11 @@ __global__ void BinaryMid##Name(\
        int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
        float x = input0[srcOffset];\
        float y = input1[srcOffset1];\
        output[dstOffset] = (TOut)(Func);\
        TOut val = (TOut)(Func);\
        if(activationType == 1) {\
            val = (val < (TOut)0 ? (TOut)0 : val);\
        }\
        output[dstOffset] = val;\
    }\
}\
template<typename TIn, typename TOut>\

@@ -584,7 +592,8 @@ __global__ void BinaryMidLinear##Name(\
    int sizeZ,\
    int strideZ,\
    int strideZ1,\
    int dstStrideZ\
    int dstStrideZ,\
    int activationType\
    ) { \
    int count = sizeZ;\
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\

@@ -594,7 +603,11 @@ __global__ void BinaryMidLinear##Name(\
        int dstOffset = iz * dstStrideZ;\
        float x = input0[srcOffset];\
        float y = input1[srcOffset1];\
        output[dstOffset] = (TOut)(Func);\
        TOut val = (TOut)(Func);\
        if(activationType == 1) {\
            val = (val < (TOut)0 ? (TOut)0 : val);\
        }\
        output[dstOffset] = val;\
    }\
}\

@@ -602,7 +615,7 @@ __global__ void BinaryMidLinear##Name(\
template<typename TIn, typename TOut>\
__global__ void BinaryMidLinear4_##Name(\
    const TIn *input0, const TIn* input1, TOut *output,\
    int count_4\
    int count_4, int activationType\
    ) { \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count_4); i += blockDim.x * gridDim.x) {\
        int iz = i;\

@@ -613,22 +626,38 @@ __global__ void BinaryMidLinear4_##Name(\
        float4 yy = ((float4 *)(input1+srcOffset1))[0];\
        float x = xx.x;\
        float y = yy.x;\
        output[dstOffset] = (TOut)(Func);\
        TOut val = (TOut)(Func);\
        if(activationType == 1) {\
            val = (val < (TOut)0 ? (TOut)0 : val);\
        }\
        output[dstOffset] = val;\
        x = xx.y;\
        y = yy.y;\
        output[dstOffset+1] = (TOut)(Func);\
        val = (TOut)(Func);\
        if(activationType == 1) {\
            val = (val < (TOut)0 ? (TOut)0 : val);\
        }\
        output[dstOffset+1] = val;\
        x = xx.z;\
        y = yy.z;\
        output[dstOffset+2] = (TOut)(Func);\
        val = (TOut)(Func);\
        if(activationType == 1) {\
            val = (val < (TOut)0 ? (TOut)0 : val);\
        }\
        output[dstOffset+2] = val;\
        x = xx.w;\
        y = yy.w;\
        output[dstOffset+3] = (TOut)(Func);\
        val = (TOut)(Func);\
        if(activationType == 1) {\
            val = (val < (TOut)0 ? (TOut)0 : val);\
        }\
        output[dstOffset+3] = val;\
    }\
}\
template<typename TIn, typename TOut>\
__global__ void BinaryMidLinearHalf4_##Name(\
    const TIn *input0, const TIn* input1, TOut *output,\
    int count_4\
    int count_4, int activationType\
    ) { \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count_4); i += blockDim.x * gridDim.x) {\
        int iz = i;\

@@ -639,18 +668,34 @@ __global__ void BinaryMidLinearHalf4_##Name(\
        half2 yy = ((half2 *)(input1+srcOffset1))[0];\
        float x = (float)xx.x;\
        float y = (float)yy.x;\
        output[dstOffset] = (TOut)(Func);\
        TOut val = (TOut)(Func);\
        if(activationType == 1) {\
            val = (val < (TOut)0 ? (TOut)0 : val);\
        }\
        output[dstOffset] = val;\
        x = (float)xx.y;\
        y = (float)yy.y;\
        output[dstOffset+1] = (TOut)(Func);\
        val = (TOut)(Func);\
        if(activationType == 1) {\
            val = (val < (TOut)0 ? (TOut)0 : val);\
        }\
        output[dstOffset+1] = val;\
        xx = ((half2 *)(input0+srcOffset))[1];\
        yy = ((half2 *)(input1+srcOffset1))[1];\
        x = (float)xx.x;\
        y = (float)yy.x;\
        output[dstOffset+2] = (TOut)(Func);\
        val = (TOut)(Func);\
        if(activationType == 1) {\
            val = (val < (TOut)0 ? (TOut)0 : val);\
        }\
        output[dstOffset+2] = val;\
        x = (float)xx.y;\
        y = (float)yy.y;\
        output[dstOffset+3] = (TOut)(Func);\
        val = (TOut)(Func);\
        if(activationType == 1) {\
            val = (val < (TOut)0 ? (TOut)0 : val);\
        }\
        output[dstOffset+3] = val;\
    }\
}\

@@ -720,7 +765,7 @@ BINARY_FUNC_FLOATMID4(MOD, fmod(x, y));
BINARY_FUNC_FLOATMID4(LOGICALOR, (x || y) ? 1 : 0);

template<typename T>
void BinaryBlitTemplateFloat(T* output, const T* input, const T* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
void BinaryBlitTemplateFloat(T* output, const T* input, const T* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType, int activationType) {
    int count = size[0] * size[1] * size[2];
    int block_num = runtime->blocks_num(count);
    int threads_num = runtime->threads_num();

@@ -732,24 +777,25 @@ void BinaryBlitTemplateFloat(T* output, const T* input, const T* input1, const i
        threads_num = runtime->threads_num();\
        if(bytes == 4) {\
            BinaryMidLinear4_##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
                count/4);\
                count/4, activationType);\
        } else {\
            BinaryMidLinearHalf4_##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
                count/4);\
                count/4, activationType);\
        }\
    } else {\
        BinaryMidLinear##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
            size[2],\
            srcStride[2],\
            srcStride1[2],\
            dstStride[2]);\
            dstStride[2],\
            activationType);\
    }\
} else {\
    BinaryMid##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
        size[0], size[1], size[2],\
        srcStride[0], srcStride[1], srcStride[2],\
        srcStride1[0], srcStride1[1], srcStride1[2],\
        dstStride[0], dstStride[1], dstStride[2]);\
        dstStride[0], dstStride[1], dstStride[2], activationType);\
    }\
    return;\
}\

@@ -777,7 +823,7 @@ void BinaryBlitTemplateFloat(T* output, const T* input, const T* input1, const i
    #undef COMPUTE_FLOAT
}

void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType, int activationType) {
    int count = size[0] * size[1] * size[2];
    int block_num = runtime->blocks_num(count);
    int threads_num = runtime->threads_num();

@@ -787,7 +833,7 @@ void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_
        size[0], size[1], size[2],\
        srcStride[0], srcStride[1], srcStride[2],\
        srcStride1[0], srcStride1[1], srcStride1[2],\
        dstStride[0], dstStride[1], dstStride[2]);\
        dstStride[0], dstStride[1], dstStride[2], activationType);\
    return;\
}\

@@ -809,15 +855,15 @@ void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_
}


void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType) {
void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType, int activationType) {
    if (type.code == halide_type_float) {
        if (type.bits == 32) {
            BinaryBlitTemplateFloat((float*)output, (float*)input, (float*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
            BinaryBlitTemplateFloat((float*)output, (float*)input, (float*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType, activationType);
        } else if (type.bits == 16) {
            BinaryBlitTemplateFloat((half*)output, (half*)input, (half*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
            BinaryBlitTemplateFloat((half*)output, (half*)input, (half*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType, activationType);
        }
    } else if (type.code == halide_type_int) {
        BinaryBlitTemplateInt32(output, input, input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
        BinaryBlitTemplateInt32(output, input, input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType, activationType);
    }
}
@@ -8,7 +8,7 @@ namespace CUDA {
void FuseRasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int fuseNum, void* sliceOffset, int bytes, CUDARuntime* runtime, int offsetunit);
void BlitWithIndice(uint8_t* dest, const uint8_t* src, const int32_t* dstIndices, const int32_t* srcIndices, int dstUseIndice, int srcUseIndice, int loopCount, int dstStep, int srcStep, int srcLimit, const Tensor::InsideDescribe::Region& reg, int bytes, CUDARuntime* runtime);
void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType);
void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType);
void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType, int activationType = 0);

// Offset: 8 * fuseNum, first 4 for src: limitX, limitY, limitZ, offset, second 4 for dst
struct FuseRegion {

@@ -145,6 +145,59 @@ static int _singleConvert(const Tensor::InsideDescribe::Region& region, const Te
    return 1;
}

static bool _equalSizeStride(const Tensor::InsideDescribe::Region& slice0, const Tensor::InsideDescribe::Region& slice1) {
    if (slice0.src.stride[0] != slice1.src.stride[0] || slice0.dst.stride[0] != slice1.dst.stride[0]) {
        //MNN_PRINT("Raster total:%d, index:%d, src stride0:%d-%d, , dst stride0:%d-%d\n", mTempInputCopy.size(), i, slice.src.stride[0], slice0.src.stride[0], slice.dst.stride[0], slice0.dst.stride[0]);
        return false;
    }
    if (slice0.src.stride[1] != slice1.src.stride[1] || slice0.dst.stride[1] != slice1.dst.stride[1]) {
        //MNN_PRINT("Raster total:%d, index:%d, src stride1:%d-%d, , dst stride1:%d-%d\n", mTempInputCopy.size(), i, slice.src.stride[1], slice0.src.stride[1], slice.dst.stride[1], slice0.dst.stride[1]);
        return false;
    }
    if (slice0.src.stride[2] != slice1.src.stride[2] || slice0.dst.stride[2] != slice1.dst.stride[2]) {
        //MNN_PRINT("Raster total:%d, index:%d, src stride2:%d-%d, , dst stride2:%d-%d\n", mTempInputCopy.size(), i, slice.src.stride[2], slice0.src.stride[2], slice.dst.stride[2], slice0.dst.stride[2]);
        return false;
    }
    if (slice0.size[0] != slice1.size[0] || slice0.size[1] != slice1.size[1] || slice0.size[2] != slice1.size[2]) {
        //MNN_PRINT("Raster total:%d, index:%d, copy size:%d-%d-%d, %d-%d-%d\n", mTempInputCopy.size(), i, slice.size[0], slice.size[1], slice.size[2], slice0.size[0], slice0.size[1], slice0.size[2]);
        return false;
    }
    return true;
}

static bool _directBlitC4(const Tensor::InsideDescribe::Region& slice0, const Tensor::InsideDescribe::Region& slice1) {
    if(slice0.size[1] % PACK_NUMBER != 0 || slice0.size[0] != 1) {
        return false;
    }
    if(slice1.size[1] % PACK_NUMBER != 0 || slice1.size[0] != 1) {
        return false;
    }
    if(slice0.dst.offset % (slice0.size[1] * slice0.size[0]) != 0) {
        return false;
    }
    if(slice1.dst.offset % (slice1.size[1] * slice1.size[0]) != 0) {
        return false;
    }
    return _equalSizeStride(slice0, slice1);
}
static void _turnToNewRegion(const Tensor::InsideDescribe::Region& region, Tensor::InsideDescribe::Region& newRegion, int multiStride) {
    newRegion.size[0] = region.size[0];
    newRegion.size[1] = region.size[2];
    newRegion.size[2] = region.size[1];

    newRegion.src.stride[0] = region.src.stride[0];
    newRegion.src.stride[1] = region.src.stride[2] * region.size[1];
    newRegion.src.stride[2] = region.src.stride[1] / region.size[2];

    newRegion.dst.stride[0] = region.dst.stride[0] * multiStride;
    newRegion.dst.stride[1] = region.dst.stride[2] * region.size[1] * multiStride;
    newRegion.dst.stride[2] = region.dst.stride[1] / region.size[2];

    newRegion.src.offset = region.src.offset / region.size[2];
    newRegion.dst.offset = region.dst.offset / region.size[2];
}
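Worked numbers make the rewrite easier to follow: a [1, c_p, area] copy is re-expressed as [1, area, c_p], so the packed channels become the contiguous inner axis. The standalone program below reproduces the arithmetic with simplified stand-in types and an assumed channel-major stride pattern (not MNN's actual Region layout):

#include <cstdio>

struct ViewRef { int offset; int stride[3]; };
struct RegionRef { ViewRef src; ViewRef dst; int size[3]; };

int main() {
    // A [1, c_p = 16, area = 8] copy with strides {0, area, 1} on both sides.
    RegionRef r{{0, {0, 8, 1}}, {0, {0, 8, 1}}, {1, 16, 8}};
    RegionRef n{};
    int multiStride = 1;
    n.size[0] = r.size[0]; n.size[1] = r.size[2]; n.size[2] = r.size[1];
    n.src.stride[0] = r.src.stride[0];
    n.src.stride[1] = r.src.stride[2] * r.size[1];
    n.src.stride[2] = r.src.stride[1] / r.size[2];
    n.dst.stride[0] = r.dst.stride[0] * multiStride;
    n.dst.stride[1] = r.dst.stride[2] * r.size[1] * multiStride;
    n.dst.stride[2] = r.dst.stride[1] / r.size[2];
    // Prints size 1x8x16 with src strides {0, 16, 1}: area is the outer loop,
    // packed channels are contiguous on the inner axis.
    printf("size %dx%dx%d, src strides {%d, %d, %d}\n",
           n.size[0], n.size[1], n.size[2],
           n.src.stride[0], n.src.stride[1], n.src.stride[2]);
    return 0;
}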
ErrorCode RasterExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    MNN_ASSERT(inputs.size() == 1);
    MNN_ASSERT(outputs.size() == 1);

@@ -157,7 +210,52 @@ ErrorCode RasterExecution::onResize(const std::vector<Tensor *> &inputs, const s
    mTempInput.clear();

    mTempOutput = nullptr;
    mOutputPtr = output;
    mOutputPtr = output;

    mFast = false;
    int pack = PACK_NUMBER;
    // all_srcFormat == dstFormat == NC4HW4 : Fast Exe
    if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
        mFast = true;
        auto& slice0 = des->regions[0];
        for (int i=0; i< des->regions.size(); ++i) {
            auto& slice = des->regions[i];
            //MNN_PRINT("%d-%d-%d, %d-%d-%d-%d\n", slice.size[0], slice.size[1], slice.size[2], slice.src.stride[1], slice.src.stride[2], slice.dst.stride[1], slice.dst.stride[2]);
            if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
                mFast = false;
                break;
            }
            if(!_directBlitC4(slice0, slice)) {
                mFast = false;
                break;
            }
            if (!OpCommonUtils::canBlitFast(slice, output, pack, false, true)) {
                mFast = false;
                break;
            }
        }
        //MNN_PRINT("raster fast:%d\n", mFast);
        if (mFast) {
            int multiStride = 1;
            for (int i=0; i< des->regions.size(); ++i) {
                auto& slice = des->regions[i];
                if(slice.dst.offset / (slice.size[0] * slice.size[1]) >= 1) {
                    int batchChannel = slice.dst.offset / (slice.size[1] * slice.size[2]) + 1;
                    multiStride = multiStride > batchChannel ? multiStride : batchChannel;
                }
            }
            for (int i=0; i< des->regions.size(); ++i) {
                auto& slice = des->regions[i];
                if (slice.origin == nullptr) {
                    continue;
                }
                Tensor::InsideDescribe::Region newRegion;
                _turnToNewRegion(slice, newRegion, multiStride);
                mFastBlit.emplace_back(std::make_pair(slice.origin, std::move(newRegion)));
            }
            return NO_ERROR;
        }
    }

    mSingleConvert = 0;
    // srcNum == 1 && srcFormat != dstFormat : Single Convert

@@ -228,25 +326,8 @@ ErrorCode RasterExecution::onResize(const std::vector<Tensor *> &inputs, const s
            //MNN_PRINT("Raster total:%d, index:%d, origin:%p-%p\n", mTempInputCopy.size(), i, mTempInputCopy[i].first, mTempInputCopy[0].first);
            break;
        }
        if (slice0.src.stride[0] != slice.src.stride[0] || slice0.dst.stride[0] != slice.dst.stride[0]) {
            //MNN_PRINT("Raster total:%d, index:%d, src stride0:%d-%d, , dst stride0:%d-%d\n", mTempInputCopy.size(), i, slice.src.stride[0], slice0.src.stride[0], slice.dst.stride[0], slice0.dst.stride[0]);
        if(!_equalSizeStride(slice0, slice)) {
            mFuseRaster.first = 0;
            break;
        }
        if (slice0.src.stride[1] != slice.src.stride[1] || slice0.dst.stride[1] != slice.dst.stride[1]) {
            //MNN_PRINT("Raster total:%d, index:%d, src stride1:%d-%d, , dst stride1:%d-%d\n", mTempInputCopy.size(), i, slice.src.stride[1], slice0.src.stride[1], slice.dst.stride[1], slice0.dst.stride[1]);
            mFuseRaster.first = 0;
            break;
        }
        if (slice0.src.stride[2] != slice.src.stride[2] || slice0.dst.stride[2] != slice.dst.stride[2]) {
            //MNN_PRINT("Raster total:%d, index:%d, src stride2:%d-%d, , dst stride2:%d-%d\n", mTempInputCopy.size(), i, slice.src.stride[2], slice0.src.stride[2], slice.dst.stride[2], slice0.dst.stride[2]);
            mFuseRaster.first = 0;
            break;
        }
        if (slice0.size[0] != slice.size[0] || slice0.size[1] != slice.size[1] || slice0.size[2] != slice.size[2]) {
            //MNN_PRINT("Raster total:%d, index:%d, copy size:%d-%d-%d, %d-%d-%d\n", mTempInputCopy.size(), i, slice.size[0], slice.size[1], slice.size[2], slice0.size[0], slice0.size[1], slice0.size[2]);
            mFuseRaster.first = 0;
            break;
        }
    }
}

@@ -284,7 +365,29 @@ ErrorCode RasterExecution::onResize(const std::vector<Tensor *> &inputs, const s
    return NO_ERROR;
}

void RasterExecution::executeFaster(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) const {
    auto bn = static_cast<CUDABackend*>(backend());
    auto input = inputs[0];
    auto output = outputs[0];
    auto bytes = bn->getBytes(output);
    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
    if (mNeedZero) {
        auto size = static_cast<CUDABackend*>(backend())->realSize(output) * bytes;
        cudaMemset((uint8_t*)output->deviceId(), 0, size);
    }
    // Use mFastBlit
    for (auto& iter : mFastBlit) {
        auto srcPtr = (uint8_t*)iter.first->deviceId() + iter.second.src.offset * bytes;
        auto dstPtr = (uint8_t*)output->deviceId() + iter.second.dst.offset * bytes;
        RasterBlit(dstPtr, srcPtr, iter.second.size, iter.second.src.stride, iter.second.dst.stride, bytes, runtime);
    }
}
ErrorCode RasterExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    if (mFast) {
        executeFaster(inputs, outputs);
        return NO_ERROR;
    }
    auto bn = static_cast<CUDABackend*>(backend());
    auto input = inputs[0];
    auto output = outputs[0];

@@ -27,7 +27,7 @@ __global__ void GemmPackedFull(const MatMulParam* param, const int iBlock, T *c,
    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::col_major>
        b_frag;
    wmma::fragment<wmma::accumulator, 16, 16, 16, T> acc_frag;

    for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
        size_t subIndex = index / warpSize;
        size_t warpM = subIndex % eU;

@@ -39,9 +39,6 @@ __global__ void GemmPackedFull(const MatMulParam* param, const int iBlock, T *c,
        //printf("GemmPacked: %d - %d - %d, numele: %d, %d\n", eU, lU, hU, a_frag.num_elements, b_frag.num_elements);
        // MLA
        for (size_t i = 0; i < lU; ++i) {
            half* aTemp = ((half *)(aStart+i*256));//aStart + (i << 8) + (laneId << 1);
            half* bTemp = ((half *)(bStart+i*256));//bStart + (i << 8) + (laneId << 1);

            wmma::load_matrix_sync(a_frag, aStart + i * 256, 16);
            wmma::load_matrix_sync(b_frag, bStart + i * 256, 16);
            wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);

@@ -99,7 +96,10 @@ __global__ void GemmPackedFull16x32(const MatMulParam* param, const int iBlock,
    size_t wrapId = threadIdx.x / warpSize;
    size_t laneId = threadIdx.x % warpSize;
    extern __shared__ float sharedMemory[];
    T* cache = (T*)(sharedMemory + wrapId * 16 * 32);
    constexpr int row = 17;
    constexpr int conflict_free_size = 24;
    constexpr int offset = 16;
    T* cache = (T*)(sharedMemory + wrapId * conflict_free_size * (16 + row));
    for (size_t index = blockIdx.x * threadCount + wrapId; index < maxCount; index += gridDim.x * threadCount) {
        size_t warpM = index % eU;
        size_t warpN = index / eU;

@@ -143,18 +143,18 @@ __global__ void GemmPackedFull16x32(const MatMulParam* param, const int iBlock,
        T* dst0 = (T*)(c + warpN * 32 * (size_t)param->elh[0] + eSta * 16);
        T* dst1 = (T*)(c + (warpN * 32 + 16) * (size_t)param->elh[0] + eSta * 16);
        // First 8x32
        wmma::store_matrix_sync(cache, MC0, 16, wmma::mem_row_major);
        wmma::store_matrix_sync(cache, MC0, conflict_free_size, wmma::mem_row_major);
        // Second 8x32
        wmma::store_matrix_sync(cache + 256, MC1, 16, wmma::mem_row_major);
        wmma::store_matrix_sync(cache + 16 * conflict_free_size + offset, MC1, conflict_free_size, wmma::mem_row_major);
        auto dst = dst0;
        auto src = cache;
        if (laneId >= 16) {
            dst = dst1;
            src = cache + 256;
            src = cache + 16 * conflict_free_size + offset;
        }
        size_t x = laneId % 16;
        for (size_t y = 0; y < eC; ++y) {
            dst[y * 16 + x] = src[y * 16 + x];
            dst[y * 16 + x] = src[y * conflict_free_size + x];
        }
    }
}
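The change from a dense 16-element row stride to conflict_free_size = 24 pads each shared-memory row so the per-lane column walk below it spreads across more banks. The standalone program prints which bank the first element of each row maps to under both strides (assuming the usual 32 four-byte banks; a sketch of the reasoning, not a profiler measurement):

#include <cstdio>

int main() {
    const int strides[2] = {16, 24};
    for (int s = 0; s < 2; ++s) {
        printf("stride %2d:", strides[s]);
        for (int r = 0; r < 4; ++r) {
            // bank of element (r, 0) when rows are strides[s] floats apart
            printf("  row %d -> bank %2d", r, (r * strides[s]) % 32);
        }
        printf("\n");
    }
    // stride 16 alternates between banks 0 and 16; stride 24 visits 0, 24, 16, 8.
    return 0;
}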
|
||||
|
|
@ -169,7 +169,10 @@ __global__ void GemmPackedFull32x16(const MatMulParam* param, const int iBlock,
|
|||
size_t wrapId = threadIdx.x / warpSize;
|
||||
size_t laneId = threadIdx.x % warpSize;
|
||||
extern __shared__ float sharedMemory[];
|
||||
T* cache = (T*)(sharedMemory + wrapId * 32 * 16);
|
||||
constexpr int row = 17;
|
||||
constexpr int conflict_free_size = 24;
|
||||
constexpr int offset = 16;
|
||||
T* cache = (T*)(sharedMemory + wrapId * (16 + row) * conflict_free_size);
|
||||
for (size_t index = blockIdx.x * threadCount + wrapId; index < maxCount; index += gridDim.x * threadCount) {
|
||||
size_t warpN = index % hU;
|
||||
size_t warpM = index / hU;
|
||||
|
|
@ -216,18 +219,18 @@ __global__ void GemmPackedFull32x16(const MatMulParam* param, const int iBlock,
|
|||
T* dst0 = (T*)(c + warpN * 16 * (size_t)param->elh[0] + eSta * 16);
|
||||
T* dst1 = (T*)(dst0 + 256);
|
||||
// First 8x32
|
||||
wmma::store_matrix_sync(cache, MC0, 16, wmma::mem_row_major);
|
||||
wmma::store_matrix_sync(cache, MC0, conflict_free_size, wmma::mem_row_major);
|
||||
// Second 8x32
|
||||
wmma::store_matrix_sync(cache + 256, MC1, 16, wmma::mem_row_major);
|
||||
wmma::store_matrix_sync(cache + 16 * conflict_free_size + offset, MC1, conflict_free_size, wmma::mem_row_major);
|
||||
auto dst = dst0;
|
||||
auto src = cache;
|
||||
if (laneId >= 16) {
|
||||
dst = dst1;
|
||||
src = cache + 256;
|
||||
src = cache + 16 * conflict_free_size + offset;
|
||||
}
|
||||
size_t x = laneId % 16;
|
||||
for (size_t y = 0; y < eC; ++y) {
|
||||
dst[y * 16 + x] = src[y * 16 + x];
|
||||
dst[y * 16 + x] = src[y * conflict_free_size + x];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|

@@ -261,10 +264,14 @@ void GemmPacked16x32(CUDARuntime* runtime, const MatMulParam* cpuParam, const Ma
{
    int hUP = cpuParam->elhPack[2];
    int maxThreadInWarp = UP_DIV(cpuParam->elhPack[0] * hUP, cores);
-    int threads_num = ALIMIN(512, maxThreadInWarp * prop.warpSize);
+    constexpr int max_threadblock = 512;
+    int threads_num = ALIMIN(max_threadblock, maxThreadInWarp * prop.warpSize);
    //MNN_PRINT("GemmPacked16x32:%d-%d-%d-%d-%d\n\n", hUP, cpuParam->elhPack[0], cpuParam->elhPack[2], cpuParam->elhPack[0]*cpuParam->elhPack[2], threads_num);
    threads_num = ALIMIN(prop.maxThreadsPerBlock, threads_num);
-    int basicMemory = 32 * 16 * sizeof(float) * (threads_num / prop.warpSize);
+    constexpr int row = 17;
+    constexpr int conflict_free_size = 24;
+    int basicMemory = (16 + row) * conflict_free_size * sizeof(float) * (threads_num / prop.warpSize);
+    // MNN_PRINT("GemmPacked16x32 basicMemory byte size:%d\n", basicMemory);
    if (4 == bytes) {
        cudaFuncSetAttribute(GemmPackedFull16x32<float>, cudaFuncAttributeMaxDynamicSharedMemorySize, basicMemory);
        GemmPackedFull16x32<<<cores, threads_num, basicMemory>>>(param, iBlock, (float*)c, a, b, (float*)biasPtr);
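
Because the padded per-warp cache grows (from 32*16 floats, 2 KB, to (16+17)*24 floats, roughly 3.1 KB), a block full of warps can request more dynamic shared memory than the 48 KB default opt-in limit on some architectures; the cudaFuncSetAttribute call above is what raises that ceiling before launch. The general pattern, as a hedged sketch (myKernel is a stand-in name, not part of the diff):

    #include <cuda_runtime.h>

    __global__ void myKernel(float* out) {
        extern __shared__ float smem[];          // dynamic shared memory
        smem[threadIdx.x] = (float)threadIdx.x;
        __syncthreads();
        out[blockIdx.x * blockDim.x + threadIdx.x] = smem[threadIdx.x];
    }

    void launchWithBigSmem(float* out, int blocks, int threads, int smemBytes) {
        // Required when smemBytes exceeds the default dynamic limit (48 KB);
        // without it the launch fails instead of silently spilling.
        cudaFuncSetAttribute(myKernel,
                             cudaFuncAttributeMaxDynamicSharedMemorySize, smemBytes);
        myKernel<<<blocks, threads, smemBytes>>>(out);
    }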

@@ -284,10 +291,13 @@ void GemmPacked32x16(CUDARuntime* runtime, const MatMulParam* cpuParam, const Ma
{
    int eUP = cpuParam->elhPack[0];
    int maxThreadInWarp = UP_DIV(eUP * cpuParam->elhPack[2], cores);
-    int threads_num = ALIMIN(512, maxThreadInWarp * prop.warpSize);
+    constexpr int max_threadblock = 512;
+    int threads_num = ALIMIN(max_threadblock, maxThreadInWarp * prop.warpSize);
    //MNN_PRINT("GemmPacked32x16:%d-%d-%d-%d-%d\n\n", eUP, cpuParam->elhPack[0], cpuParam->elhPack[2], cpuParam->elhPack[0]*cpuParam->elhPack[2], threads_num);
    threads_num = ALIMIN(prop.maxThreadsPerBlock, threads_num);
-    int basicMemory = 32 * 16 * sizeof(float) * (threads_num / prop.warpSize);
+    constexpr int row = 17;
+    constexpr int conflict_free_size = 24;
+    int basicMemory = (16 + row) * conflict_free_size * sizeof(float) * (threads_num / prop.warpSize);
    if (4 == bytes) {
        cudaFuncSetAttribute(GemmPackedFull32x16<float>, cudaFuncAttributeMaxDynamicSharedMemorySize, basicMemory);
        GemmPackedFull32x16<<<cores, threads_num, basicMemory>>>(param, iBlock, (float*)c, a, b, (float*)biasPtr);

@@ -292,19 +292,65 @@ __global__ void NCHW_2_NHWC8(const T0* input,
    const int maxCount,
    const int channel,
    const int area,
-    const int channel_pack
+    const int channel_pack,
+    DivModFast d_ocp,
+    DivModFast d_area
    ) {
    for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
-        int chnlp_idx = index % channel_pack;
-        int temp = index / channel_pack;
-        int area_idx = temp % area;
-        int batch_idx = temp / area;
+        int area_idx, temp, chnlp_idx, batch_idx;
+        d_ocp.divmod(index, temp, chnlp_idx);
+        d_area.divmod(temp, batch_idx, area_idx);

        if(chnlp_idx >= channel) {
            output[index] = (T1)0.0f;
            continue;
        }
-        output[index] = (T1)input[(batch_idx * channel + chnlp_idx) * area + area_idx];
+        int src_offset = (batch_idx * channel + chnlp_idx) * area + area_idx;
+        output[index] = (T1)input[src_offset];
    }
}

+template<typename T0, typename T1>
+__global__ void NCHW_2_NHWC(const T0* input,
+    T1* output,
+    const int maxCount,
+    const int channel,
+    const int area,
+    const int channel_pack,
+    DivModFast d_oc,
+    DivModFast d_area
+    ) {
+    for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
+        int area_idx, temp, chnl_idx, batch_idx;
+        d_oc.divmod(index, temp, chnl_idx);
+        d_area.divmod(temp, batch_idx, area_idx);
+
+        int src_offset = (batch_idx * channel + chnl_idx) * area + area_idx;
+        output[index] = (T1)input[src_offset];
+    }
+}
+
+template<typename T0, typename T1>
+__global__ void NHWC_2_NHWC8(const T0* input,
+    T1* output,
+    const int maxCount,
+    const int channel,
+    const int area,
+    const int channel_pack,
+    DivModFast d_ocp,
+    DivModFast d_area
+    ) {
+    for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
+        int area_idx, temp, chnlp_idx, batch_idx;
+        d_ocp.divmod(index, temp, chnlp_idx);
+        d_area.divmod(temp, batch_idx, area_idx);
+
+        if(chnlp_idx >= channel) {
+            output[index] = (T1)0.0f;
+            continue;
+        }
+        int src_offset = (batch_idx * area + area_idx) * channel + chnlp_idx;
+        output[index] = (T1)input[src_offset];
+    }
+}
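
All of the `index % d` / `index / d` pairs in these layout kernels are replaced by DivModFast objects built once on the host and passed to the kernel by value; integer division is expensive on GPUs, so precomputing per-divisor state (typically a magic-number multiply and shift) pays off in index-heavy kernels. An interface sketch, assuming MNN's DivModFast follows that scheme (the body below uses plain division for clarity and is not MNN's implementation):

    // Illustrative stand-in for DivModFast.
    struct DivModFastSketch {
        explicit DivModFastSketch(int d) : d_(d) { /* precompute magic + shift here */ }
        // q = v / d, r = v % d; the real class would do (v * magic) >> shift.
        void divmod(int v, int& q, int& r) const {
            q = v / d_;
            r = v - q * d_;
        }
        int d_;
    };

    // Mirrors the kernel's decomposition of a flat NHWC8 index:
    //   d_ocp.divmod(index, temp, chnlp_idx);     // split off padded channel
    //   d_area.divmod(temp, batch_idx, area_idx); // split area from batch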

@@ -314,17 +360,40 @@ __global__ void NHWC8_2_NCHW(const T0* input,
    const int maxCount,
    const int channel,
    const int area,
-    const int channel_pack
+    const int channel_pack,
+    DivModFast d_oc,
+    DivModFast d_area
    ) {
    for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
-        int area_idx = index % area;
-        int temp = index / area;
-        int channel_idx = temp % channel;
-        int batch_idx = temp / channel;
-
-        output[index] = (T1)input[(batch_idx * area + area_idx) * channel_pack + channel_idx];
+        int area_idx, temp, channel_idx, batch_idx;
+        d_area.divmod(index, temp, area_idx);
+        d_oc.divmod(temp, batch_idx, channel_idx);
+
+        int src_offset = (batch_idx * area + area_idx) * channel_pack + channel_idx;
+        output[index] = (T1)input[src_offset];
    }
}

+template<typename T0, typename T1>
+__global__ void NHWC8_2_NHWC(const T0* input,
+    T1* output,
+    const int maxCount,
+    const int channel,
+    const int area,
+    const int channel_pack,
+    DivModFast d_oc,
+    DivModFast d_area
+    ) {
+    for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
+
+        int area_idx, temp, channel_idx, batch_idx;
+        d_oc.divmod(index, temp, channel_idx);
+        d_area.divmod(temp, batch_idx, area_idx);
+
+        int src_offset = (batch_idx * area + area_idx) * channel_pack + channel_idx;
+        output[index] = (T1)input[src_offset];
+    }
+}
+
template<typename T0, typename T1>

@@ -383,11 +452,25 @@ __global__ void NHWC8_2_C4NHW4(const T0* input,
template<class T0, class T1>
static void insideFormatConvert(T0* input, T1* output, MNN_DATA_FORMAT srcDataFormat, MNN_DATA_FORMAT dstDataFormat, CUDARuntime* runtime, \
    const int area, const int batch, const int channel) {
+    DivModFast d_oc(channel);
+    DivModFast d_ocp(UP_DIV(channel, 8) * 8);
+    DivModFast d_area(area);
+
    if(srcDataFormat == MNN_DATA_FORMAT_NCHW && dstDataFormat == MNN_DATA_FORMAT_NC4HW4) {
        const int maxCount = batch * area * UP_DIV(channel, 8) * 8;
        const int block_num = runtime->blocks_num(maxCount);
        const int block_size = runtime->threads_num();
-        NCHW_2_NHWC8<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8);
+        NCHW_2_NHWC8<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8,
+            d_ocp, d_area);
        checkKernelErrors;
        return;
    }
+    if(srcDataFormat == MNN_DATA_FORMAT_NHWC && dstDataFormat == MNN_DATA_FORMAT_NC4HW4) {
+        const int maxCount = batch * area * UP_DIV(channel, 8) * 8;
+        const int block_num = runtime->blocks_num(maxCount);
+        const int block_size = runtime->threads_num();
+        NHWC_2_NHWC8<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8,
+            d_ocp, d_area);
+        checkKernelErrors;
+        return;
+    }

@@ -404,7 +487,26 @@ static void insideFormatConvert(T0* input, T1* output, MNN_DATA_FORMAT srcDataFo
        const int maxCount = batch * area * channel;
        const int block_num = runtime->blocks_num(maxCount);
        const int block_size = runtime->threads_num();
-        NHWC8_2_NCHW<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8);
+        NHWC8_2_NCHW<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8,
+            d_oc, d_area);
        checkKernelErrors;
        return;
    }
+    if(srcDataFormat == MNN_DATA_FORMAT_NC4HW4 && dstDataFormat == MNN_DATA_FORMAT_NHWC) {
+        const int maxCount = batch * area * channel;
+        const int block_num = runtime->blocks_num(maxCount);
+        const int block_size = runtime->threads_num();
+        NHWC8_2_NHWC<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8,
+            d_oc, d_area);
+        checkKernelErrors;
+        return;
+    }
+    if(srcDataFormat == MNN_DATA_FORMAT_NCHW && dstDataFormat == MNN_DATA_FORMAT_NHWC) {
+        const int maxCount = batch * area * channel;
+        const int block_num = runtime->blocks_num(maxCount);
+        const int block_size = runtime->threads_num();
+        NCHW_2_NHWC<T0, T1><<<block_num, block_size>>>(input, output, maxCount, channel, area, UP_DIV(channel, 8) * 8,
+            d_oc, d_area);
+        checkKernelErrors;
+        return;
+    }
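
For reference, NCHW stores a tensor as [batch][channel][area], NHWC as [batch][area][channel], and the NC4HW4 format is realized by these CUDA kernels as NHWC8, i.e. NHWC with the channel dimension zero-padded up to a multiple of 8. A host-side C++ reference for the NCHW to NHWC8 mapping the kernels implement (illustrative):

    #include <vector>

    std::vector<float> nchwToNhwc8(const std::vector<float>& in,
                                   int batch, int channel, int area) {
        const int cp = (channel + 7) / 8 * 8;            // padded channel count
        std::vector<float> out((size_t)batch * area * cp, 0.0f);
        for (int b = 0; b < batch; ++b)
            for (int a = 0; a < area; ++a)
                for (int c = 0; c < channel; ++c)        // c in [channel, cp) stays 0
                    out[((size_t)b * area + a) * cp + c] =
                        in[((size_t)b * channel + c) * area + a];
        return out;
    }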

@@ -16,45 +16,29 @@ __global__ void WinoInputTrans(const T* input,
    const int unit,
    const int block,
    const int ci,
-    const int ciDiv,
-    const int batch,
-    const int wLen,
-    const int hLen,
-    const int ePack,
-    const int lPack,
+    const int ci_p8,
+    const int maxCount,
+    DivModFast lD,
+    DivModFast whD,
+    DivModFast wD,
    const int pad_x,
    const int pad_y,
    const int width,
    const int height
    ) {
-    const int eU = (batch * wLen * hLen + 15) / 16;
-    const int lU = ciDiv;
-    const int eP = eU * ePack;
-    const int lP = lU * lPack;
-    const int maxCount = eP * lP;
-    const int l = ci_p8;
    for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += gridDim.x * blockDim.x) {
-        const int eU_idx = index / (lP * ePack);
-        const int tmp1 = index % (lP * ePack);
-        const int lU_idx = tmp1 / (ePack * lPack);
-        const int tmp2 = tmp1 % (ePack * lPack);
-        const int ePack_idx = tmp2 / lPack;
-        const int lPack_idx = tmp2 % lPack;
-
-        const int e_idx = eU_idx * ePack + ePack_idx;
-        const int batch_idx = e_idx / (wLen * hLen);
-        const int tmp3 = e_idx % (wLen * hLen);
-        const int w_idx = tmp3 % wLen;
-        const int h_idx = tmp3 / wLen;
+        int e_idx, ci_idx, batch_idx, tmp, w_idx, h_idx;
+        lD.divmod(index, e_idx, ci_idx);
+        whD.divmod(e_idx, batch_idx, tmp);
+        wD.divmod(tmp, h_idx, w_idx);

        const int sxStart = w_idx * unit - pad_x;
        const int syStart = h_idx * unit - pad_y;

        T S00, S10, S20, S30, S01, S11, S21, S31, S02, S12, S22, S32, S03, S13, S23, S33;

-        int ci_p8 = ((ci + 7) / 8) * 8;
-        int ci_idx = lU_idx * lPack + lPack_idx;
        int inp_offset = ((batch_idx * height + syStart) * width + sxStart) * ci_p8 + ci_idx;
        //(((lU_idx * batch + batch_idx) * height + syStart) * width + sxStart) * 16 + lPack_idx;
        {
            int sx = 0 + sxStart;
            int sy = 0 + syStart;

@@ -74,14 +58,14 @@ __global__ void WinoInputTrans(const T* input,
            int sy = 0 + syStart;

            bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
-            S20 = outBound ? (T)(0) : input[inp_offset+2*ci_p8];
+            S20 = outBound ? (T)(0) : input[inp_offset+ci_p8+ci_p8];
        }
        {
            int sx = 3 + sxStart;
            int sy = 0 + syStart;

            bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
-            S30 = outBound ? (T)(0) : input[inp_offset+3*ci_p8];
+            S30 = outBound ? (T)(0) : input[inp_offset+ci_p8+ci_p8+ci_p8];
        }
        {
            int sx = 0 + sxStart;

@@ -116,85 +100,85 @@ __global__ void WinoInputTrans(const T* input,
            int sy = 2 + syStart;

            bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
-            S02 = outBound ? (T)(0) : input[inp_offset+(2*width+0)*ci_p8];
+            S02 = outBound ? (T)(0) : input[inp_offset+(width+width+0)*ci_p8];
        }
        {
            int sx = 1 + sxStart;
            int sy = 2 + syStart;

            bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
-            S12 = outBound ? (T)(0) : input[inp_offset+(2*width+1)*ci_p8];
+            S12 = outBound ? (T)(0) : input[inp_offset+(width+width+1)*ci_p8];
        }
        {
            int sx = 2 + sxStart;
            int sy = 2 + syStart;

            bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
-            S22 = outBound ? (T)(0) : input[inp_offset+(2*width+2)*ci_p8];
+            S22 = outBound ? (T)(0) : input[inp_offset+(width+width+2)*ci_p8];
        }
        {
            int sx = 3 + sxStart;
            int sy = 2 + syStart;

            bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
-            S32 = outBound ? (T)(0) : input[inp_offset+(2*width+3)*ci_p8];
+            S32 = outBound ? (T)(0) : input[inp_offset+(width+width+3)*ci_p8];
        }
        {
            int sx = 0 + sxStart;
            int sy = 3 + syStart;
            bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
-            S03 = outBound ? (T)(0) : input[inp_offset+(3*width+0)*ci_p8];
+            S03 = outBound ? (T)(0) : input[inp_offset+(width+width+width+0)*ci_p8];
        }
        {
            int sx = 1 + sxStart;
            int sy = 3 + syStart;
            bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
-            S13 = outBound ? (T)(0) : input[inp_offset+(3*width+1)*ci_p8];
+            S13 = outBound ? (T)(0) : input[inp_offset+(width+width+width+1)*ci_p8];
        }
        {
            int sx = 2 + sxStart;
            int sy = 3 + syStart;
            bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
-            S23 = outBound ? (T)(0) : input[inp_offset+(3*width+2)*ci_p8];
+            S23 = outBound ? (T)(0) : input[inp_offset+(width+width+width+2)*ci_p8];
        }
        {
            int sx = 3 + sxStart;
            int sy = 3 + syStart;
            bool outBound = (sx < 0 || sx >= width || sy < 0 || sy >= height || ci_idx >= ci);
-            S33 = outBound ? (T)(0) : input[inp_offset+(3*width+3)*ci_p8];
+            S33 = outBound ? (T)(0) : input[inp_offset+(width+width+width+3)*ci_p8];
        }
        T m00 = +S00 - S02;
        T m10 = +S10 - S12;
        T m20 = +S20 - S22;
        T m30 = +S30 - S32;
-        T m01 = +(T)0.5f * S01 + (T)0.5f * S02;
-        T m11 = +(T)0.5f * S11 + (T)0.5f * S12;
-        T m21 = +(T)0.5f * S21 + (T)0.5f * S22;
-        T m31 = +(T)0.5f * S31 + (T)0.5f * S32;
-        T m02 = -(T)0.5f * S01 + (T)0.5f * S02;
-        T m12 = -(T)0.5f * S11 + (T)0.5f * S12;
-        T m22 = -(T)0.5f * S21 + (T)0.5f * S22;
-        T m32 = -(T)0.5f * S31 + (T)0.5f * S32;
+        T m01 = +(T)0.5f * (S01 + S02);
+        T m11 = +(T)0.5f * (S11 + S12);
+        T m21 = +(T)0.5f * (S21 + S22);
+        T m31 = +(T)0.5f * (S31 + S32);
+        T m02 = +(T)0.5f * (-S01 + S02);
+        T m12 = +(T)0.5f * (-S11 + S12);
+        T m22 = +(T)0.5f * (-S21 + S22);
+        T m32 = +(T)0.5f * (-S31 + S32);
        T m03 = -S01 + S03;
        T m13 = -S11 + S13;
        T m23 = -S21 + S23;
        T m33 = -S31 + S33;

        BtdB[0*maxCount + index] = +m00 - m20;
-        BtdB[1*maxCount + index] = +(T)0.5f * m10 + (T)0.5f * m20;
-        BtdB[2*maxCount + index] = -(T)0.5f * m10 + (T)0.5f * m20;
+        BtdB[1*maxCount + index] = +(T)0.5f * (m10 + m20);
+        BtdB[2*maxCount + index] = +(T)0.5f * (-m10 + m20);
        BtdB[3*maxCount + index] = -m10 + m30;
        BtdB[4*maxCount + index] = +m01 - m21;
-        BtdB[5*maxCount + index] = +(T)0.5f * m11 + (T)0.5f * m21;
-        BtdB[6*maxCount + index] = -(T)0.5f * m11 + (T)0.5f * m21;
+        BtdB[5*maxCount + index] = +(T)0.5f * (m11 + m21);
+        BtdB[6*maxCount + index] = +(T)0.5f * (-m11 + m21);
        BtdB[7*maxCount + index] = -m11 + m31;
        BtdB[8*maxCount + index] = +m02 - m22;
-        BtdB[9*maxCount + index] = +(T)0.5f * m12 + (T)0.5f * m22;
-        BtdB[10*maxCount + index] = -(T)0.5f * m12 + (T)0.5f * m22;
+        BtdB[9*maxCount + index] = +(T)0.5f * (m12 + m22);
+        BtdB[10*maxCount + index] = +(T)0.5f * (-m12 + m22);
        BtdB[11*maxCount + index] = -m12 + m32;
        BtdB[12*maxCount + index] = +m03 - m23;
-        BtdB[13*maxCount + index] = +(T)0.5f * m13 + (T)0.5f * m23;
-        BtdB[14*maxCount + index] = -(T)0.5f * m13 + (T)0.5f * m23;
+        BtdB[13*maxCount + index] = +(T)0.5f * (m13 + m23);
+        BtdB[14*maxCount + index] = +(T)0.5f * (-m13 + m23);
        BtdB[15*maxCount + index] = -m13 + m33;
    }
}
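
For orientation, this kernel is the Winograd F(2x2, 3x3) input transform BtdB = B^T d B applied to each 4x4 input tile d; reading the coefficients off the code above, the (scaled) transform matrix is

    B^T = \begin{pmatrix}
        1 &  0        & -1        & 0 \\
        0 &  \tfrac12 &  \tfrac12 & 0 \\
        0 & -\tfrac12 &  \tfrac12 & 0 \\
        0 & -1        &  0        & 1
    \end{pmatrix}

and the refactor from `0.5f*a + 0.5f*b` to `0.5f*(a + b)` only regroups the arithmetic (one multiply per term instead of two), leaving the result unchanged up to floating-point rounding.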

@@ -208,48 +192,38 @@ __global__ void WinoTrans2Output(const T* matmulData,
    const int unit,
    const int block,
    const int co,
-    const int ciDiv,
-    const int batch,
-    const int wLen,
-    const int hLen,
-    const int ePack,
-    const int hPack,
+    const int co_p8,
+    const int maxCount,
+    DivModFast hD,
+    DivModFast whD,
+    DivModFast wD,
    const int width,
    const int height
    ) {
-    size_t e = param->elh[0];
-    size_t hU = param->elhPack[2];
-    const size_t maxCount = e * hU * hPack;
-
-    const int h = co_p8;
    for(size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += gridDim.x * blockDim.x) {
-
-        const int hU_idx = index / (e * hPack);
-        const int tmp1 = index % (e * hPack);
-        const int e_idx = tmp1 / hPack;
-        const int hPack_idx = tmp1 % hPack;
+        int e_idx, co_idx, batch_idx, tmp, w_idx, h_idx;
+        hD.divmod(index, e_idx, co_idx);
+        whD.divmod(e_idx, batch_idx, tmp);
+        wD.divmod(tmp, h_idx, w_idx);

-        const int batch_idx = e_idx / (wLen * hLen);
-        const int tmp3 = e_idx % (wLen * hLen);
-        const int w_idx = tmp3 % wLen;
-        const int h_idx = tmp3 / wLen;
-        int basic_offset = (hU_idx * e + e_idx) * hPack + hPack_idx;
-
-        auto S00 = matmulData[basic_offset];
-        auto S10 = matmulData[basic_offset + maxCount];
-        auto S20 = matmulData[basic_offset + maxCount * 2];
-        auto S30 = matmulData[basic_offset + maxCount * 3];
-        auto S01 = matmulData[basic_offset + maxCount * 4];
-        auto S11 = matmulData[basic_offset + maxCount * 5];
-        auto S21 = matmulData[basic_offset + maxCount * 6];
-        auto S31 = matmulData[basic_offset + maxCount * 7];
-        auto S02 = matmulData[basic_offset + maxCount * 8];
-        auto S12 = matmulData[basic_offset + maxCount * 9];
-        auto S22 = matmulData[basic_offset + maxCount * 10];
-        auto S32 = matmulData[basic_offset + maxCount * 11];
-        auto S03 = matmulData[basic_offset + maxCount * 12];
-        auto S13 = matmulData[basic_offset + maxCount * 13];
-        auto S23 = matmulData[basic_offset + maxCount * 14];
-        auto S33 = matmulData[basic_offset + maxCount * 15];
+        auto S00 = matmulData[index];
+        auto S10 = matmulData[index + maxCount];
+        auto S20 = matmulData[index + maxCount * 2];
+        auto S30 = matmulData[index + maxCount * 3];
+        auto S01 = matmulData[index + maxCount * 4];
+        auto S11 = matmulData[index + maxCount * 5];
+        auto S21 = matmulData[index + maxCount * 6];
+        auto S31 = matmulData[index + maxCount * 7];
+        auto S02 = matmulData[index + maxCount * 8];
+        auto S12 = matmulData[index + maxCount * 9];
+        auto S22 = matmulData[index + maxCount * 10];
+        auto S32 = matmulData[index + maxCount * 11];
+        auto S03 = matmulData[index + maxCount * 12];
+        auto S13 = matmulData[index + maxCount * 13];
+        auto S23 = matmulData[index + maxCount * 14];
+        auto S33 = matmulData[index + maxCount * 15];

        auto m00 = +S00 + S01 + S02;
        auto m10 = +S10 + S11 + S12;

@@ -261,18 +235,15 @@ __global__ void WinoTrans2Output(const T* matmulData,
        auto m31 = +S31 - S32 + S33;

        // write output
-        float bias = biasData[hU_idx * hPack + hPack_idx];
+        float bias = biasData[co_idx];

        const int dxStart = w_idx * unit;
        const int dyStart = h_idx * unit;

-        int co_p8 = ((co + 7) / 8) * 8;
-        int co_idx = hU_idx * hPack + hPack_idx;
-        if(co_idx >= co_p8) {
-            continue;
-        }
        int out_offset = ((batch_idx * height + dyStart) * width + dxStart) * co_p8 + co_idx;
        //(((hU_idx * batch + batch_idx) * height + dyStart) * width + dxStart) * 16 + hPack_idx;

        /* if true */ {
            float res = bias + (float)(m00 + m10 + m20);
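
The output side is the matching inverse transform Y = A^T m A plus bias; again reading the weights off the code (each output row combines four transformed values with 1, 1, 1, 0 and 0, 1, -1, 1):

    A^T = \begin{pmatrix} 1 & 1 & 1 & 0 \\ 0 & 1 & -1 & 1 \end{pmatrix}

The bias lookup also simplifies to biasData[co_idx] because co_idx now falls directly out of the divmod decomposition instead of being recomputed and bounds-checked from hU_idx and hPack_idx.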

@@ -16,56 +16,129 @@ namespace MNN {

void NPUBinary::OpInsert(int binary_type, string opName,
    ge::Operator& input0, ge::Operator& input1,
-    const std::vector<Tensor *> &outputs){
+    const std::vector<Tensor *> &outputs, int activationType){

    if(binary_type == BinaryOpOperation_ADD) {
        shared_ptr<ge::op::Add> binary(new ge::op::Add(opName));
        (*binary)
            .set_input_x1(input0)
            .set_input_x2(input1);
-        mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+
+        if(activationType == 1) {
+            shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
+            (*binary_activation)
+                .set_input_x(*binary.get())
+                .set_attr_mode(1);
+
+            mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
+        } else {
+            mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+        }
    } else if(binary_type == BinaryOpOperation_MUL) {
        shared_ptr<ge::op::Mul> binary(new ge::op::Mul(opName));
        (*binary)
            .set_input_x(input0)
            .set_input_y(input1);
-        mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+        if(activationType == 1) {
+            shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
+            (*binary_activation)
+                .set_input_x(*binary.get())
+                .set_attr_mode(1);
+
+            mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
+        } else {
+            mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+        }
    } else if(binary_type == BinaryOpOperation_REALDIV) {
        shared_ptr<ge::op::RealDiv> binary(new ge::op::RealDiv(opName));
        (*binary)
            .set_input_x1(input0)
            .set_input_x2(input1);
-        mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+        if(activationType == 1) {
+            shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
+            (*binary_activation)
+                .set_input_x(*binary.get())
+                .set_attr_mode(1);
+
+            mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
+        } else {
+            mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+        }
    } else if(binary_type == BinaryOpOperation_SUB) {
        shared_ptr<ge::op::Sub> binary(new ge::op::Sub(opName));
        (*binary)
            .set_input_x1(input0)
            .set_input_x2(input1);
-        mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+        if(activationType == 1) {
+            shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
+            (*binary_activation)
+                .set_input_x(*binary.get())
+                .set_attr_mode(1);
+
+            mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
+        } else {
+            mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+        }
    } else if(binary_type == BinaryOpOperation_MINIMUM) {
        shared_ptr<ge::op::Minimum> binary(new ge::op::Minimum(opName));
        (*binary)
            .set_input_x1(input0)
            .set_input_x2(input1);
-        mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+        if(activationType == 1) {
+            shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
+            (*binary_activation)
+                .set_input_x(*binary.get())
+                .set_attr_mode(1);
+
+            mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
+        } else {
+            mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+        }
    } else if(binary_type == BinaryOpOperation_MAXIMUM) {
        shared_ptr<ge::op::Maximum> binary(new ge::op::Maximum(opName));
        (*binary)
            .set_input_x1(input0)
            .set_input_x2(input1);
-        mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+        if(activationType == 1) {
+            shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
+            (*binary_activation)
+                .set_input_x(*binary.get())
+                .set_attr_mode(1);
+
+            mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
+        } else {
+            mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+        }
    } else if(binary_type == BinaryOpOperation_EQUAL) {
        shared_ptr<ge::op::Equal> binary(new ge::op::Equal(opName));
        (*binary)
            .set_input_x1(input0)
            .set_input_x2(input1);
-        mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+        if(activationType == 1) {
+            shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
+            (*binary_activation)
+                .set_input_x(*binary.get())
+                .set_attr_mode(1);
+
+            mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
+        } else {
+            mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+        }
    } else if(binary_type == BinaryOpOperation_LESS_EQUAL) {
        shared_ptr<hiai::op::LessEqual> binary(new hiai::op::LessEqual(opName));
        (*binary)
            .set_input_x1(input0)
            .set_input_x2(input1);
-        mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+        if(activationType == 1) {
+            shared_ptr<ge::op::Activation> binary_activation(new ge::op::Activation(opName + "_Relu"));
+            (*binary_activation)
+                .set_input_x(*binary.get())
+                .set_attr_mode(1);
+
+            mNpuBackend->setOutputOps(mOp, {binary, binary_activation}, outputs);
+        } else {
+            mNpuBackend->setOutputOps(mOp, {binary}, outputs);
+        }
    }else{
        MNN_ERROR("npu binary not support type : %d \n", binary_type);
        MNN_ASSERT(false);
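
Every branch above repeats the same ten-line ReLU-append block, differing only in the op class; if more fused activations arrive, the duplication could be folded into a helper along these lines (a sketch only: appendActivation is hypothetical, and the setOutputOps signature is assumed from the call sites above):

    // Hypothetical helper: wrap a freshly built HiAI binary op in a
    // ge::op::Activation with mode 1 (ReLU) when the MNN op requests it.
    void appendActivation(NPUBackend* backend, const MNN::Op* op,
                          const std::string& opName,
                          std::shared_ptr<ge::Operator> binary,
                          const std::vector<MNN::Tensor*>& outputs,
                          int activationType) {
        if (activationType == 1) {
            auto relu = std::make_shared<ge::op::Activation>(opName + "_Relu");
            (*relu).set_input_x(*binary.get()).set_attr_mode(1);
            backend->setOutputOps(op, {binary, relu}, outputs);
        } else {
            backend->setOutputOps(op, {binary}, outputs);
        }
    }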

@@ -149,21 +222,21 @@ ErrorCode NPUBinary::onResize(const std::vector<Tensor *> &inputs, const std::ve

    vector<pair<shared_ptr<ge::Operator>, string>> ops;
    auto binary_type = mOp->main_as_BinaryOp()->opType();
-
+    int activationType = mOp->main_as_BinaryOp()->activationType();
    if(!isConst0 && isConst1){
        //
        auto inputIndex0 = mOp->inputIndexes()->data()[0];
        auto iops0 = mNpuBackend->mGrapMap[inputIndex0]; // x
        auto xOp0 = iops0.back().first;

-        OpInsert(binary_type, opName, *xOp0.get(), mConst, outputs);
+        OpInsert(binary_type, opName, *xOp0.get(), mConst, outputs, activationType);
    }else if(isConst0 && !isConst1){
        //
        auto inputIndex1 = mOp->inputIndexes()->data()[1];
        auto iops1 = mNpuBackend->mGrapMap[inputIndex1]; // x
        auto xOp1 = iops1.back().first;

-        OpInsert(binary_type, opName, mConst, *xOp1.get(), outputs);
+        OpInsert(binary_type, opName, mConst, *xOp1.get(), outputs, activationType);

    }else{

@@ -177,7 +250,7 @@ ErrorCode NPUBinary::onResize(const std::vector<Tensor *> &inputs, const std::ve
    auto iops1 = mNpuBackend->mGrapMap[inputIndex1]; // x
    auto xOp1 = iops1.back().first;

-    OpInsert(binary_type, opName, *xOp0.get(), *xOp1.get(), outputs);
+    OpInsert(binary_type, opName, *xOp0.get(), *xOp1.get(), outputs, activationType);

}

@@ -18,7 +18,7 @@ class NPUBinary : public NPUCommonExecution {
public:
    void OpInsert(int binary_type, string opName,
        ge::Operator& input0, ge::Operator& input1,
-        const std::vector<Tensor *> &outputs);
+        const std::vector<Tensor *> &outputs, int activationType);
    NPUBinary(Backend *b, const Op *op, const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
    ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
    virtual ~NPUBinary() = default;

@@ -2580,7 +2580,7 @@ const char* shader_MetalBinary_metal =
" int i0stride;\n"
" int i1stride;\n"
" int output_data_count;\n"
-" int output_width;\n"
+" int activationType;\n"
"};\n"
"#define define_op(op) "
"kernel void binary_##op##_x1(const device M *in0 [[buffer(0)]],"

@@ -2591,7 +2591,11 @@ const char* shader_MetalBinary_metal =
" if ((int)gid >= s.output_data_count) return;"
" auto V0=in0[s.i0stride*int(gid)];"
" auto V1=in1[s.i1stride*int(gid)];"
-" out[int(gid)]=op(V0,V1);"
+" auto val=op(V0,V1);"
+" if(s.activationType == 1) {"
+"   val=(val<(M)0 ? (M)0 : val);"
+" }"
+" out[int(gid)]=val;"
"}\n"
"static inline M add(M V1,M V2) {\n"
" return V1+V2;\n"

@@ -56,6 +56,9 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
            printf("Error Key = %s\n", iter.first.c_str());
            NSLog(@"Warning: Metallib Library error: %@", err);
        }
+        [libraryMap removeAllObjects];
+        libraryMap = nil;
+        return;
    }
    auto functionNames = [library functionNames];
    for(int i=0; i<functionNames.count ; i++) {

@@ -102,7 +105,8 @@ static void createLibrary(id<MTLDevice> device, NSMutableDictionary<NSString *,
    _caches = [NSMutableDictionary dictionary];
    _waitings = [NSMutableArray array];
    _isCommitEachShader = self.class.commit_frequent;
-    return nil != _library;
+
+    return (0 != [_library count]);
}

- (instancetype)init {

@@ -553,6 +553,7 @@ int MetalBackend::onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTen
    if (toCpu) {
        [ctx wait];
    }
+    mFrameEncodeCache = false;
    return 0;
}

@@ -872,7 +873,7 @@ void registerMetalRuntimeCreator() {
    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
    if (nil != device) {
        registerMetalOps();
-        MNNInsertExtraRuntimeCreator(MNN_FORWARD_METAL, new MetalRuntimeCreator(device), false);
+        MNNInsertExtraRuntimeCreator(MNN_FORWARD_METAL, new MetalRuntimeCreator(device), true);
    } else {
        MNN_ERROR("Init Metal Error\n");
    }

@@ -17,7 +17,7 @@ namespace MNN {

class MetalBinary : public Execution {
public:
-    MetalBinary(Backend *backend, std::string type);
+    MetalBinary(Backend *backend, std::string type, const MNN::Op *op);
    virtual ~MetalBinary() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

@@ -26,6 +26,7 @@ private:
    id<MTLBuffer> mConstBuffer;
    id<MTLComputePipelineState> mPipeline;
    std::pair<MTLSize, MTLSize> mThreads;
+    int mActivationType = 0;
};

} // namespace MNN

@@ -14,13 +14,14 @@
#if MNN_METAL_ENABLED
namespace MNN {

-MetalBinary::MetalBinary(Backend *backend, std::string type) : Execution(backend) {
+MetalBinary::MetalBinary(Backend *backend, std::string type, const MNN::Op *op) : Execution(backend) {
    auto mKernelName = "binary_" + type + "_x1";
    auto mtbn = static_cast<MetalBackend *>(backend);
    auto context = (__bridge MNNMetalContext *)mtbn->context();
    mConstBuffer = [context newDeviceBuffer:4 * sizeof(int) access:CPUWriteOnly];
    auto kn = [NSString stringWithCString:mKernelName.c_str() encoding:[NSString defaultCStringEncoding]];
    mPipeline = [context pipelineWithName:kn];
+    mActivationType = op->main_as_BinaryOp()->activationType();
}
ErrorCode MetalBinary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto backend = static_cast<MetalBackend *>(this->backend());

@@ -33,7 +34,7 @@ ErrorCode MetalBinary::onResize(const std::vector<Tensor *> &inputs, const std::
    ((int *)mConstBuffer.contents)[0] = input0_data_count == 1 ? 0 : 1;
    ((int *)mConstBuffer.contents)[1] = input1_data_count == 1 ? 0 : 1;
    ((int *)mConstBuffer.contents)[2] = outdatacount;
-    ((int *)mConstBuffer.contents)[3] = 0;
+    ((int *)mConstBuffer.contents)[3] = mActivationType;
    mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake(outdatacount, 1, 1)];
    return NO_ERROR;
}

@@ -94,7 +95,7 @@ public:
            FUNC_PRINT(binaryop->opType());
            return nullptr;
        }
-        return new MetalBinary(backend, type);
+        return new MetalBinary(backend, type, op);
    }
};
REGISTER_METAL_OP_CREATOR(MetalBinaryCreator, OpType_BinaryOp);

@@ -2,7 +2,7 @@ struct binary_op_shape {
    int i0stride;
    int i1stride;
    int output_data_count;
-    int output_width;
+    int activationType;
};
#define define_op(op) \
kernel void binary_##op##_x1(const device ftype *in0 [[buffer(0)]],\

@@ -13,7 +13,11 @@ kernel void binary_##op##_x1(const device ftype *in0 [[buffer(0)]],\
    if ((int)gid >= s.output_data_count) return;\
    auto value0 = in0[s.i0stride * int(gid)];\
    auto value1 = in1[s.i1stride * int(gid)];\
-    out[int(gid)] = op(value0, value1);\
+    auto val = op(value0, value1);\
+    if(s.activationType == 1) {\
+        val = (val < (ftype)0 ? (ftype)0 : val);\
+    }\
+    out[int(gid)] = val;\
}

static inline ftype add(ftype value1, ftype value2) {

@@ -0,0 +1,20 @@
# Android NN API
file(GLOB MNN_NNAPI_SRCS
    ${CMAKE_CURRENT_LIST_DIR}/backend/*.cpp
    ${CMAKE_CURRENT_LIST_DIR}/backend/*.hpp
    ${CMAKE_CURRENT_LIST_DIR}/execution/*.cpp
    ${CMAKE_CURRENT_LIST_DIR}/execution/*.hpp
)

add_library(
    MNN_NNAPI
    STATIC
    ${MNN_NNAPI_SRCS}
)

target_compile_options(MNN_NNAPI PRIVATE -DMNN_NNAPI_ENABLED=1)

target_include_directories(MNN_NNAPI PRIVATE
    ${CMAKE_CURRENT_LIST_DIR}/backend
    ${CMAKE_CURRENT_LIST_DIR}/execution
)
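
With this file in place, the backend compiles into a static MNN_NNAPI library that the top-level build links when the MNN_NNAPI option is enabled. A typical Android cross-build invocation might look like the following (the NDK toolchain flags are the usual MNN Android settings, shown as an assumed example, not taken from this commit):

    cmake .. -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
             -DANDROID_ABI=arm64-v8a -DMNN_NNAPI=ON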

@@ -0,0 +1,411 @@
//
//  NNAPIBackend.cpp
//  MNN
//
//  Created by MNN on 2021/09/05.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "NNAPIBackend.hpp"
#include "backend/cpu/CPUTensorConvert.hpp"
#include <core/Macro.h>
#include <core/TensorUtils.hpp>
#include <stdlib.h>
#include <mutex>
#include <MNN/AutoTime.hpp>
// #define NNAPI_DEBUG
// #define USE_NCHW

#define CHECK(func, ...)                                                    \
    do {                                                                    \
        const auto _status = (func(__VA_ARGS__));                           \
        if (_status != ANEURALNETWORKS_NO_ERROR) {                          \
            const auto ENUM_TO_STR = NNAPIEnumToString(_status);            \
            MNN_PRINT("[NNAPI] Error: %s when call " #func " at line %d.\n",\
                      ENUM_TO_STR.c_str(), __LINE__);                       \
        }                                                                   \
    } while (0)
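// Usage note (editorial): every raw NNAPI call below goes through this macro,
// e.g. CHECK(ANeuralNetworksModel_create_27, &mNNAPIModel); on a non-NO_ERROR
// status it logs the symbolic ResultCode (via NNAPIEnumToString) and the call
// site, and then continues; the macro does not abort.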

namespace MNN {
void registerNNAPIOps();
static inline std::map<OpType, NNAPIBackend::Creator*>* getCreatorMap() {
    static std::once_flag of;
    static std::map<OpType, NNAPIBackend::Creator*>* ret = nullptr;
    std::call_once(of, [&]() { ret = new std::map<OpType, NNAPIBackend::Creator*>; });
    return ret;
}

std::string NNAPIEnumToString(int code) {
    switch (code) {
#define ENUM_TO_STR(code) case ANEURALNETWORKS_##code: return #code
        // ResultCode begin
        ENUM_TO_STR(NO_ERROR);
        ENUM_TO_STR(OUT_OF_MEMORY);
        ENUM_TO_STR(INCOMPLETE);
        ENUM_TO_STR(UNEXPECTED_NULL);
        ENUM_TO_STR(BAD_DATA);
        ENUM_TO_STR(OP_FAILED);
        ENUM_TO_STR(BAD_STATE);
        ENUM_TO_STR(UNMAPPABLE);
        ENUM_TO_STR(OUTPUT_INSUFFICIENT_SIZE);
        ENUM_TO_STR(UNAVAILABLE_DEVICE);
        // ResultCode end
        default:
            return "UNKNOWN_ENUM";
#undef ENUM_TO_STR
    }
}
bool NNAPIBackend::addCreator(OpType t, Creator* c) {
    auto map = getCreatorMap();
    if (map->find(t) != map->end()) {
        MNN_PRINT("Error: %d type has been added\n", t);
        return false;
    }
    map->insert(std::make_pair(t, c));
    return true;
}

NNAPIBackend::NNAPIBackend(const NNAPIRuntime* runtime) : Backend(MNN_FORWARD_NN) {
    mNPURuntime = runtime;
    mPrecision  = mNPURuntime->mPrecision;
#ifdef USE_NCHW
    mNCHW = true;
#else
    mNCHW = false;
#endif
    MNN_PRINT("[NNAPI] DimensionFormat is %s\n", mNCHW ? "NCHW" : "NHWC");
    if (mNNAPIModel == nullptr) {
        CHECK(ANeuralNetworksModel_create_27, &mNNAPIModel);
    }
    if (mNNAPIDevices.empty()) {
        uint32_t numDevices = 0;
        CHECK(ANeuralNetworks_getDeviceCount_29, &numDevices);
        mNNAPIDevices.resize(numDevices);
        MNN_PRINT("[NNAPI] numDevices = %d\n", numDevices);
        for (int i = 0; i < numDevices; i++) {
            CHECK(ANeuralNetworks_getDevice_29, i, &mNNAPIDevices[i].device);
            CHECK(ANeuralNetworksDevice_getName_29, mNNAPIDevices[i].device, &mNNAPIDevices[i].name);
            CHECK(ANeuralNetworksDevice_getType_29, mNNAPIDevices[i].device, &mNNAPIDevices[i].type);
            MNN_PRINT("[NNAPI] device %d is : %s, %d\n", i, mNNAPIDevices[i].name, mNNAPIDevices[i].type);
        }
    }
}

NNAPIBackend::~NNAPIBackend() {
    ANeuralNetworksCompilation_free_27(mNNAPICompilation);
    ANeuralNetworksModel_free_27(mNNAPIModel);
}

Execution* NNAPIBackend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op) {
    auto map  = getCreatorMap();
    auto iter = map->find(op->type());
    if (iter == map->end()) {
        MNN_PRINT("[NNAPI] Don't support type %s.\n", MNN::EnumNameOpType(op->type()));
        return nullptr;
    }
    auto exe = iter->second->onCreate(inputs, outputs, op, this);
    if (nullptr == exe) {
        MNN_PRINT("[NNAPI] The Creator Don't support type %s.\n", MNN::EnumNameOpType(op->type()));
        return nullptr;
    }
    return exe;
}

void NNAPIBackend::onExecuteBegin() const {
}

void NNAPIBackend::onExecuteEnd() const {
    invokeModel();
}

Backend::MemObj* NNAPIBackend::onAcquire(const Tensor* tensor, StorageType storageType) {
    bool isInputCopy  = TensorUtils::getDescribe(tensor)->usage==Tensor::InsideDescribe::Usage::INPUT;
    bool isOutputCopy = TensorUtils::getDescribe(tensor)->usage==Tensor::InsideDescribe::Usage::OUTPUT;
    std::unique_ptr<Tensor> tensor_(new Tensor(tensor, mNCHW ? Tensor::DimensionType::CAFFE : Tensor::DimensionType::TENSORFLOW, true));
    if(isInputCopy){
        mInputTensors.push_back(tensor);
        mInputContentTensors.push_back(std::move(tensor_));
        mInputIdxMap.insert(std::make_pair(tensor, mInputIdxMap.size()));
    }
    if(isOutputCopy){
        mOutputTensors.push_back(tensor);
        mOutputContentTensors.push_back(std::move(tensor_));
        mOutputIdxMap.insert(std::make_pair(tensor, mOutputIdxMap.size()));
        // TensorUtils::getDescribe(tensor)->memoryType = Tensor::InsideDescribe::MEMORY_HOST;
        // const_cast<halide_buffer_t&>(tensor->buffer()).host = (uint8_t*)MNNMemoryAllocAlign(tensor->size(), MNN_MEMORY_ALIGN_DEFAULT);
        // MNN_ASSERT(tensor->buffer().host != nullptr);
    }
    getTensorIdx(tensor);
    // Don't need release
    return new Backend::MemObj;
}

bool NNAPIBackend::onClearBuffer() {
    mInputContentTensors.clear();
    mOutputContentTensors.clear();
    mInputTensors.clear();
    mOutputTensors.clear();
    mInputIdxMap.clear();
    mOutputIdxMap.clear();
    return true;
}

void NNAPIBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
    bool isInputCopy = TensorUtils::getDescribe(dstTensor)->usage==Tensor::InsideDescribe::Usage::INPUT;
    bool isOutputCopy = TensorUtils::getDescribe(srcTensor)->usage==Tensor::InsideDescribe::Usage::OUTPUT;
    bool isConst = TensorUtils::getDescribe(srcTensor)->usage==Tensor::InsideDescribe::Usage::CONSTANT || TensorUtils::getDescribe(dstTensor)->usage==Tensor::InsideDescribe::Usage::CONSTANT;

    if(isConst){ return; }

    if (isInputCopy) {
        const auto iter = mInputIdxMap.find(dstTensor);
        MNN_ASSERT(iter != mInputIdxMap.end());
        // memcpy((void*)&mInputTensors[iter->second], &srcTensor, sizeof(void*));
        auto code = CPUTensorConverter::convert(srcTensor, mInputContentTensors[iter->second].get());
        if (NO_ERROR != code) {
            MNN_ERROR("Error in NNAPIBackend::onCopyBuffer:convert\n");
        }
    } else if (isOutputCopy) {
        const auto iter = mOutputIdxMap.find(srcTensor);
        MNN_ASSERT(iter != mOutputIdxMap.end());
        // memcpy(dstTensor->host<void>(), srcTensor->host<void>(), std::min(srcTensor->size(), dstTensor->size()));
        auto code = CPUTensorConverter::convert(mOutputContentTensors[iter->second].get(), dstTensor);
        if (NO_ERROR != code) {
            MNN_ERROR("Error in NNAPIBackend::onCopyBuffer:convert\n");
        }
    }
}

void NNAPIBackend::onResizeBegin() {
}

void NNAPIBackend::onResizeEnd() {
    buildModel();
}

uint32_t NNAPIBackend::getTensorIdx(const Tensor* t) {
    const auto& iter = mTensorIdxMap.find(t);
    if (iter != mTensorIdxMap.end()) {
        return iter->second;
    }
    std::vector<uint32_t> dims;
    for (auto d : t->shape()) {
        dims.push_back(d);
    }
    std::vector<uint32_t> udims(dims.begin(), dims.end());
    if (TensorUtils::getDescribe(t)->dimensionFormat != MNN_DATA_FORMAT_NHWC && !mNCHW) {
        // NCHW -> NHWC
        udims[0] = dims[0];
        udims[1] = dims[2];
        udims[2] = dims[3];
        udims[3] = dims[1];
    }
    uint32_t idx = buildOperand(nullptr, 0, ANEURALNETWORKS_TENSOR_FLOAT32, udims);
    mTensorIdxMap.insert(std::make_pair(t, idx));
    return idx;
}
uint32_t NNAPIBackend::buildScalar(int scalar) {
    auto iter = mScalarIntMap.find(scalar);
    if (iter != mScalarIntMap.end()) {
        return iter->second;
    }
    auto scalarIdx = buildOperand(&scalar, 4, ANEURALNETWORKS_INT32);
    mScalarIntMap.insert(std::make_pair(scalar, scalarIdx));
    return scalarIdx;
}
uint32_t NNAPIBackend::buildScalar(bool scalar) {
    auto iter = mScalarBoolMap.find(scalar);
    if (iter != mScalarBoolMap.end()) {
        return iter->second;
    }
    uint8_t value = scalar;
    auto scalarIdx = buildOperand(&value, 1, ANEURALNETWORKS_BOOL);
    mScalarBoolMap.insert(std::make_pair(scalar, scalarIdx));
    return scalarIdx;
}
uint32_t NNAPIBackend::buildScalar(float scalar) {
    auto iter = mScalarFloatMap.find(scalar);
    if (iter != mScalarFloatMap.end()) {
        return iter->second;
    }
    auto scalarIdx = buildOperand(&scalar, 4, ANEURALNETWORKS_FLOAT32);
    mScalarFloatMap.insert(std::make_pair(scalar, scalarIdx));
    return scalarIdx;
}

uint32_t NNAPIBackend::buildOperand(const void* data, size_t size, OperandCode code, std::vector<uint32_t> dims) {
    ANeuralNetworksOperandType operandType {
        .type = code,
        .dimensionCount = static_cast<uint32_t>(dims.size()),
        .dimensions = dims.empty() ? nullptr : dims.data(),
        .scale = 0.0f,
        .zeroPoint = 0,
    };
    CHECK(ANeuralNetworksModel_addOperand_27, mNNAPIModel, &operandType);
    uint32_t operandIdx = mTensorIdx++;
#ifdef NNAPI_DEBUG
    MNN_PRINT("build operand : {\n");
    MNN_PRINT("\tidx : %d\n", operandIdx);
    MNN_PRINT("\tdata : %p\n", data);
    MNN_PRINT("\tsize : %d\n", size);
    MNN_PRINT("\ttype : %d\n", operandType.type);
    MNN_PRINT("\tdimensions : [ ");
    for (auto i : dims) MNN_PRINT("%d, ", i);
    MNN_PRINT("]\n}\n");
#endif
    if (data && size) {
        CHECK(ANeuralNetworksModel_setOperandValue_27, mNNAPIModel, operandIdx, data, size);
    }
    return operandIdx;
}

ErrorCode NNAPIBackend::buildOperation(int op, const std::vector<uint32_t> &inputs, const std::vector<uint32_t> &outputs, const char* name) {
#ifdef NNAPI_DEBUG
    MNN_PRINT("build operation : {\n");
    MNN_PRINT("\ttype : %d\n", op);
    MNN_PRINT("\tinputs : [ ");
    for (auto i : inputs) MNN_PRINT("%d, ", i);
    MNN_PRINT("]\n\toutputs : [ ");
    for (auto i : outputs) MNN_PRINT("%d, ", i);
    MNN_PRINT("]\n}\n");
#endif
    if (name) mOpNames.push_back(name);
    CHECK(ANeuralNetworksModel_addOperation_27,
          mNNAPIModel, op,
          inputs.size(), inputs.data(),
          outputs.size(), outputs.data());
    return NO_ERROR;
}

void NNAPIBackend::buildModel() {
    // set input and output of model
    std::vector<uint32_t> inputOperands(mInputTensors.size()), outputOperands(mOutputTensors.size());
    for (int i = 0; i < mInputTensors.size(); i++) {
        inputOperands[i] = getTensorIdx(mInputTensors[i]);
    }
    for (int i = 0; i < mOutputTensors.size(); i++) {
        outputOperands[i] = getTensorIdx(mOutputTensors[i]);
    }
#ifdef NNAPI_DEBUG
    MNN_PRINT("set model's inputs & outputs : {\n");
    MNN_PRINT("\tinputs : [ ");
    for (auto i : inputOperands) MNN_PRINT("%d, ", i);
    MNN_PRINT("]\n\toutputs : [ ");
    for (auto i : outputOperands) MNN_PRINT("%d, ", i);
    MNN_PRINT("]\n}\n");
#endif
    CHECK(ANeuralNetworksModel_identifyInputsAndOutputs_27,
          mNNAPIModel,
          inputOperands.size(),
          inputOperands.data(),
          outputOperands.size(),
          outputOperands.data());
    // segment fault
    CHECK(ANeuralNetworksModel_finish_27, mNNAPIModel);
    std::unique_ptr<bool[]> supports(new bool[mOpNames.size()]);
    int selectDeviceIdx = -1;
    for (int i = 0; i < mNNAPIDevices.size(); i++) {
        auto device = mNNAPIDevices[i].device;
        auto name   = mNNAPIDevices[i].name;
        auto type   = mNNAPIDevices[i].type;
        CHECK(ANeuralNetworksModel_getSupportedOperationsForDevices_29, mNNAPIModel, &device, 1, supports.get());
        MNN_PRINT("[NNAPI] device [%d : %s] supportOps = {\n", i, name);
        bool allsupport = true;
        for (int i = 0; i < mOpNames.size(); i++) {
            allsupport &= supports[i];
            MNN_PRINT("\t%s : %d\n", mOpNames[i], supports[i]);
        }
        MNN_PRINT("}\n");
        if (allsupport) {
            selectDeviceIdx = i;
            MNN_PRINT("[NNAPI] using device [%d : %s : %d].\n", i, name, type);
            break;
        }
    }
    MNN_PRINT("[NNAPI] using device [%d : %s].\n", selectDeviceIdx, mNNAPIDevices[selectDeviceIdx].name);
    CHECK(ANeuralNetworksCompilation_createForDevices_29, mNNAPIModel, &mNNAPIDevices[selectDeviceIdx].device, 1, &mNNAPICompilation);
    CHECK(ANeuralNetworksCompilation_setPreference_27, mNNAPICompilation, ANEURALNETWORKS_PREFER_SUSTAINED_SPEED);
    CHECK(ANeuralNetworksCompilation_finish_27, mNNAPICompilation);
    CHECK(ANeuralNetworksBurst_create_29, mNNAPICompilation, &mNNAPIBurst);
}

void NNAPIBackend::invokeModel() const {
// #define NNAPI_PROFILE
    ANeuralNetworksExecution *execution;
    CHECK(ANeuralNetworksExecution_create_27, mNNAPICompilation, &execution);
#ifdef NNAPI_PROFILE
    CHECK(ANeuralNetworksExecution_setMeasureTiming, execution, true);
#endif
    for (int i = 0; i < mInputTensors.size(); i++) {
        const void* data = mInputContentTensors[i]->host<void>();
        size_t size = mInputContentTensors[i]->size();
        CHECK(ANeuralNetworksExecution_setInput_27, execution, i, nullptr, data, size);
    }
    for (int i = 0; i < mOutputTensors.size(); i++) {
        void* data = mOutputContentTensors[i]->host<void>();
        size_t size = mOutputContentTensors[i]->size();
        CHECK(ANeuralNetworksExecution_setOutput_27, execution, i, nullptr, data, size);
    }
#if 0
    ANeuralNetworksEvent *event = nullptr;
    CHECK(ANeuralNetworksExecution_startCompute, execution, &event);
    CHECK(ANeuralNetworksEvent_wait, event);
    ANeuralNetworksEvent_free(event);
#else
    CHECK(ANeuralNetworksExecution_compute_29, execution);
    // CHECK(ANeuralNetworksExecution_burstCompute_29, execution, mNNAPIBurst);
#endif
#ifdef NNAPI_PROFILE
    uint64_t duration;
    CHECK(ANeuralNetworksExecution_getDuration, execution, ANEURALNETWORKS_DURATION_IN_DRIVER, &duration);
    if (duration != UINT64_MAX) MNN_PRINT("[NNAPI] driver time : %f ms\n", duration / 1000000.0);
    CHECK(ANeuralNetworksExecution_getDuration, execution, ANEURALNETWORKS_DURATION_ON_HARDWARE, &duration);
    if (duration != UINT64_MAX) MNN_PRINT("[NNAPI] hardware time : %f ms\n", duration / 1000000.0);
#endif
    ANeuralNetworksExecution_free_27(execution);
}

NNAPIRuntime::NNAPIRuntime(const Backend::Info& info) {
    mInfo = info;
    BackendConfig::PrecisionMode precision = BackendConfig::Precision_Normal;
    BackendConfig::PowerMode power         = BackendConfig::Power_Normal;
    if (nullptr != mInfo.user) {
        precision = mInfo.user->precision;
        power     = mInfo.user->power;
    }

    mPrecision = precision;
}

NNAPIRuntime::~NNAPIRuntime() {}

Backend* NNAPIRuntime::onCreate(const BackendConfig* config) const {
    return new NNAPIBackend(this);
}

void NNAPIRuntime::onGabageCollect(int level) {
    // nothing now
}
NNAPIRuntime::CompilerType NNAPIRuntime::onGetCompilerType() const {
    return Compiler_Geometry;
}

struct NNAPIBackendCreator : RuntimeCreator {

    virtual Runtime* onCreate(const Backend::Info& info) const override {
        return new NNAPIRuntime(info);
    }

    virtual bool onValid(Backend::Info& info) const override {
        return true;
    }
};

void registerNNAPIRuntimeCreator() {
    if (!loadNNAPISymbol()) {
        return;
    }
    registerNNAPIOps();
    MNNInsertExtraRuntimeCreator(MNN_FORWARD_NN, new NNAPIBackendCreator, true);
}
}
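
Taken together, the backend records input and output tensors in onAcquire, assembles and compiles the NNAPI model in onResizeEnd via buildModel, and replays it on every onExecuteEnd. From the API user's side nothing NNAPI-specific is needed beyond selecting the forward type; a minimal sketch (the model path is a placeholder):

    #include <MNN/Interpreter.hpp>

    int main() {
        auto net = MNN::Interpreter::createFromFile("model.mnn");  // placeholder path
        MNN::ScheduleConfig conf;
        conf.type = MNN_FORWARD_NN;          // request the NNAPI backend
        auto session = net->createSession(conf);
        // ... fill input tensors, then:
        net->runSession(session);            // triggers invokeModel() via onExecuteEnd
        return 0;
    }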

@@ -0,0 +1,134 @@
//
//  NNAPIBackend.hpp
//  MNN
//
//  Created by MNN on 2022/09/05.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef MNN_NNAPIBACKEND_H
#define MNN_NNAPIBACKEND_H

#include <stdio.h>
#include <map>
#include <memory>
#include <core/Backend.hpp>
#include <core/Execution.hpp>
#include <core/TensorUtils.hpp>
#include "MNN_generated.h"
#include "NNAPIDefine.hpp"
#include "NNAPISymbol.hpp"

namespace MNN {
class NNAPIRuntime : public Runtime {
public:
    NNAPIRuntime(const Backend::Info& info);
    virtual ~NNAPIRuntime();
    virtual CompilerType onGetCompilerType() const override;
    virtual Backend* onCreate(const BackendConfig* conf) const override;
    virtual void onGabageCollect(int level) override;
    virtual std::pair<const void*, size_t> onGetCache() override {
        return std::make_pair(mCacheBuffer, mCacheSize);
    }

private:
    Backend::Info mInfo;
    BackendConfig::PrecisionMode mPrecision;
    const void* mCacheBuffer = nullptr;
    size_t mCacheSize = 0;

    friend class NNAPIBackend;
};

class NNAPIBackend : public Backend {
public:

    NNAPIBackend(const NNAPIRuntime* runtime);
    virtual ~NNAPIBackend();

    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op) override;

    virtual void onExecuteBegin() const override;
    virtual void onExecuteEnd() const override;

    virtual Backend::MemObj* onAcquire(const Tensor* tensor, StorageType storageType) override;
    virtual bool onClearBuffer() override;
    virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const override;

    virtual void onResizeBegin() override;
    virtual void onResizeEnd() override;

public:
    class Creator {
    public:
        virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                    const MNN::Op* op, Backend* backend) const = 0;
    };
    static bool addCreator(OpType t, Creator* c);
    // NNAPI functions
    bool NCHW() { return mNCHW; }
    uint32_t getTensorIdx(const Tensor* t);
    uint32_t buildScalar(int scalar);
    uint32_t buildScalar(bool scalar);
    uint32_t buildScalar(float scalar);
    uint32_t buildOperand(const void* data, size_t size, OperandCode code, std::vector<uint32_t> dims = {});
    ErrorCode buildOperation(int op, const std::vector<uint32_t> &inputs, const std::vector<uint32_t> &outputs, const char* name = nullptr);
    void buildModel();
    void invokeModel() const;
private:
    bool mNCHW = false;
    std::vector<std::string> mModelName;
    const NNAPIRuntime* mNPURuntime;
    BackendConfig::PrecisionMode mPrecision;
    std::vector<std::unique_ptr<Tensor>> mInputContentTensors, mOutputContentTensors;
    std::vector<const Tensor*> mInputTensors, mOutputTensors;
    // tensor idx map
    std::map<const Tensor*, uint32_t> mTensorIdxMap, mInputIdxMap, mOutputIdxMap;
    uint32_t mTensorIdx = 0;
    std::vector<const char*> mOpNames;
    // scalar idx map
    std::map<int, uint32_t> mScalarIntMap;
    std::map<bool, uint32_t> mScalarBoolMap;
    std::map<float, uint32_t> mScalarFloatMap;
    // NNAPI resource
    struct NNAPIDevice {
        ANeuralNetworksDevice* device;
        const char* name;
        int32_t type;
    };
    std::vector<NNAPIDevice> mNNAPIDevices;
    ANeuralNetworksModel *mNNAPIModel = nullptr;
    ANeuralNetworksCompilation *mNNAPICompilation = nullptr;
    ANeuralNetworksBurst* mNNAPIBurst = NULL;
};

template <class T>
class NNAPICreatorRegister {
public:
    NNAPICreatorRegister(OpType type) {
        T *t = new T;
        NNAPIBackend::addCreator(type, t);
    }
    ~NNAPICreatorRegister() = default;
};

template <typename T>
class TypedCreator : public NNAPIBackend::Creator {
public:
    virtual ~TypedCreator() = default;
    virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs, const MNN::Op *op,
                                Backend *backend) const override {
        auto newOp = new T(backend, op, inputs, outputs);
        return newOp;
    }
};

#define REGISTER_NNAPI_OP_CREATOR(name, opType)   \
    void ___##name##__##opType##__() {            \
        static TypedCreator<name> _temp;          \
        NNAPIBackend::addCreator(opType, &_temp); \
    }

}

#endif //MNN_NNAPIBACKEND_H
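
The registration macro generates exactly the `___Name__OpType__` functions that the shell-generated register file further below calls; a concrete expansion, for the binary-op creator this commit registers (the NNAPIBinary execution class itself lives in the execution/ sources not shown in this excerpt):

    // REGISTER_NNAPI_OP_CREATOR(NNAPIBinary, OpType_BinaryOp) expands to:
    void ___NNAPIBinary__OpType_BinaryOp__() {
        static TypedCreator<NNAPIBinary> _temp;
        NNAPIBackend::addCreator(OpType_BinaryOp, &_temp);
    }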

@@ -0,0 +1,23 @@
//
//  NNAPIDefine.hpp
//  MNN
//
//  Created by MNN on 2022/09/05.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef NNAPIDefine_h
#define NNAPIDefine_h

#ifdef MNN_NNAPI_ENABLED
#ifdef __ANDROID__
#include "NNAPINeuralNetworks.h"
#define ANDROID_API_LEVEL (android_get_device_api_level())
#else
#undef MNN_NNAPI_ENABLED
#define MNN_NNAPI_ENABLED 0
#define ANDROID_API_LEVEL (0)
#endif
#endif

#endif /* NNAPIDefine_h */
[diff of one large file suppressed by the viewer; not shown]

@@ -0,0 +1,18 @@
// This file is generated by Shell for ops register
namespace MNN {
extern void ___NNAPIUnary__OpType_UnaryOp__();
extern void ___NNAPISoftmax__OpType_Softmax__();
extern void ___NNAPIConvolution__OpType_Convolution__();
extern void ___NNAPIConvolution__OpType_ConvolutionDepthwise__();
extern void ___NNAPIBinary__OpType_BinaryOp__();
extern void ___NNAPIPool__OpType_Pooling__();

void registerNNAPIOps() {
    ___NNAPIUnary__OpType_UnaryOp__();
    ___NNAPISoftmax__OpType_Softmax__();
    ___NNAPIConvolution__OpType_Convolution__();
    ___NNAPIConvolution__OpType_ConvolutionDepthwise__();
    ___NNAPIBinary__OpType_BinaryOp__();
    ___NNAPIPool__OpType_Pooling__();
}
}
@ -0,0 +1,103 @@
//
//  NNAPISymbol.cpp
//  MNN
//
//  Created by MNN on 2022/09/19.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "NNAPISymbol.hpp"
#include "NNAPIDefine.hpp"
#include <MNN/MNNDefine.h>
#include <dlfcn.h>

namespace MNN {

#define LOAD_SYM(NAME, API_LEVEL) \
    NAME ## _ ## API_LEVEL = reinterpret_cast<decltype(NAME ## _ ## API_LEVEL)>(dlsym(lib, #NAME)); \
    if (NAME ## _ ## API_LEVEL == nullptr) { \
        MNN_PRINT("[NNAPI] Load symbol %s failed.\n", #NAME); \
        return false; \
    }

bool loadNNAPISymbol() {
    if (ANDROID_API_LEVEL < 29) {
        return false;
    }
    void *lib = dlopen("libneuralnetworks.so", RTLD_NOW | RTLD_LOCAL);
    if (lib == nullptr) {
        return false;
    }
    LOAD_SYM(ANeuralNetworksModel_getSupportedOperationsForDevices, 29);
    LOAD_SYM(ANeuralNetworks_getDeviceCount, 29);
    LOAD_SYM(ANeuralNetworks_getDevice, 29);
    LOAD_SYM(ANeuralNetworksDevice_getName, 29);
    LOAD_SYM(ANeuralNetworksDevice_getType, 29);
    LOAD_SYM(ANeuralNetworksCompilation_createForDevices, 29);
    LOAD_SYM(ANeuralNetworksExecution_compute, 29);
    LOAD_SYM(ANeuralNetworksBurst_create, 29);
    LOAD_SYM(ANeuralNetworksBurst_free, 29);
    LOAD_SYM(ANeuralNetworksExecution_burstCompute, 29);
    LOAD_SYM(ANeuralNetworksModel_create, 27);
    LOAD_SYM(ANeuralNetworksModel_free, 27);
    LOAD_SYM(ANeuralNetworksModel_finish, 27);
    LOAD_SYM(ANeuralNetworksModel_addOperand, 27);
    LOAD_SYM(ANeuralNetworksModel_setOperandValue, 27);
    LOAD_SYM(ANeuralNetworksModel_setOperandSymmPerChannelQuantParams, 29);
    LOAD_SYM(ANeuralNetworksModel_addOperation, 27);
    LOAD_SYM(ANeuralNetworksModel_identifyInputsAndOutputs, 27);
    LOAD_SYM(ANeuralNetworksCompilation_create, 27);
    LOAD_SYM(ANeuralNetworksCompilation_free, 27);
    LOAD_SYM(ANeuralNetworksCompilation_setPreference, 27);
    LOAD_SYM(ANeuralNetworksCompilation_finish, 27);
    LOAD_SYM(ANeuralNetworksExecution_create, 27);
    LOAD_SYM(ANeuralNetworksExecution_free, 27);
    LOAD_SYM(ANeuralNetworksExecution_setInput, 27);
    LOAD_SYM(ANeuralNetworksExecution_setInputFromMemory, 27);
    LOAD_SYM(ANeuralNetworksExecution_setOutput, 27);
    LOAD_SYM(ANeuralNetworksExecution_setOutputFromMemory, 27);
    LOAD_SYM(ANeuralNetworksExecution_startCompute, 27);
    LOAD_SYM(ANeuralNetworksEvent_wait, 27);
    LOAD_SYM(ANeuralNetworksEvent_free, 27);
    LOAD_SYM(ANeuralNetworksDevice_getVersion, 29);
    LOAD_SYM(ANeuralNetworksMemory_createFromAHardwareBuffer, 29);
    LOAD_SYM(ANeuralNetworksMemory_createFromFd, 27);
    LOAD_SYM(ANeuralNetworksMemory_free, 27);
    return true;
}
MNN_ANeuralNetworksModel_getSupportedOperationsForDevices *ANeuralNetworksModel_getSupportedOperationsForDevices_29 = nullptr;
MNN_ANeuralNetworks_getDeviceCount *ANeuralNetworks_getDeviceCount_29 = nullptr;
MNN_ANeuralNetworks_getDevice *ANeuralNetworks_getDevice_29 = nullptr;
MNN_ANeuralNetworksDevice_getName *ANeuralNetworksDevice_getName_29 = nullptr;
MNN_ANeuralNetworksDevice_getType *ANeuralNetworksDevice_getType_29 = nullptr;
MNN_ANeuralNetworksCompilation_createForDevices *ANeuralNetworksCompilation_createForDevices_29 = nullptr;
MNN_ANeuralNetworksExecution_compute *ANeuralNetworksExecution_compute_29 = nullptr;
MNN_ANeuralNetworksBurst_create *ANeuralNetworksBurst_create_29 = nullptr;
MNN_ANeuralNetworksBurst_free *ANeuralNetworksBurst_free_29 = nullptr;
MNN_ANeuralNetworksExecution_burstCompute *ANeuralNetworksExecution_burstCompute_29 = nullptr;
MNN_ANeuralNetworksModel_create *ANeuralNetworksModel_create_27 = nullptr;
MNN_ANeuralNetworksModel_finish *ANeuralNetworksModel_finish_27 = nullptr;
MNN_ANeuralNetworksModel_free *ANeuralNetworksModel_free_27 = nullptr;
MNN_ANeuralNetworksModel_addOperand *ANeuralNetworksModel_addOperand_27 = nullptr;
MNN_ANeuralNetworksModel_setOperandValue *ANeuralNetworksModel_setOperandValue_27 = nullptr;
MNN_ANeuralNetworksModel_setOperandSymmPerChannelQuantParams *ANeuralNetworksModel_setOperandSymmPerChannelQuantParams_29 = nullptr;
MNN_ANeuralNetworksModel_addOperation *ANeuralNetworksModel_addOperation_27 = nullptr;
MNN_ANeuralNetworksModel_identifyInputsAndOutputs *ANeuralNetworksModel_identifyInputsAndOutputs_27 = nullptr;
MNN_ANeuralNetworksCompilation_create *ANeuralNetworksCompilation_create_27 = nullptr;
MNN_ANeuralNetworksCompilation_free *ANeuralNetworksCompilation_free_27 = nullptr;
MNN_ANeuralNetworksCompilation_setPreference *ANeuralNetworksCompilation_setPreference_27 = nullptr;
MNN_ANeuralNetworksCompilation_finish *ANeuralNetworksCompilation_finish_27 = nullptr;
MNN_ANeuralNetworksExecution_create *ANeuralNetworksExecution_create_27 = nullptr;
MNN_ANeuralNetworksExecution_free *ANeuralNetworksExecution_free_27 = nullptr;
MNN_ANeuralNetworksExecution_setInput *ANeuralNetworksExecution_setInput_27 = nullptr;
MNN_ANeuralNetworksExecution_setInputFromMemory *ANeuralNetworksExecution_setInputFromMemory_27 = nullptr;
MNN_ANeuralNetworksExecution_setOutput *ANeuralNetworksExecution_setOutput_27 = nullptr;
MNN_ANeuralNetworksExecution_setOutputFromMemory *ANeuralNetworksExecution_setOutputFromMemory_27 = nullptr;
MNN_ANeuralNetworksExecution_startCompute *ANeuralNetworksExecution_startCompute_27 = nullptr;
MNN_ANeuralNetworksEvent_wait *ANeuralNetworksEvent_wait_27 = nullptr;
MNN_ANeuralNetworksEvent_free *ANeuralNetworksEvent_free_27 = nullptr;
MNN_ANeuralNetworksDevice_getVersion *ANeuralNetworksDevice_getVersion_29 = nullptr;
MNN_ANeuralNetworksMemory_createFromAHardwareBuffer *ANeuralNetworksMemory_createFromAHardwareBuffer_29 = nullptr;
MNN_ANeuralNetworksMemory_createFromFd *ANeuralNetworksMemory_createFromFd_27 = nullptr;
MNN_ANeuralNetworksMemory_free *ANeuralNetworksMemory_free_27 = nullptr;
}
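Nothing here links against libneuralnetworks: every NNAPI call goes through a pointer filled in by dlsym, so MNN loads cleanly on devices without the library. A self-contained sketch of the same load-then-call pattern, resolving a libm symbol so it can run off-device (library name and symbol are illustrative; build with -ldl on glibc):

#include <cstdio>
#include <dlfcn.h>

typedef double (SymCos)(double);   // same style as the MNN_ANeuralNetworks* typedefs
static SymCos* cos_sym = nullptr;

static bool loadSymbols() {
    void* lib = dlopen("libm.so.6", RTLD_NOW | RTLD_LOCAL);  // stand-in for libneuralnetworks.so
    if (lib == nullptr) {
        return false;
    }
    cos_sym = reinterpret_cast<SymCos*>(dlsym(lib, "cos"));
    if (cos_sym == nullptr) {
        printf("load symbol cos failed\n");
        return false;
    }
    return true;
}

int main() {
    if (loadSymbols()) {
        printf("cos(0) = %f\n", cos_sym(0.0));  // call through the resolved pointer
    }
    return 0;
}

The _27/_29 suffixes in the real code record the first API level that ships each symbol, so a failed lookup disables the backend up front instead of crashing at call time.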
@ -0,0 +1,89 @@
//
//  NNAPISymbol.hpp
//  MNN
//
//  Created by MNN on 2022/09/19.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef NNAPISymbols_h
#define NNAPISymbols_h
#include "NNAPIDefine.hpp"

namespace MNN {
// typedefs for the NNAPI functions that will be resolved at runtime
typedef int (MNN_ANeuralNetworksModel_getSupportedOperationsForDevices)(const ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices, uint32_t numDevices, bool* supportedOps);
typedef int (MNN_ANeuralNetworks_getDeviceCount)(uint32_t* numDevices);
typedef int (MNN_ANeuralNetworks_getDevice)(uint32_t devIndex, ANeuralNetworksDevice** device);
typedef int (MNN_ANeuralNetworksDevice_getName)(const ANeuralNetworksDevice* device, const char** name);
typedef int (MNN_ANeuralNetworksDevice_getType)(const ANeuralNetworksDevice* device, int32_t* type);
typedef int (MNN_ANeuralNetworksCompilation_createForDevices)(ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices, uint32_t numDevices, ANeuralNetworksCompilation** compilation);
typedef int (MNN_ANeuralNetworksExecution_compute)(ANeuralNetworksExecution* execution);
typedef int (MNN_ANeuralNetworksBurst_create)(ANeuralNetworksCompilation* compilation, ANeuralNetworksBurst** burst);
typedef void (MNN_ANeuralNetworksBurst_free)(ANeuralNetworksBurst* burst);
typedef int (MNN_ANeuralNetworksExecution_burstCompute)(ANeuralNetworksExecution* execution, ANeuralNetworksBurst* burst);
typedef int (MNN_ANeuralNetworksModel_create)(ANeuralNetworksModel** model);
typedef void (MNN_ANeuralNetworksModel_free)(ANeuralNetworksModel* model);
typedef int (MNN_ANeuralNetworksModel_finish)(ANeuralNetworksModel* model);
typedef int (MNN_ANeuralNetworksModel_addOperand)(ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type);
typedef int (MNN_ANeuralNetworksModel_setOperandValue)(ANeuralNetworksModel* model, int32_t index, const void* buffer, size_t length);
typedef int (MNN_ANeuralNetworksModel_setOperandSymmPerChannelQuantParams)(ANeuralNetworksModel* model, int32_t index, const ANeuralNetworksSymmPerChannelQuantParams* channelQuant);
typedef int (MNN_ANeuralNetworksModel_addOperation)(ANeuralNetworksModel* model, ANeuralNetworksOperationType type, uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, const uint32_t* outputs);
typedef int (MNN_ANeuralNetworksModel_identifyInputsAndOutputs)(ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, const uint32_t* outputs);
typedef int (MNN_ANeuralNetworksCompilation_create)(ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation);
typedef void (MNN_ANeuralNetworksCompilation_free)(ANeuralNetworksCompilation* compilation);
typedef int (MNN_ANeuralNetworksCompilation_setPreference)(ANeuralNetworksCompilation* compilation, int32_t preference);
typedef int (MNN_ANeuralNetworksCompilation_finish)(ANeuralNetworksCompilation* compilation);
typedef int (MNN_ANeuralNetworksExecution_create)(ANeuralNetworksCompilation* compilation, ANeuralNetworksExecution** execution);
typedef void (MNN_ANeuralNetworksExecution_free)(ANeuralNetworksExecution* execution);
typedef int (MNN_ANeuralNetworksExecution_setInput)(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const void* buffer, size_t length);
typedef int (MNN_ANeuralNetworksExecution_setInputFromMemory)(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, size_t offset, size_t length);
typedef int (MNN_ANeuralNetworksExecution_setOutput)(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, void* buffer, size_t length);
typedef int (MNN_ANeuralNetworksExecution_setOutputFromMemory)(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, size_t offset, size_t length);
typedef int (MNN_ANeuralNetworksExecution_startCompute)(ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event);
typedef int (MNN_ANeuralNetworksEvent_wait)(ANeuralNetworksEvent* event);
typedef void (MNN_ANeuralNetworksEvent_free)(ANeuralNetworksEvent* event);
typedef int (MNN_ANeuralNetworksDevice_getVersion)(const ANeuralNetworksDevice* device, const char** version);
typedef int (MNN_ANeuralNetworksMemory_createFromAHardwareBuffer)(const AHardwareBuffer* ahwb, ANeuralNetworksMemory** memory);
typedef int (MNN_ANeuralNetworksMemory_createFromFd)(size_t size, int protect, int fd, size_t offset, ANeuralNetworksMemory **memory);
typedef void (MNN_ANeuralNetworksMemory_free)(ANeuralNetworksMemory* memory);

// symbols, defined in NNAPISymbol.cpp and filled in by loadNNAPISymbol()
bool loadNNAPISymbol();
extern MNN_ANeuralNetworksModel_getSupportedOperationsForDevices *ANeuralNetworksModel_getSupportedOperationsForDevices_29;
extern MNN_ANeuralNetworks_getDeviceCount *ANeuralNetworks_getDeviceCount_29;
extern MNN_ANeuralNetworks_getDevice *ANeuralNetworks_getDevice_29;
extern MNN_ANeuralNetworksDevice_getName *ANeuralNetworksDevice_getName_29;
extern MNN_ANeuralNetworksDevice_getType *ANeuralNetworksDevice_getType_29;
extern MNN_ANeuralNetworksCompilation_createForDevices *ANeuralNetworksCompilation_createForDevices_29;
extern MNN_ANeuralNetworksExecution_compute *ANeuralNetworksExecution_compute_29;
extern MNN_ANeuralNetworksBurst_create *ANeuralNetworksBurst_create_29;
extern MNN_ANeuralNetworksBurst_free *ANeuralNetworksBurst_free_29;
extern MNN_ANeuralNetworksExecution_burstCompute *ANeuralNetworksExecution_burstCompute_29;
extern MNN_ANeuralNetworksModel_create *ANeuralNetworksModel_create_27;
extern MNN_ANeuralNetworksModel_free *ANeuralNetworksModel_free_27;
extern MNN_ANeuralNetworksModel_finish *ANeuralNetworksModel_finish_27;
extern MNN_ANeuralNetworksModel_addOperand *ANeuralNetworksModel_addOperand_27;
extern MNN_ANeuralNetworksModel_setOperandValue *ANeuralNetworksModel_setOperandValue_27;
extern MNN_ANeuralNetworksModel_setOperandSymmPerChannelQuantParams *ANeuralNetworksModel_setOperandSymmPerChannelQuantParams_29;
extern MNN_ANeuralNetworksModel_addOperation *ANeuralNetworksModel_addOperation_27;
extern MNN_ANeuralNetworksModel_identifyInputsAndOutputs *ANeuralNetworksModel_identifyInputsAndOutputs_27;
extern MNN_ANeuralNetworksCompilation_create *ANeuralNetworksCompilation_create_27;
extern MNN_ANeuralNetworksCompilation_free *ANeuralNetworksCompilation_free_27;
extern MNN_ANeuralNetworksCompilation_setPreference *ANeuralNetworksCompilation_setPreference_27;
extern MNN_ANeuralNetworksCompilation_finish *ANeuralNetworksCompilation_finish_27;
extern MNN_ANeuralNetworksExecution_create *ANeuralNetworksExecution_create_27;
extern MNN_ANeuralNetworksExecution_free *ANeuralNetworksExecution_free_27;
extern MNN_ANeuralNetworksExecution_setInput *ANeuralNetworksExecution_setInput_27;
extern MNN_ANeuralNetworksExecution_setInputFromMemory *ANeuralNetworksExecution_setInputFromMemory_27;
extern MNN_ANeuralNetworksExecution_setOutput *ANeuralNetworksExecution_setOutput_27;
extern MNN_ANeuralNetworksExecution_setOutputFromMemory *ANeuralNetworksExecution_setOutputFromMemory_27;
extern MNN_ANeuralNetworksExecution_startCompute *ANeuralNetworksExecution_startCompute_27;
extern MNN_ANeuralNetworksEvent_wait *ANeuralNetworksEvent_wait_27;
extern MNN_ANeuralNetworksEvent_free *ANeuralNetworksEvent_free_27;
extern MNN_ANeuralNetworksDevice_getVersion *ANeuralNetworksDevice_getVersion_29;
extern MNN_ANeuralNetworksMemory_createFromAHardwareBuffer *ANeuralNetworksMemory_createFromAHardwareBuffer_29;
extern MNN_ANeuralNetworksMemory_createFromFd *ANeuralNetworksMemory_createFromFd_27;
extern MNN_ANeuralNetworksMemory_free *ANeuralNetworksMemory_free_27;
}
#endif /* NNAPISymbols_h */
@ -0,0 +1,37 @@
//
//  NNAPIBinary.cpp
//  MNN
//
//  Created by MNN on 2022/09/05.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "NNAPIBinary.hpp"

namespace MNN {

NNAPIBinary::NNAPIBinary(MNN::Backend *b, const MNN::Op *op, const std::vector<Tensor *> &inputs, const std::vector<MNN::Tensor *> &outputs) : NNAPICommonExecution(b, op) {
}

ErrorCode NNAPIBinary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    MNN_ASSERT(inputs.size() == 2 && outputs.size() == 1);
    std::map<BinaryOpOperation, int> binary_map {
        {BinaryOpOperation_ADD, ANEURALNETWORKS_ADD},
        {BinaryOpOperation_SUB, ANEURALNETWORKS_SUB},
        {BinaryOpOperation_MUL, ANEURALNETWORKS_MUL},
        {BinaryOpOperation_DIV, ANEURALNETWORKS_DIV}
    };
    auto opType = static_cast<BinaryOpOperation>(mOp->main_as_BinaryOp()->opType());
    auto iter = binary_map.find(opType);
    if (iter == binary_map.end() || iter->second < 0) {
        MNN_ERROR("[NNAPI] Binary does not support %s\n", MNN::EnumNameBinaryOpOperation(opType));
        return NOT_SUPPORT;
    }
    auto inputIdxs = getTensorIdxs(inputs);
    inputIdxs.push_back(buildScalar(ANEURALNETWORKS_FUSED_NONE));
    return buildOperation(iter->second, inputIdxs, getTensorIdxs(outputs));
}

REGISTER_NNAPI_OP_CREATOR(NNAPIBinary, OpType_BinaryOp)
} // namespace MNN
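One NNAPI detail worth noting above: ADD/SUB/MUL/DIV take a third scalar operand selecting a fused activation, which is why onResize appends a FUSED_NONE scalar after the two tensor indices. Schematically (the indices are illustrative, not real operand ids):

#include <cstdint>
#include <vector>

// Operand list for an NNAPI ADD, as NNAPIBinary::onResize assembles it:
// [input0, input1, fuse_code] -> [output]
std::vector<uint32_t> addOperandList(uint32_t in0, uint32_t in1, uint32_t fuseNoneScalar) {
    return {in0, in1, fuseNoneScalar};
}

int main() {
    auto ops = addOperandList(0, 1, 2);  // e.g. tensor operands 0 and 1, scalar operand 2
    return ops.size() == 3 ? 0 : 1;
}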
@ -0,0 +1,25 @@
//
//  NNAPIBinary.hpp
//  MNN
//
//  Created by MNN on 2022/09/05.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef MNN_NNAPIBINARY_HPP
#define MNN_NNAPIBINARY_HPP

#include "NNAPIBackend.hpp"
#include "NNAPICommonExecution.hpp"

namespace MNN {

class NNAPIBinary : public NNAPICommonExecution {
public:
    NNAPIBinary(Backend *b, const Op *op, const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
    ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
    virtual ~NNAPIBinary() = default;
};
} // namespace MNN

#endif // MNN_NNAPIBINARY_HPP
@ -0,0 +1,53 @@
//
//  NNAPICommonExecution.cpp
//  MNN
//
//  Created by MNN on 2022/09/05.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "NNAPICommonExecution.hpp"
namespace MNN {

NNAPICommonExecution::NNAPICommonExecution(Backend *backend, const Op *op) : Execution(backend), mOp(op) {
    mNNAPIBackend = (NNAPIBackend*)backend;
    mNCHW = mNNAPIBackend->NCHW();
}

ErrorCode NNAPICommonExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    return NO_ERROR;
}

ErrorCode NNAPICommonExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    return NO_ERROR;
}

std::vector<uint32_t> NNAPICommonExecution::getTensorIdxs(const std::vector<Tensor*>& tensors) {
    std::vector<uint32_t> idxs(tensors.size());
    for (int i = 0; i < tensors.size(); i++) {
        idxs[i] = mNNAPIBackend->getTensorIdx(tensors[i]);
    }
    return idxs;
}

uint32_t NNAPICommonExecution::buildConstant(const void* data, size_t size, OperandCode dtype, std::vector<uint32_t> dims) {
    return mNNAPIBackend->buildOperand(data, size, dtype, dims);
}

uint32_t NNAPICommonExecution::buildTensor(OperandCode dtype, std::vector<int> dims) {
    std::vector<uint32_t> udims(dims.begin(), dims.end());
    if (!mNCHW) {
        // NCHW -> NHWC
        udims[0] = dims[0];
        udims[1] = dims[2];
        udims[2] = dims[3];
        udims[3] = dims[1];
    }
    return mNNAPIBackend->buildOperand(nullptr, 0, dtype, udims);
}

ErrorCode NNAPICommonExecution::buildOperation(int op, const std::vector<uint32_t> &inputs, const std::vector<uint32_t> &outputs) {
    auto name = mOp->name() ? mOp->name()->c_str() : EnumNameOpType(mOp->type());
    return mNNAPIBackend->buildOperation(op, inputs, outputs, name);
}
} // namespace MNN
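buildTensor above only swaps dimension metadata: MNN stores shapes as NCHW while NNAPI's default operand layout is NHWC. A tiny standalone check of that permutation (assuming, as the code implicitly does, a 4-D shape):

#include <cassert>
#include <vector>

// Mirrors NNAPICommonExecution::buildTensor's dim shuffle: NCHW -> NHWC.
std::vector<uint32_t> toNHWC(const std::vector<int>& dims) {
    std::vector<uint32_t> udims(dims.begin(), dims.end());
    if (dims.size() == 4) {   // assumption: only 4-D shapes are permuted
        udims[0] = dims[0];   // N
        udims[1] = dims[2];   // H
        udims[2] = dims[3];   // W
        udims[3] = dims[1];   // C
    }
    return udims;
}

int main() {
    auto d = toNHWC({1, 32, 224, 224});   // N=1, C=32, H=224, W=224
    assert(d == (std::vector<uint32_t>{1, 224, 224, 32}));
    return 0;
}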
@ -0,0 +1,36 @@
//
//  NNAPICommonExecution.hpp
//  MNN
//
//  Created by MNN on 2022/09/05.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef MNN_NNAPICOMMONEXECUTION_HPP
#define MNN_NNAPICOMMONEXECUTION_HPP
#include "core/Execution.hpp"
#include "NNAPIBackend.hpp"
#include <memory>

namespace MNN {

class NNAPICommonExecution : public Execution {
public:
    NNAPICommonExecution(Backend *backend, const Op *op);
    virtual ~NNAPICommonExecution() = default;

    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
protected:
    bool mNCHW;
    std::vector<uint32_t> getTensorIdxs(const std::vector<Tensor*>& tensors);
    template <typename T> inline uint32_t buildScalar(T scalar) { return mNNAPIBackend->buildScalar(scalar); }
    uint32_t buildConstant(const void* data, size_t size, OperandCode dtype, std::vector<uint32_t> dims = {});
    uint32_t buildTensor(OperandCode dtype, std::vector<int> dims);
    ErrorCode buildOperation(int op, const std::vector<uint32_t> &inputs, const std::vector<uint32_t> &outputs);
    NNAPIBackend* mNNAPIBackend;
    const Op* mOp;
};

} // namespace MNN
#endif // MNN_NNAPICOMMONEXECUTION_HPP
@ -0,0 +1,138 @@
//
//  NNAPIConvolution.cpp
//  MNN
//
//  Created by MNN on 2022/09/06.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "NNAPIConvolution.hpp"

namespace MNN {

NNAPIConvolution::NNAPIConvolution(MNN::Backend *b, const MNN::Op *op, const std::vector<Tensor *> &inputs, const std::vector<MNN::Tensor *> &outputs) : NNAPICommonExecution(b, op) {
    isDepthwise = mOp->type() == OpType_ConvolutionDepthwise;
    isDeconv = mOp->type() == OpType_Deconvolution;
}
template<typename T>
static void NCHW2NHWC(const T* source, T* dest, int b, int c, int area) {
    int sourceBatchsize = c * area;
    int destBatchSize = sourceBatchsize;
    for (int bi = 0; bi < b; ++bi) {
        auto srcBatch = source + bi * sourceBatchsize;
        auto dstBatch = dest + bi * destBatchSize;
        for (int i = 0; i < area; ++i) {
            auto srcArea = srcBatch + i;
            auto dstArea = dstBatch + i * c;
            for (int ci = 0; ci < c; ++ci) {
                dstArea[ci] = srcArea[ci * area];
            }
        }
    }
}
ErrorCode NNAPIConvolution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto conv2D = mOp->main_as_Convolution2D();
    auto common = conv2D->common();
    int kernelX = common->kernelX();
    int kernelY = common->kernelY();
    int strideX = common->strideX();
    int strideY = common->strideY();
    int dilateX = common->dilateX();
    int dilateY = common->dilateY();
    int group = common->group();
    uint32_t outputCount = common->outputCount();
    auto padMod = common->padMode();
    bool relu = common->relu();
    bool relu6 = common->relu6();
    int top, left, bottom, right;
    if (nullptr != common->pads()) {
        MNN_ASSERT(common->pads()->size() >= 4);
        top = common->pads()->Get(0);
        left = common->pads()->Get(1);
        bottom = common->pads()->Get(2);
        right = common->pads()->Get(3);
    } else {
        top = common->padY();
        left = common->padX();
        bottom = common->padY();
        right = common->padX();
    }
    // NNAPI inputs:
    // conv2d: [input, weight, bias, pad_left, pad_right, pad_top, pad_bottom, stride_w, stride_h, fusecode, NCHW/NHWC, dilate_w, dilate_h]
    // depthwise_conv2d: [input, weight, bias, pad_left, pad_right, pad_top, pad_bottom, stride_w, stride_h, multiplier, fusecode, NCHW/NHWC, dilate_w, dilate_h]
    auto inputIdxs = getTensorIdxs(inputs);
    // when the inputs don't contain weight and bias, read them from the op parameter
    if (inputs.size() < 3) {
        const void *weightPtr, *biasPtr;
        int weightSize, biasSize;
        if (nullptr != conv2D->quanParameter()) {
            quanCommon = ConvolutionCommon::load(conv2D->quanParameter(), true);
            if (nullptr == quanCommon) {
                MNN_ERROR("Memory not enough, can't extract IDST Convolution: %s\n", mOp->name()->c_str());
            }
            if (quanCommon->weightFloat.get() == nullptr) {
                MNN_PRINT("quanCommon->weightFloat.get() == nullptr\n");
            }
            // Back to float
            weightPtr = quanCommon->weightFloat.get();
            weightSize = quanCommon->weightFloat.size();
        } else {
            weightPtr = conv2D->weight()->data();
            weightSize = conv2D->weight()->size();
        }
        biasSize = conv2D->bias()->size();
        biasPtr = conv2D->bias()->data();
        uint32_t inputCount = weightSize / (kernelX * kernelY * outputCount);
        uint32_t n = outputCount;
        uint32_t c = inputCount;
        uint32_t h = kernelY;
        uint32_t w = kernelX;
        if (isDepthwise) {
            n = 1;
            c = outputCount;
        }
        nhwcWeight.reset(new float[weightSize]);
        std::vector<uint32_t> weightDims {n, h, w, c};
        // [outputCount, inputChannel, h, w] -> [outputCount, h, w, inputChannel]
        NCHW2NHWC<float>(reinterpret_cast<const float*>(weightPtr), nhwcWeight.get(), n, c, h * w);
        std::vector<uint32_t> biasDims {outputCount};
        inputIdxs.push_back(buildConstant(nhwcWeight.get(), weightSize * sizeof(float), ANEURALNETWORKS_TENSOR_FLOAT32, weightDims));
        inputIdxs.push_back(buildConstant(biasPtr, biasSize * sizeof(float), ANEURALNETWORKS_TENSOR_FLOAT32, biasDims));
    }
    // pad
    inputIdxs.push_back(buildScalar(left));
    inputIdxs.push_back(buildScalar(right));
    inputIdxs.push_back(buildScalar(top));
    inputIdxs.push_back(buildScalar(bottom));
    // stride
    inputIdxs.push_back(buildScalar(strideX));
    inputIdxs.push_back(buildScalar(strideY));
    if (isDepthwise) {
        int multiplier = outputCount / group;
        inputIdxs.push_back(buildScalar(multiplier));
    }
    // fusecode
    FuseCode code = ANEURALNETWORKS_FUSED_NONE;
    if (relu) code = ANEURALNETWORKS_FUSED_RELU;
    if (relu6) code = ANEURALNETWORKS_FUSED_RELU6;
    inputIdxs.push_back(buildScalar(code));
    // NCHW/NHWC
    inputIdxs.push_back(buildScalar(mNCHW));
    // dilate
    if (dilateX > 1 || dilateY > 1) {
        inputIdxs.push_back(buildScalar(dilateX));
        inputIdxs.push_back(buildScalar(dilateY));
    }
    auto op = ANEURALNETWORKS_CONV_2D;
    if (mOp->type() == OpType_ConvolutionDepthwise) {
        op = ANEURALNETWORKS_DEPTHWISE_CONV_2D;
    } else {
        // TODO: deconv
    }
    return buildOperation(op, inputIdxs, getTensorIdxs(outputs));
}

REGISTER_NNAPI_OP_CREATOR(NNAPIConvolution, OpType_Convolution)
REGISTER_NNAPI_OP_CREATOR(NNAPIConvolution, OpType_ConvolutionDepthwise)
} // namespace MNN
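The weight repack above is the crux of this file: NNAPI wants conv filters as [n, h, w, c] while MNN stores them as [n, c, h, w], and NCHW2NHWC transposes each [c, area] slice to [area, c]. A standalone run of the same routine on toy data:

#include <cstdio>

template <typename T>
static void NCHW2NHWC(const T* source, T* dest, int b, int c, int area) {
    // Same arithmetic as the helper in NNAPIConvolution.cpp.
    int batchSize = c * area;
    for (int bi = 0; bi < b; ++bi) {
        const T* srcBatch = source + bi * batchSize;
        T* dstBatch = dest + bi * batchSize;
        for (int i = 0; i < area; ++i) {
            for (int ci = 0; ci < c; ++ci) {
                dstBatch[i * c + ci] = srcBatch[ci * area + i];
            }
        }
    }
}

int main() {
    int src[6] = {0, 1, 2, 3, 4, 5};  // c=2 channels, area=3: [0 1 2 | 3 4 5]
    int dst[6];
    NCHW2NHWC(src, dst, /*b=*/1, /*c=*/2, /*area=*/3);
    for (int v : dst) printf("%d ", v);  // prints: 0 3 1 4 2 5
    printf("\n");
    return 0;
}

Note also the depthwise path: the filter is reshaped to n = 1, c = outputCount, and DEPTHWISE_CONV_2D additionally takes a channel multiplier, computed as outputCount / group, so a standard depthwise conv with group == outputCount gets multiplier 1.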
@ -0,0 +1,30 @@
//
//  NNAPIConvolution.hpp
//  MNN
//
//  Created by MNN on 2022/09/06.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef MNN_NNAPICONVOLUTION_HPP
#define MNN_NNAPICONVOLUTION_HPP

#include "NNAPIBackend.hpp"
#include "NNAPICommonExecution.hpp"
#include "core/ConvolutionCommon.hpp"

namespace MNN {

class NNAPIConvolution : public NNAPICommonExecution {
public:
    NNAPIConvolution(Backend *b, const Op *op, const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
    ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
    virtual ~NNAPIConvolution() = default;
private:
    std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
    std::unique_ptr<float[]> nhwcWeight;
    bool isDepthwise = false, isDeconv = false;
};
} // namespace MNN

#endif // MNN_NNAPICONVOLUTION_HPP
@ -0,0 +1,67 @@
//
//  NNAPIPool.cpp
//  MNN
//
//  Created by MNN on 2022/09/06.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "NNAPIPool.hpp"

namespace MNN {

NNAPIPool::NNAPIPool(MNN::Backend *b, const MNN::Op *op, const std::vector<Tensor *> &inputs, const std::vector<MNN::Tensor *> &outputs) : NNAPICommonExecution(b, op) {
}

ErrorCode NNAPIPool::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    auto pool = mOp->main_as_Pool();
    auto strideX = pool->strideX();
    auto strideY = pool->strideY();
    auto kernelX = pool->kernelX();
    auto kernelY = pool->kernelY();
    auto padMod = pool->padType();
    auto global = pool->isGlobal();
    int top, left, bottom, right;
    if (nullptr != pool->pads()) {
        MNN_ASSERT(pool->pads()->size() >= 4);
        top = pool->pads()->Get(0);
        left = pool->pads()->Get(1);
        bottom = pool->pads()->Get(2);
        right = pool->pads()->Get(3);
    } else {
        top = pool->padY();
        left = pool->padX();
        bottom = pool->padY();
        right = pool->padX();
    }
    if (global) {
        kernelX = inputs[0]->width();
        kernelY = inputs[0]->height();
    }
    // NNAPI Pool inputs: [input, pad_left, pad_right, pad_top, pad_bottom, stride_w, stride_h, kernel_w, kernel_h, fusecode, NCHW/NHWC]
    auto inputIdxs = getTensorIdxs(inputs);
    // pad
    inputIdxs.push_back(buildScalar(left));
    inputIdxs.push_back(buildScalar(right));
    inputIdxs.push_back(buildScalar(top));
    inputIdxs.push_back(buildScalar(bottom));
    // stride
    inputIdxs.push_back(buildScalar(strideX));
    inputIdxs.push_back(buildScalar(strideY));
    // kernel
    inputIdxs.push_back(buildScalar(kernelX));
    inputIdxs.push_back(buildScalar(kernelY));
    // fusecode
    inputIdxs.push_back(buildScalar(ANEURALNETWORKS_FUSED_NONE));
    // NCHW/NHWC
    inputIdxs.push_back(buildScalar(mNCHW));
    auto op = ANEURALNETWORKS_MAX_POOL_2D;
    if (pool->type() == PoolType_AVEPOOL) {
        op = ANEURALNETWORKS_AVERAGE_POOL_2D;
    }
    return buildOperation(op, inputIdxs, getTensorIdxs(outputs));
}

REGISTER_NNAPI_OP_CREATOR(NNAPIPool, OpType_Pooling)
} // namespace MNN
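Since NNAPI has no global-pooling flag, isGlobal is lowered by widening the kernel to the whole input plane; a one-line check of that substitution (sizes are hypothetical):

#include <cassert>

struct PoolKernel { int kernelX, kernelY; };

// Mirrors NNAPIPool::onResize: a global pool uses the full input plane as its kernel.
PoolKernel globalKernel(int inputW, int inputH) {
    return {inputW, inputH};
}

int main() {
    PoolKernel k = globalKernel(7, 7);   // e.g. the 7x7 global-average tail of a ResNet
    assert(k.kernelX == 7 && k.kernelY == 7);
    return 0;
}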
@ -0,0 +1,26 @@
//
//  NNAPIPool.hpp
//  MNN
//
//  Created by MNN on 2022/09/06.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef MNN_NNAPIPOOL_HPP
#define MNN_NNAPIPOOL_HPP

#include "NNAPIBackend.hpp"
#include "NNAPICommonExecution.hpp"
#include "core/ConvolutionCommon.hpp"

namespace MNN {

class NNAPIPool : public NNAPICommonExecution {
public:
    NNAPIPool(Backend *b, const Op *op, const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
    ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
    virtual ~NNAPIPool() = default;
};
} // namespace MNN

#endif // MNN_NNAPIPOOL_HPP
@ -0,0 +1,38 @@
//
//  NNAPISoftmax.cpp
//  MNN
//
//  Created by MNN on 2022/09/06.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "NNAPISoftmax.hpp"

namespace MNN {

NNAPISoftmax::NNAPISoftmax(MNN::Backend *b, const MNN::Op *op, const std::vector<Tensor *> &inputs, const std::vector<MNN::Tensor *> &outputs) : NNAPICommonExecution(b, op) {
}

ErrorCode NNAPISoftmax::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    int axis = mOp->main_as_Axis()->axis();
    auto inputIdxs = getTensorIdxs(inputs);
    // NNAPI Softmax inputs: [input, beta, axis]
    float beta = 1.0f;
    inputIdxs.push_back(buildScalar(beta));
    bool needAxis = false;
    auto dims = inputs[0]->shape();
    for (int i = 0; i < dims.size(); i++) {
        if (i != axis && dims[i] > 1) {
            needAxis = true;
            break;
        }
    }
    if (needAxis) {
        inputIdxs.push_back(buildScalar(axis));
    }
    return buildOperation(ANEURALNETWORKS_SOFTMAX, inputIdxs, getTensorIdxs(outputs));
}

REGISTER_NNAPI_OP_CREATOR(NNAPISoftmax, OpType_Softmax)
} // namespace MNN
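The loop above only passes the optional axis operand when some dimension other than the softmax axis is larger than one, since NNAPI's SOFTMAX defaults to the last axis; a standalone rendering of that test:

#include <cassert>
#include <vector>

// Mirrors NNAPISoftmax::onResize: the axis scalar is only appended when the
// input is not effectively 1-D along every non-softmax dimension.
bool needAxisOperand(const std::vector<int>& dims, int axis) {
    for (int i = 0; i < (int)dims.size(); i++) {
        if (i != axis && dims[i] > 1) {
            return true;
        }
    }
    return false;
}

int main() {
    assert(!needAxisOperand({1, 1000}, 1));        // plain classifier head: default axis is fine
    assert(needAxisOperand({1, 21, 513, 513}, 1)); // segmentation logits: axis must be sent
    return 0;
}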
@ -0,0 +1,26 @@
//
//  NNAPISoftmax.hpp
//  MNN
//
//  Created by MNN on 2022/09/06.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef MNN_NNAPISOFTMAX_HPP
#define MNN_NNAPISOFTMAX_HPP

#include "NNAPIBackend.hpp"
#include "NNAPICommonExecution.hpp"
#include "core/ConvolutionCommon.hpp"

namespace MNN {

class NNAPISoftmax : public NNAPICommonExecution {
public:
    NNAPISoftmax(Backend *b, const Op *op, const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
    ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
    virtual ~NNAPISoftmax() = default;
};
} // namespace MNN

#endif // MNN_NNAPISOFTMAX_HPP
@ -0,0 +1,65 @@
//
//  NNAPIUnary.cpp
//  MNN
//
//  Created by MNN on 2022/09/05.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "NNAPIUnary.hpp"

namespace MNN {

NNAPIUnary::NNAPIUnary(MNN::Backend *b, const MNN::Op *op, const std::vector<Tensor *> &inputs, const std::vector<MNN::Tensor *> &outputs) : NNAPICommonExecution(b, op) {
}

ErrorCode NNAPIUnary::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
    MNN_ASSERT(inputs.size() == 1 && outputs.size() == 1);
    std::map<UnaryOpOperation, int> unary_map {
        {UnaryOpOperation_ABS, ANEURALNETWORKS_ABS},
        {UnaryOpOperation_EXP, ANEURALNETWORKS_EXP},
        {UnaryOpOperation_SQRT, ANEURALNETWORKS_SQRT},
        {UnaryOpOperation_RSQRT, ANEURALNETWORKS_RSQRT},
        {UnaryOpOperation_LOG, ANEURALNETWORKS_LOG},
        {UnaryOpOperation_RECIPROCAL, -1},
        {UnaryOpOperation_SIN, ANEURALNETWORKS_SIN},
        {UnaryOpOperation_ASIN, -1},
        {UnaryOpOperation_SINH, -1},
        {UnaryOpOperation_ASINH, -1},
        {UnaryOpOperation_COS, -1},
        {UnaryOpOperation_ACOS, -1},
        {UnaryOpOperation_COSH, -1},
        {UnaryOpOperation_ACOSH, -1},
        {UnaryOpOperation_TAN, -1},
        {UnaryOpOperation_ATAN, -1},
        {UnaryOpOperation_TANH, ANEURALNETWORKS_TANH},
        {UnaryOpOperation_ATANH, -1},
        {UnaryOpOperation_ERF, -1},
        {UnaryOpOperation_CEIL, -1},
        {UnaryOpOperation_FLOOR, ANEURALNETWORKS_FLOOR},
        {UnaryOpOperation_ROUND, -1},
        {UnaryOpOperation_SIGN, -1},
        {UnaryOpOperation_SIGMOID, -1},
        {UnaryOpOperation_LOG1P, -1},
        {UnaryOpOperation_SQUARE, -1},
        {UnaryOpOperation_NEG, ANEURALNETWORKS_NEG},
        {UnaryOpOperation_HARDSWISH, -1},
        {UnaryOpOperation_GELU, -1},
        {UnaryOpOperation_GELU_STANDARD, -1},
        {UnaryOpOperation_EXPM1, -1},
        {UnaryOpOperation_ERFC, -1},
        {UnaryOpOperation_BNLL, -1},
        {UnaryOpOperation_ERFINV, -1}
    };
    auto opType = mOp->main_as_UnaryOp()->opType();
    auto iter = unary_map.find(opType);
    if (iter == unary_map.end() || iter->second < 0) {
        MNN_ERROR("NNAPI Unary does not support %s\n", MNN::EnumNameUnaryOpOperation(opType));
        return NOT_SUPPORT;
    }
    return buildOperation(iter->second, getTensorIdxs(inputs), getTensorIdxs(outputs));
}

REGISTER_NNAPI_OP_CREATOR(NNAPIUnary, OpType_UnaryOp)
} // namespace MNN
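The -1 entries keep unsupported ops in the table so the backend reports them explicitly instead of silently falling through; a compact rendering of the same lookup (the numeric code for ABS is a stand-in, not the real ANEURALNETWORKS_ABS value):

#include <cstdio>
#include <map>
#include <string>

// Same shape as unary_map: entries with -1 are ops MNN models can contain
// but NNAPI has no operation code for.
static int lookupUnary(const std::string& op) {
    static const std::map<std::string, int> table {
        {"ABS", 100},   // stand-in for the real ANEURALNETWORKS_ABS value
        {"ERF", -1},    // no NNAPI counterpart
    };
    auto it = table.find(op);
    if (it == table.end() || it->second < 0) {
        printf("NNAPI Unary does not support %s\n", op.c_str());
        return -1;      // the real code returns NOT_SUPPORT here
    }
    return it->second;
}

int main() {
    lookupUnary("ABS");   // resolves to an operation code
    lookupUnary("ERF");   // prints the not-supported message
    return 0;
}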
@ -0,0 +1,25 @@
//
//  NNAPIUnary.hpp
//  MNN
//
//  Created by MNN on 2022/09/05.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef MNN_NNAPIUNARY_HPP
#define MNN_NNAPIUNARY_HPP

#include "NNAPIBackend.hpp"
#include "NNAPICommonExecution.hpp"

namespace MNN {

class NNAPIUnary : public NNAPICommonExecution {
public:
    NNAPIUnary(Backend *b, const Op *op, const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
    ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
    virtual ~NNAPIUnary() = default;
};
} // namespace MNN

#endif // MNN_NNAPIUNARY_HPP
@ -43,6 +43,10 @@ ErrorCode BinaryBufExecution::onResize(const std::vector<Tensor *> &inputs, cons
    int shape[4] = {outputShape[0], outputShape[1], outputShape[2], UP_DIV(outputShape[3], 4)};
    int fullCount[2] = {1, 1};

    int activationType = 0;
    if(mOp->type() == OpType_BinaryOp) {
        activationType = mOp->main_as_BinaryOp()->activationType();
    }
    auto &unit = mUnits[0];
    unit.kernel = runTime->buildKernel("binary_buf", "binary_buf", mBuildOptions);
    mMaxWorkGroupSize = static_cast<uint32_t>(runTime->getMaxWorkGroupSize(unit.kernel));
@ -60,6 +64,7 @@ ErrorCode BinaryBufExecution::onResize(const std::vector<Tensor *> &inputs, cons
    unit.kernel.setArg(index++, openCLBuffer(output));
    unit.kernel.setArg(index++, shape);
    unit.kernel.setArg(index++, fullCount);
    unit.kernel.setArg(index++, activationType);

    std::string name = "binary_buf";
    mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), name, unit.kernel).first;
@ -80,6 +85,7 @@ ErrorCode BinaryBufExecution::onResize(const std::vector<Tensor *> &inputs, cons
    unit.kernel.setArg(index++, openCLBuffer(output));
    unit.kernel.setArg(index++, shape);
    unit.kernel.setArg(index++, fullCount);
    unit.kernel.setArg(index++, activationType);

    unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]};
    unit.localWorkSize  = {mLocalWorkSize[0], mLocalWorkSize[1]};
@ -7,7 +7,8 @@ __kernel void binary(__private int global_dim0, __private int global_dim1,
                     __read_only image2d_t input0, __read_only image2d_t input1,
                     __write_only image2d_t output,
                     __private const int4 shape,//[N,H,W,C4]
                     __private const int2 isFull) {
                     __private const int2 isFull,
                     __private const int activationType) {
    int2 pos = (int2)(get_global_id(0), get_global_id(1));//WC4, NH

    FLOAT4 in0, in1;
@ -27,6 +28,9 @@ __kernel void binary(__private int global_dim0, __private int global_dim1,
        }

        FLOAT4 out = CONVERT_FLOAT4(OPERATOR);
        if(activationType == 1) {
            out = fmax(out, (FLOAT4)0);
        }
        WI_F(output, pos, out);
    }
}
@ -5,7 +5,8 @@
__kernel void binary_buf(__private int global_dim0, __private int global_dim1,
                         __global FLOAT* input0, __global FLOAT* input1, __global FLOAT* output,
                         __private const int4 shape,//[N,H,W,C4]
                         __private const int2 isFull) {
                         __private const int2 isFull,
                         __private const int activationType) {
    int2 pos = (int2)(get_global_id(0), get_global_id(1));//NC4, HW

    if (pos.x < global_dim0 && pos.y < global_dim1) {
@ -19,6 +20,9 @@ __kernel void binary_buf(__private int global_dim0, __private int global_dim1,
            in1 = (FLOAT4)(in1.x, in1.x, in1.x, in1.x);
        }
        FLOAT4 out = CONVERT_FLOAT4(OPERATOR);
        if(activationType == 1) {
            out = fmax(out, (FLOAT4)0);
        }
        vstore4(out, offset, output);
    }
}
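Both kernels now take an activationType argument and, when it is 1, clamp the result lane-wise with fmax, i.e. a fused ReLU matching the activationType the host side reads from the BinaryOp parameter. The same clamp in scalar C++ for reference:

#include <algorithm>
#include <cassert>

// activationType == 1 in the kernels means fused ReLU: out = max(out, 0).
float applyActivation(float v, int activationType) {
    return activationType == 1 ? std::max(v, 0.0f) : v;
}

int main() {
    assert(applyActivation(-2.5f, 1) == 0.0f);   // ReLU clamps negatives
    assert(applyActivation(-2.5f, 0) == -2.5f);  // no activation: pass through
    return 0;
}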
File diff suppressed because one or more lines are too long
@ -27,9 +27,8 @@ static string swapComputeIn0In1(const string& computeOrigin) {
    return compute;
}

EltwiseExecution::EltwiseExecution(const std::vector<Tensor *> &inputs, const std::string &compute, const MNN::Op *op, Backend *backend,
                                   float operatorData, bool broadCast)
    : CommonExecution(backend), mCompute(compute), mBroadCast(broadCast), mOperatorData(operatorData) {
EltwiseExecution::EltwiseExecution(const std::vector<Tensor *> &inputs, const std::string &compute, const MNN::Op *op, Backend *backend)
    : CommonExecution(backend), mCompute(compute) {
    mBuildOptions.emplace("-DOPERATOR=" + compute);
    mOp = op;
@ -56,6 +55,10 @@ ErrorCode EltwiseExecution::onResize(const std::vector<Tensor *> &inputs, const
    auto runTime = ((OpenCLBackend *)backend())->getOpenCLRuntime();
    int shape[4] = {outputShape[0], outputShape[1], outputShape[2], UP_DIV(outputShape[3], 4)};
    int fullCount[2] = {1, 1};
    int activationType = 0;
    if(mOp->type() == OpType_BinaryOp) {
        activationType = mOp->main_as_BinaryOp()->activationType();
    }

    auto &unit = mUnits[0];
    unit.kernel = runTime->buildKernel("binary", "binary", mBuildOptions);
@ -76,6 +79,7 @@ ErrorCode EltwiseExecution::onResize(const std::vector<Tensor *> &inputs, const
    unit.kernel.setArg(index++, openCLImage(output));
    unit.kernel.setArg(index++, shape);
    unit.kernel.setArg(index++, fullCount);
    unit.kernel.setArg(index++, activationType);

    std::string name = "binary";
    mLocalWorkSize = localWS2DDefault(mGlobalWorkSize, mMaxWorkGroupSize, openCLBackend->getOpenCLRuntime(), name, unit.kernel).first;
@ -127,6 +131,7 @@ ErrorCode EltwiseExecution::onResize(const std::vector<Tensor *> &inputs, const
    unit.kernel.setArg(index++, openCLImage(output));
    unit.kernel.setArg(index++, shape);
    unit.kernel.setArg(index++, fullCount);
    unit.kernel.setArg(index++, activationType);

    if(i == 0) {
        std::string name = "binary";
@ -16,7 +16,7 @@ namespace OpenCL {

class EltwiseExecution : public CommonExecution {
public:
    EltwiseExecution(const std::vector<Tensor *> &inputs, const std::string &compute, const MNN::Op *op, Backend *backend, float operatorData = 0.0001f, bool broadCast = false);
    EltwiseExecution(const std::vector<Tensor *> &inputs, const std::string &compute, const MNN::Op *op, Backend *backend);
    virtual ~EltwiseExecution() = default;

    uint32_t realSize(const Tensor* tensor);
Some files were not shown because too many files have changed in this diff