[Sync] Sync internal Gitlab

xiaying 2022-02-18 11:30:27 +08:00
parent c4d9566171
commit 0c718e552b
193 changed files with 9361 additions and 2733 deletions

View File

@ -24,9 +24,14 @@ add_definitions("-DMNN_VERSION_MAJOR=${MNN_VERSION_MAJOR}")
add_definitions("-DMNN_VERSION_MINOR=${MNN_VERSION_MINOR}")
add_definitions("-DMNN_VERSION_PATCH=${MNN_VERSION_PATCH}")
# CMP0048 is related to letting CMake manage the package version for us
cmake_policy(SET CMP0048 NEW)
# Clear VERSION variables when no VERSION is given to project()
if(POLICY CMP0048)
cmake_policy(SET CMP0048 NEW)
endif()
# MSVC runtime library flags are selected by an abstraction.
if(POLICY CMP0091)
cmake_policy(SET CMP0091 NEW)
endif()
project(MNN VERSION ${MNN_VERSION_MAJOR}.${MNN_VERSION_MINOR}.${MNN_VERSION_PATCH}.${MNN_VERSION_BUILD} LANGUAGES C CXX ASM)
# compiler options
set(CMAKE_C_STANDARD 99)
@ -35,14 +40,6 @@ set(CMAKE_MODULE_PATH
${CMAKE_MODULE_PATH}
"${CMAKE_CURRENT_LIST_DIR}/cmake"
)
#add_custom_command(OUTPUT "${CMAKE_CURRENT_LIST_DIR}/include/MNN/VCS.h"
# COMMAND ${CMAKE_COMMAND} "-DNAMES=MNN"
# "-DMNN_SOURCE_DIR=${CMAKE_CURRENT_LIST_DIR}"
# "-DHEADER_FILE=${CMAKE_CURRENT_LIST_DIR}/include/MNN/VCS.h"
# -P "${CMAKE_CURRENT_LIST_DIR}/cmake/GenerateVersionFromVCS.cmake"
# COMMENT "Generating Version Control Info"
#)
#add_custom_target (GenVCSHDR DEPENDS "${CMAKE_CURRENT_LIST_DIR}/include/MNN/VCS.h")
# Required for OpenCL/OpenGL/Vulkan CodeGen
include(FindPythonInterp REQUIRED)
# build options
@ -107,8 +104,8 @@ IF(WIN32)
SET(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4819")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4819")
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4101")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4101")
ENDIF()
ENDIF()
@ -118,13 +115,54 @@ IF( MNN_ENABLE_COVERAGE)
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fprofile-arcs -ftest-coverage")
ENDIF()
# Do this before protobuf to make sure the Windows CRT config of protobuf and MNN is the same
if(MSVC)
# Same as protobuf; otherwise the config would be inconsistent
if(CMAKE_VERSION VERSION_GREATER 3.15 OR CMAKE_VERSION VERSION_EQUAL 3.15)
set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded$<$<CONFIG:Debug>:Debug>)
if(NOT MNN_WIN_RUNTIME_MT)
set(CMAKE_MSVC_RUNTIME_LIBRARY ${CMAKE_MSVC_RUNTIME_LIBRARY}DLL)
endif()
else()
foreach(flag_var
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
if (MNN_WIN_RUNTIME_MT)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif()
else ()
if(${flag_var} MATCHES "/MT")
string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
endif()
endif ()
endforeach()
endif()
set(protobuf_BUILD_SHARED_LIBS ${MNN_BUILD_SHARED_LIBS})
endif()
include(${CMAKE_CURRENT_LIST_DIR}/cmake/macros.cmake)
IF(MNN_BUILD_PROTOBUFFER)
IF(MNN_BUILD_CONVERTER)
IF(MSVC)
set(protobuf_BUILD_SHARED_LIBS ${MNN_BUILD_SHARED_LIBS})
IF((NOT MNN_BUILD_SHARED_LIBS) AND (NOT MNN_WIN_RUNTIME_MT))
message(FATAL_ERROR "When MNN_BUILD_CONVERTER=ON and MNN_BUILD_SHARED_LIBS=OFF, MNN_WIN_RUNTIME_MT must be ON, because protobuf does not support the static /MD configuration")
ENDIF()
ENDIF()
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/3rd_party/protobuf/cmake)
ENDIF()
ENDIF()
# Specify the source file encoding explicitly to fix cross-platform garbled output
# We need to do this after protobuf, which sets a different execution charset
IF(MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /source-charset:utf-8")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /source-charset:utf-8")
ENDIF()
IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT MNN_BUILD_SHARED_LIBS AND NOT (MSVC OR WIN32))
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
SET(MNN_SEP_BUILD OFF CACHE BOOL "<docstring>" FORCE)
@ -206,26 +244,7 @@ message(STATUS "\tThreadPool: ${MNN_USE_THREAD_POOL}")
message(STATUS "\tHidden: ${MNN_HIDDEN}")
message(STATUS "\tBuild Path: ${CMAKE_CURRENT_BINARY_DIR}")
if(MSVC)
if(${CMAKE_VERSION} VERSION_LESS "3.14.0")
message(FATAL_ERROR "MNN requires CMake 3.14+ to build on Windows!")
endif()
foreach(flag_var
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
if (MNN_WIN_RUNTIME_MT)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif()
else ()
if(${flag_var} MATCHES "/MT")
string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
endif()
endif ()
endforeach()
elseif(CMAKE_SYSTEM_NAME MATCHES "^Android" OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
if(CMAKE_SYSTEM_NAME MATCHES "^Android" OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
add_definitions(-fPIC)
endif()
if(CMAKE_SYSTEM_NAME MATCHES "^Android")
@ -561,6 +580,9 @@ if (MNN_INTERNAL)
target_compile_options(MNN_Express PRIVATE -DMNN_INTERNAL_ENABLED)
include(${CMAKE_CURRENT_LIST_DIR}/source/internal/auth/CMakeLists.txt)
include(${CMAKE_CURRENT_LIST_DIR}/source/internal/logging/CMakeLists.txt)
if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
list(APPEND MNN_EXTRA_DEPENDS "-lcurl -lssl -lcrypto")
endif()
endif()
# Train
@ -661,7 +683,18 @@ if(APPLE)
endif()
add_dependencies(MNN MNNCore MNNCV MNNTransform MNNMath MNNCPU)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tools/converter)
IF(WIN32 AND MNN_BUILD_CONVERTER AND MNN_BUILD_SHARED_LIBS)
# Because of dllimport/dllexport, we merge MNN and MNNConvertDeps together; they depend on protobuf
target_link_libraries(MNN PUBLIC ${Protobuf_LIBRARIES})
ENDIF()
# Merge MNN/MNNExpress/MNNOpenCV and other backends into one .lib/.dll on Windows
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tools/cv)
IF(MNN_BUILD_OPENCV AND NOT MNN_SEP_BUILD)
IF(MSVC)
target_compile_definitions(MNNOpenCV PRIVATE "-DBUILDING_MNN_DLL" INTERFACE "-DUSING_MNN_DLL")
ENDIF()
target_sources(MNN PRIVATE $<TARGET_OBJECTS:MNNOpenCV>)
ENDIF()
if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
# Using -pthread, needed by the thread-safe implementation of glibc, is better than only using -lpthread
@ -753,6 +786,10 @@ ELSE()
ARCHIVE DESTINATION lib
FRAMEWORK DESTINATION /Library/Frameworks/
)
if (NOT MNN_AAPL_FMWK)
INSTALL(FILES ${MNN_PUB_HDRS} DESTINATION include/MNN/)
INSTALL(FILES ${MNN_EXPR_PUB_HDRS} DESTINATION include/MNN/expr/)
endif()
FOREACH(HDR ${MNN_EXPR_PUB_HDRS})
SET_SOURCE_FILES_PROPERTIES(${HDR} PROPERTIES MACOSX_PACKAGE_LOCATION Headers/expr/ )
ENDFOREACH()

View File

@ -59,7 +59,17 @@ Interpreter consists of Engine and Backends. The former is responsible for the l
Scan the following QR codes to join the DingTalk discussion groups. The group discussions are predominantly in Chinese, but English speakers are welcome and will be helped.
See https://www.yuque.com/mnn/cn/feedback for the DingTalk group QR codes.
Group #1 (Full):
<img src="doc/DingTalkQR1.png" height="256"/>
Group #2 (Full):
<img src="doc/DingTalkQR2.png" height="256"/>
Group #3:
<img src="doc/DingTalkQR3.png" height="256"/>
## License
Apache 2.0

View File

@ -56,7 +56,19 @@ Converter由Frontends和Graph Optimize构成。前者负责支持不同的训练
Interpreter consists of Engine and Backends. The former is responsible for model loading and computation-graph scheduling; the latter contains the memory allocation and Op implementations for each computing device. In Engine and Backends, MNN applies a variety of optimizations, including the Winograd algorithm for convolution and deconvolution, the Strassen algorithm for matrix multiplication, low-precision computation, Neon optimization, hand-written assembly, multi-threading, memory reuse, and heterogeneous computing.
## Community and Feedback
Scan the QR code to join the DingTalk discussion group: https://www.yuque.com/mnn/cn/feedback
Scan the QR codes to join the DingTalk discussion groups.
Group #1 (Full):
<img src="doc/DingTalkQR1.png" height="256"/>
Group #2 (Full):
<img src="doc/DingTalkQR2.png" height="256"/>
Group #3:
<img src="doc/DingTalkQR3.png" height="256"/>
## License
Apache 2.0

View File

@ -18,6 +18,10 @@ IF(MNN_SEP_BUILD)
add_library(MNN_Express SHARED ${MNN_EXPR_SRCS})
endif()
target_link_libraries(MNN_Express MNN)
install(TARGETS MNN_Express
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib
)
ELSE()
add_library(MNN_Express OBJECT ${MNN_EXPR_SRCS})
ENDIF()

View File

@ -536,6 +536,7 @@ ErrorCode Executor::ComputeCache::compute() {
if (mShapeDirty) {
auto code = resize();
if (NO_ERROR != code) {
mShapeDirty = true;
return code;
}
}

View File

@ -116,6 +116,9 @@ Variable::Info* Expr::outputInfo(int index) const {
void Expr::_addLinkForInputs(EXPRP expr) {
auto inputs = expr->inputs();
for (int i=0; i<inputs.size(); ++i) {
if (inputs[i].get() == nullptr) {
continue;
}
bool findEmpty = false;
auto inputExpr = inputs[i]->mFrom;
for (int j=0; j<inputExpr->mTo.size(); ++j) {
@ -290,6 +293,10 @@ bool Expr::requireInfo() {
}
for (int i = 0; i < mInputs.size(); ++i) {
auto& v = mInputs[i];
if (v->getInfo()->size == 0) {
// zero shape
continue;
}
if (mInside->mReq.shapeNeedContent[i]) {
// For ops whose shape computation needs the input content, that content must not be nullptr
auto ptr = v->readInternal(true);
@ -338,6 +345,9 @@ void Expr::replace(EXPRP old, EXPRP from) {
return;
}
for (auto input : old->inputs()) {
if (input.get() == nullptr) {
continue;
}
for (int j=0; j<input->mFrom->mTo.size(); ++j) {
auto ref = input->mFrom->mTo[j].lock();
if (ref.get() == old.get()) {
@ -346,6 +356,9 @@ void Expr::replace(EXPRP old, EXPRP from) {
}
}
for (auto input : from->inputs()) {
if (input.get() == nullptr) {
continue;
}
bool hasSet = false;
for (int j=0; j<input->mFrom->mTo.size(); ++j) {
auto ref = input->mFrom->mTo[j].lock();
@ -567,6 +580,9 @@ void Expr::visit(EXPRP expr, const std::function<bool(EXPRP)>& before, const std
return;
}
for (int i = 0; i < expr->inputs().size(); ++i) {
if (expr->inputs()[i].get() == nullptr) {
continue;
}
visit(expr->inputs()[i]->mFrom, before, after);
}
after(expr);
@ -721,6 +737,9 @@ void Expr::visitOutputs(const std::function<bool(EXPRP, int)>& visit) {
bool recurse = false;
auto inputs = expr->inputs();
for (int i=0; i<inputs.size(); ++i) {
if (inputs[i].get() == nullptr) {
continue;
}
if (inputs[i]->mFrom.get() == this) {
recurse = recurse || visit(expr, i);
}
@ -924,6 +943,10 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
op->name = expr->name();
op->inputIndexes.resize(expr->inputs().size());
for (int i = 0; i < op->inputIndexes.size(); ++i) {
if (expr->inputs()[i] == nullptr) {
op->inputIndexes[i] = -1;
continue;
}
auto inputExpr = expr->inputs()[i]->expr();
op->inputIndexes[i] = varIndexInfo[inputExpr.first] + inputExpr.second;
}

View File

@ -1119,6 +1119,14 @@ VARP _ScatterNd(VARP indices, VARP updates, VARP shape) {
return (Variable::create(Expr::create(std::move(op), {indices, updates, shape})));
}
VARP _ScatterNd(VARP indices, VARP updates, VARP shape, VARP input) {
std::unique_ptr<OpT> op(new OpT);
op->main.type = OpParameter_NONE;
op->type = OpType_ScatterNd;
op->main.value = nullptr;
return (Variable::create(Expr::create(std::move(op), {indices, updates, shape, input})));
}
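A minimal usage sketch of the new four-argument overload, assuming the extra input supplies the base tensor that updates are scattered onto (the three-argument form presumably scatters onto zeros); the concrete shapes and values below are illustrative only:
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

static VARP scatterOntoBaseExample() {
    // Rows 1 and 3 of the output are overwritten; indices shape is [2, 1]
    int idx[] = {1, 3};
    auto indices = _Const(idx, {2, 1}, NHWC, halide_type_of<int>());
    // Values written at those rows, shape [2]
    float upd[] = {9.f, 10.f};
    auto updates = _Const(upd, {2}, NHWC, halide_type_of<float>());
    // Output shape: a vector of length 6
    int dims[] = {6};
    auto shape = _Const(dims, {1}, NHWC, halide_type_of<int>());
    // Base tensor filled with ones; this is what the new overload scatters into
    auto base = _Fill(shape, _Scalar<float>(1.0f));
    auto out = _ScatterNd(indices, updates, shape, base);
    // Expected under the assumed semantics: 1 9 1 10 1 1
    return out;
}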
VARP _OneHot(VARP indices, VARP depth, VARP onValue, VARP offValue, int axis) {
std::unique_ptr<OpT> op(new OpT);
op->type = OpType_OneHot;

View File

@ -581,6 +581,22 @@ VARP _StridedSlice(VARP input, VARP begin, VARP end, VARP strided, int32_t begin
op->main.AsStridedSliceParam()->shrinkAxisMask = shrinkAxisMask;
return (Variable::create(Expr::create(op.get(), {input, begin, end, strided})));
}
VARP _StridedSliceWrite(VARP input, VARP begin, VARP end, VARP strided, VARP write, int32_t beginMask,
int32_t endMask, int32_t ellipsisMask, int32_t newAxisMask, int32_t shrinkAxisMask) {
std::unique_ptr<OpT> op(new OpT);
op->type = OpType_StridedSlice;
op->main.type = OpParameter_StridedSliceParam;
op->main.value = new StridedSliceParamT;
op->main.AsStridedSliceParam()->T = DataType_DT_FLOAT;
op->main.AsStridedSliceParam()->beginMask = beginMask;
op->main.AsStridedSliceParam()->endMask = endMask;
op->main.AsStridedSliceParam()->ellipsisMask = ellipsisMask;
op->main.AsStridedSliceParam()->newAxisMask = newAxisMask;
op->main.AsStridedSliceParam()->shrinkAxisMask = shrinkAxisMask;
return (Variable::create(Expr::create(op.get(), {input, begin, end, strided, write})));
}
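The exact semantics are not spelled out in this hunk, but the op looks analogous to a strided-slice assignment: the write operand replaces the slice of input selected by begin/end/strided. A hedged sketch with all masks left at zero (values illustrative, result shown only under that assumption):
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

static VARP stridedSliceWriteExample() {
    float in[] = {0.f, 1.f, 2.f, 3.f, 4.f};
    int   b[]  = {1};
    int   e[]  = {4};
    int   s[]  = {1};
    float w[]  = {9.f, 9.f, 9.f};
    auto input   = _Const(in, {5}, NHWC, halide_type_of<float>());
    auto begin   = _Const(b,  {1}, NHWC, halide_type_of<int>());
    auto end     = _Const(e,  {1}, NHWC, halide_type_of<int>());
    auto strided = _Const(s,  {1}, NHWC, halide_type_of<int>());
    auto write   = _Const(w,  {3}, NHWC, halide_type_of<float>());
    // Assumed result: {0, 9, 9, 9, 4} -- input with elements [1, 4) replaced by write
    return _StridedSliceWrite(input, begin, end, strided, write, 0, 0, 0, 0, 0);
}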
/*Transposes x.
Args:
x: A variable.
@ -1830,5 +1846,57 @@ VARP _Where(VARP x) {
return (Variable::create(Expr::create(std::move(op), {x})));
}
VARP _Sort(VARP x, int axis, bool arg, bool descend) {
std::unique_ptr<OpT> op(new OpT);
op->type = OpType_TopKV2;
op->main.type = OpParameter_TopKV2;
auto topk = new TopKV2T;
topk->largest = descend;
op->main.value = topk;
auto shape = x->getInfo()->dim;
axis = axis < 0 ? shape.size() + axis : axis;
int k = x->getInfo()->dim[axis];
std::vector<VARP> inputs {x, _Scalar(k)};
if (axis + 1 != shape.size()) {
inputs.push_back(_Scalar(axis));
}
auto expr = Expr::create(op.get(), inputs, 2);
return Variable::create(expr, arg);
}
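As the implementation above shows, _Sort is built on TopKV2 with k set to the full length of the sorted axis; descend maps to the largest flag, and arg selects output 1 of TopKV2 (the indices) instead of output 0 (the values). A small sketch, with values illustrative:
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

static void sortExample() {
    float data[] = {3.f, 1.f, 2.f};
    auto x = _Const(data, {3}, NHWC, halide_type_of<float>());
    auto valuesDesc = _Sort(x, -1, false, true);   // expected values: 3 2 1
    auto argAsc     = _Sort(x, -1, true,  false);  // expected indices: 1 2 0
    auto v = valuesDesc->readMap<float>();
    auto i = argAsc->readMap<int>();
    (void)v; (void)i;
}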
VARP _Raster(const std::vector<VARP>& vars, const std::vector<int>& region, const std::vector<int>& shape) {
std::unique_ptr<MNN::OpT> op(new MNN::OpT);
op->type = OpType_Raster;
auto extra = new ExtraT;
// set shape
std::unique_ptr<AttributeT> shapeAttr(new AttributeT);
shapeAttr->key = "shape";
shapeAttr->list.reset(new ListValueT);
shapeAttr->list->i = shape;
extra->attr.push_back(std::move(shapeAttr));
// set region
std::unique_ptr<AttributeT> regionAttr(new AttributeT);
regionAttr->key = "region";
regionAttr->list.reset(new ListValueT);
regionAttr->list->i = region;
extra->attr.push_back(std::move(regionAttr));
op->main.type = OpParameter_Extra;
op->main.value = extra;
return (Variable::create(Expr::create(std::move(op), vars)));
}
VARP _Nms(VARP boxes, VARP scores, int maxDetections, float iouThreshold, float scoreThreshold) {
std::unique_ptr<MNN::OpT> op(new MNN::OpT);
op->type = OpType_NonMaxSuppressionV2;
std::vector<VARP> vars {boxes, scores, _Scalar(maxDetections)};
if (iouThreshold >= 0) {
vars.push_back(_Scalar(iouThreshold));
}
if (scoreThreshold >= 0) {
vars.push_back(_Scalar(scoreThreshold));
}
return (Variable::create(Expr::create(std::move(op), vars)));
}
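_Nms is a thin wrapper over NonMaxSuppressionV2; negative thresholds mean "not set" and are simply not passed to the op. A sketch, assuming the usual (y1, x1, y2, x2) box layout and that the result holds the indices of the kept boxes:
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

static void nmsExample() {
    // Three boxes; box 1 heavily overlaps box 0, box 2 is disjoint
    float boxData[]   = {0, 0, 1, 1,   0, 0, 0.9f, 0.9f,   2, 2, 3, 3};
    float scoreData[] = {0.9f, 0.8f, 0.7f};
    auto boxes  = _Const(boxData,   {3, 4}, NHWC, halide_type_of<float>());
    auto scores = _Const(scoreData, {3},    NHWC, halide_type_of<float>());
    // Keep at most 3 boxes, suppress overlaps above IoU 0.5; scoreThreshold left at its -1 default
    auto kept = _Nms(boxes, scores, 3, 0.5f);
    auto idx  = kept->readMap<int>();  // expected under the assumed semantics: 0, 2
    (void)idx;
}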
} // namespace Express
} // namespace MNN

View File

@ -166,7 +166,8 @@ public:
return mModule->onForward(inputs);
}
virtual Module* clone(CloneContext* ctx) const override {
NetModule* module(new NetModule(mModule, mInfo));
std::shared_ptr<Module> submodule(mModule->clone(ctx));
NetModule* module(new NetModule(submodule, mInfo));
return this->cloneBaseTo(ctx, module);
}
const Module::Info* info() const {
@ -223,9 +224,9 @@ static void _loadInputs(Module::Info* info, const std::vector<std::string>& inpu
}
}
Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Module::Config* config) {
Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const std::shared_ptr<MNN::Express::Executor::RuntimeManager> _rtMgr, const Module::Config* config) {
// Check if runtime is valid
if (nullptr != rtMgr && rtMgr->getRuntimeInfo().first.empty()) {
if (nullptr != _rtMgr && _rtMgr->getRuntimeInfo().first.empty()) {
MNN_ERROR("Invalid runtime\n");
return nullptr;
}
@ -269,6 +270,17 @@ Module* Module::load(const std::vector<std::string>& inputs, const std::vector<s
#endif // MNN_INTERNAL_ENABLED
std::shared_ptr<Info> info(new Info);
auto rtMgr = _rtMgr;
Module::Config defaultConfig;
if (nullptr == config) {
config = &defaultConfig;
}
if(nullptr == rtMgr && config->backend != nullptr) {
ScheduleConfig sche_config;
sche_config.type = config->backend->type;
sche_config.backendConfig = config->backend->config;
rtMgr.reset(Executor::RuntimeManager::createRuntimeManager(sche_config));
}
if ((!inputs.empty()) && (!outputs.empty())) {
_loadInputs(info.get(), inputs, net);
info->runTimeManager = rtMgr;
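With this change, a caller that passes no RuntimeManager but fills in config->backend now gets a RuntimeManager created implicitly from that backend description. A hedged sketch (the BackendInfo struct name, its fields, and the tensor names are assumptions inferred from the config->backend accesses above; the model path is hypothetical):
#include <MNN/MNNForwardType.h>
#include <MNN/expr/Module.hpp>
#include <memory>
using namespace MNN::Express;

std::shared_ptr<Module> loadOnCpu() {
    MNN::BackendConfig bnConfig;
    bnConfig.precision = MNN::BackendConfig::Precision_Low;
    Module::BackendInfo backendInfo;   // assumed nested struct, matching config->backend->type / ->config
    backendInfo.type   = MNN_FORWARD_CPU;
    backendInfo.config = &bnConfig;
    Module::Config config;
    config.backend = &backendInfo;     // no RuntimeManager passed, so one is built from this
    return std::shared_ptr<Module>(
        Module::load({"input"}, {"output"}, "model.mnn", &config));
}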

View File

@ -16,7 +16,7 @@ public:
// Do nothing
}
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
static NMSModule* create(const Op* op);
MNN_PUBLIC static NMSModule* create(const Op* op);
private:
NMSModule(){}

View File

@ -61,6 +61,7 @@ public:
/** edge wrapper */
Wrap wrap = CLAMP_TO_EDGE;
bool draw = false;
};
public:
@ -148,6 +149,18 @@ public:
void setPadding(uint8_t value) {
mPaddingValue = value;
}
/**
* @brief draw a color into regions of the image.
* @param img the image to draw on.
* @param w the image's width.
* @param h the image's height.
* @param c the image's channel count.
* @param regions the regions to draw; size is [num * 3], containing num triples of { y, xl, xr }.
* @param num the number of regions.
* @param color the color to draw.
* @return void.
*/
void draw(uint8_t* img, int w, int h, int c, const int* regions, int num, const uint8_t* color);
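A sketch of how the new draw interface might be called, following the comment above: each region is a { y, xl, xr } triple and color carries one value per channel. Whether xr is inclusive is not stated, so it is treated as exclusive here as an assumption, and the ImageProcess instance is taken as already created:
#include <MNN/ImageProcess.hpp>
#include <cstdint>
#include <vector>

// Fill rows 10..19 of a 64x64 RGB buffer with red, using an existing ImageProcess instance
void drawRedStrip(MNN::CV::ImageProcess* proc) {
    const int w = 64, h = 64, c = 3;
    std::vector<uint8_t> img(w * h * c, 0);
    std::vector<int> regions;
    for (int y = 10; y < 20; ++y) {
        regions.push_back(y);  // row index
        regions.push_back(0);  // xl: first column
        regions.push_back(w);  // xr: one past the last column (assumed exclusive)
    }
    const uint8_t red[3] = {255, 0, 0};
    proc->draw(img.data(), w, h, c, regions.data(), static_cast<int>(regions.size() / 3), red);
}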
private:
ImageProcess(const Config& config);
Matrix mTransform;

View File

@ -154,7 +154,7 @@ public:
* @param keySize deprecated, for future use.
*/
void setCacheFile(const char* cacheFile, size_t keySize = 128);
/**
* @brief The API should be called after the last resize of a session.
* If the resize generated new cache info, try to rewrite the cache file.
@ -357,6 +357,12 @@ public:
*/
const char* bizCode() const;
/**
* @brief get model UUID
* @return Model UUID.
*/
const char* uuid() const;
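Minimal usage of the new accessor alongside the existing bizCode() (the model path is hypothetical):
#include <MNN/Interpreter.hpp>
#include <cstdio>

void printModelIds() {
    auto net = MNN::Interpreter::createFromFile("model.mnn");  // hypothetical path
    if (nullptr != net) {
        printf("bizCode: %s, uuid: %s\n", net->bizCode(), net->uuid());
        delete net;
    }
}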
private:
static Interpreter* createFromBufferInternal(Content* net);

View File

@ -70,7 +70,7 @@ public:
return mDebug.get();
}
struct Cache;
class RuntimeManager {
class MNN_PUBLIC RuntimeManager {
public:
~RuntimeManager();
/**

View File

@ -124,6 +124,7 @@ MNN_PUBLIC VARP _ArgMin(VARP input, int axis = 0);
MNN_PUBLIC VARP _BatchMatMul(VARP x, VARP y, bool adj_x = false, bool adj_y = false);
MNN_PUBLIC VARP _UnravelIndex(VARP indices, VARP dims);
MNN_PUBLIC VARP _ScatterNd(VARP indices, VARP updates, VARP shape);
MNN_PUBLIC VARP _ScatterNd(VARP indices, VARP updates, VARP shape, VARP input);
MNN_PUBLIC VARP _OneHot(VARP indices, VARP depth, VARP onValue, VARP offValue, int axis = -1);
MNN_PUBLIC VARP _BroadcastTo(VARP a, VARP shape);
MNN_PUBLIC VARP _LinSpace(VARP start, VARP stop, VARP num);

View File

@ -63,8 +63,11 @@ MNN_PUBLIC VARP _Softsign(VARP features);
MNN_PUBLIC std::vector<VARP> _Split(VARP value, INTS size_splits, int axis = 0);
MNN_PUBLIC VARP _Slice(VARP x, VARP starts, VARP sizes);
MNN_PUBLIC VARP _StridedSlice(VARP input, VARP begin, VARP end, VARP strided,
int32_t beginMask, int32_t endMask, int32_t ellipsisMask,
int32_t newAxisMask, int32_t shrinkAxisMask);
int32_t beginMask, int32_t endMask, int32_t ellipsisMask,
int32_t newAxisMask, int32_t shrinkAxisMask);
MNN_PUBLIC VARP _StridedSliceWrite(VARP input, VARP begin, VARP end, VARP strided, VARP write,
int32_t beginMask, int32_t endMask, int32_t ellipsisMask,
int32_t newAxisMask, int32_t shrinkAxisMask);
MNN_PUBLIC VARP _Concat(VARPS values, int axis);
MNN_PUBLIC VARP _Convert(VARP input, Dimensionformat format);
MNN_PUBLIC VARP _Transpose(VARP x, INTS perm);
@ -155,6 +158,9 @@ MNN_PUBLIC VARP _Select(VARP select, VARP input0, VARP input1);
MNN_PUBLIC std::vector<VARP> _TopKV2(VARP input0, VARP input1);
MNN_PUBLIC VARP _ImageProcess(VARP input, CV::ImageProcess::Config config, CV::Matrix matrix, int oh, int ow, int oc, int dtype, uint8_t padVal = 0);
MNN_PUBLIC VARP _Where(VARP x);
MNN_PUBLIC VARP _Sort(VARP x, int axis = -1, bool arg = false, bool descend = false);
MNN_PUBLIC VARP _Raster(const std::vector<VARP>& vars, const std::vector<int>& regions, const std::vector<int>& shape);
MNN_PUBLIC VARP _Nms(VARP boxes, VARP scores, int maxDetections, float iouThreshold = -1, float scoreThreshold = -1);
} // namespace Express
} // namespace MNN

View File

@ -21,13 +21,13 @@ done
rm -rf $path && mkdir -p $path
PACKAGE_PATH=$(realpath $path)
CMAKE_ARGS="-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON"
CMAKE_ARGS="-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf pymnn_build && mkdir pymnn_build
pushd pymnn_build
cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j24
cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert MNNOpenCV -j24
popd
pushd pymnn/pip_package

View File

@ -19,25 +19,27 @@ while getopts "o:p:v:b" opt; do
esac
done
export MACOSX_DEPLOYMENT_TARGET=10.11
./schema/generate.sh
rm -rf $path && mkdir -p $path
PACKAGE_PATH=$(realpath $path)
CMAKE_ARGS="-DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_EXPR_SHAPE_EAGER=ON -DMNN_TRAIN_DEBUG=ON"
CMAKE_ARGS="-DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_EXPR_SHAPE_EAGER=ON -DMNN_TRAIN_DEBUG=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf pymnn_build && mkdir pymnn_build
pushd pymnn_build
cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j8
cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert MNNOpenCV -j8
popd
pushd pymnn/pip_package
echo -e "__version__ = '$mnn_version'" > MNN/version.py
rm -rf build && mkdir build
rm -rf dist && mkdir dist
if [ -z $python_versions ]; then
if [ -z "$python_versions" ]; then
python build_wheel.py --version $mnn_version
else
for env in $python_versions; do

View File

@ -1,66 +1,63 @@
# MNNPyBridge
# |-- Debug
# | |--- MD
# | |--- MT
# | |--- Static
# |
# |-- Release
# |--- MD
# |--- MT
# |--- Static
# |-- include
# |-- wrapper
# |-- test (Release + Dynamic + MD)
# |-- x64
# |-- x86
# |-- lib
# |-- x64
# | |-- (Debug/Release x Dynamic/Static x MD/MT)
# |
# |-- x86
# |-- (Debug/Release x Dynamic/Static x MD/MT)
Param(
[Parameter(Mandatory=$true)][String]$version,
[Parameter(Mandatory=$true)][String]$pyc_env,
[Parameter(Mandatory=$true)][String]$mnn_path,
[Parameter(Mandatory=$true)][String]$python_path,
[Parameter(Mandatory=$true)][String]$numpy_path,
[Parameter(Mandatory=$true)][String]$path,
[Switch]$train_api,
[Switch]$x86
)
# build process may failed because of lnk1181, but be success when run again
# Run expr, return if success, otherwise try again until try_times
function Retry([String]$expr, [Int]$try_times) {
$cnt = 0
do {
$cnt++
try {
Invoke-Expression $expr
return
} catch { }
} while($cnt -lt $try_times)
throw "Failed: $expr"
# Build according to cmake_cmd; exit 1 when any error occurs
function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja") {
Invoke-Expression $cmake_cmd
# The build may fail because of LNK1181 but succeed when run again
$try_times = 2
if ($LastExitCode -eq 0) {
For ($cnt = 0; $cnt -lt $try_times; $cnt++) {
try {
Invoke-Expression $ninja_cmd
if ($LastExitCode -eq 0) {
return
}
} catch {}
}
}
popd
exit 1
}
$erroractionpreference = "stop"
mkdir -p $path -ErrorAction Ignore
$PACKAGE_PATH = $(Resolve-Path $path).Path
$PACKAGE_LIB_PATH = "$PACKAGE_PATH\lib"
if ($x86) {
$PACKAGE_LIB_PATH = "$PACKAGE_LIB_PATH\x86"
} else {
$PACKAGE_LIB_PATH = "$PACKAGE_LIB_PATH\x64"
}
$MNN_PACKAGE_PATH = $(Resolve-Path $mnn_path).Path
pushd pymnn\3rd_party
Remove-Item MNN -Recurse -ErrorAction Ignore
mkdir -p MNN\lib
cp -r $MNN_PACKAGE_PATH\* MNN\lib
cp -r ..\..\include MNN
popd
$arch = $(If($x86) {"x86"} Else {"x64"})
$PACKAGE_LIB_PATH = "$PACKAGE_PATH/lib/$arch"
$TEST_TOOL_PATH = "$PACKAGE_PATH/test/$arch"
#clear and create package directory
powershell ./schema/generate.ps1
pushd $PACKAGE_PATH
Remove-Item include -Recurse -ErrorAction Ignore
Remove-Item wrapper -Recurse -ErrorAction Ignore
mkdir -p include
mkdir -p wrapper
mkdir -p $PACKAGE_LIB_PATH\Debug\MD -ErrorAction SilentlyContinue
mkdir -p $PACKAGE_LIB_PATH\Debug\MT -ErrorAction SilentlyContinue
mkdir -p $PACKAGE_LIB_PATH\Debug\Static -ErrorAction SilentlyContinue
mkdir -p $PACKAGE_LIB_PATH\Release\MD -ErrorAction SilentlyContinue
mkdir -p $PACKAGE_LIB_PATH\Release\MT -ErrorAction SilentlyContinue
mkdir -p $PACKAGE_LIB_PATH\Release\Static -ErrorAction SilentlyContinue
Remove-Item -Path include, wrapper -Recurse -ErrorAction Ignore
mkdir -p include, wrapper
popd
Remove-Item -Path $PACKAGE_LIB_PATH, $TEST_TOOL_PATH -Recurse -ErrorAction Ignore
mkdir -p $PACKAGE_LIB_PATH, $TEST_TOOL_PATH
pushd $PACKAGE_LIB_PATH
mkdir -p Debug\Dynamic\MD, Debug\Dynamic\MT, Debug\Static\MD, Debug\Static\MT, Release\Dynamic\MD, Release\Dynamic\MT, Release\Static\MD, Release\Static\MT
popd
# assume $PACKAGE_PATH exist
@ -71,8 +68,16 @@ cp -r pymnn\pip_package\MNN pymnn_pyc_tmp
pushd pymnn_pyc_tmp
Remove-Item MNN -Include __pycache__ -Recurse
pushd MNN
rm -r -force tools
(Get-Content __init__.py).replace('from . import tools', '') | Set-Content __init__.py
function Remove([String]$module) {
rm -r -force $module
(Get-Content __init__.py).replace("from . import $module", "") | Set-Content __init__.py
}
Remove "tools"
if (!$train_api) {
Remove "data"
Remove "optim"
}
popd
popd
conda activate $pyc_env
@ -83,59 +88,108 @@ Set-Content -Path pymnn_pyc_tmp\version.py -Value "__version__ = '$version'"
cp -r .\pymnn_pyc_tmp\* $PACKAGE_PATH\wrapper -Force
rm -r -force pymnn_pyc_tmp
$CMAKE_ARGS = "-DPYMNN_USE_ALINNPYTHON=ON -DPYMNN_RUNTIME_CHECK_VM=ON -DPYMNN_EXPR_API=ON -DPYMNN_NUMPY_USABLE=ON -DPYMNN_TRAIN_API=ON"
$mnn_path = $(Resolve-Path $mnn_path).Path
$python_path = $(Resolve-Path $python_path).Path
$numpy_path = $(Resolve-Path $numpy_path).Path
$CMAKE_ARGS = "-DPYMNN_USE_ALINNPYTHON=ON -DPYMNN_RUNTIME_CHECK_VM=ON -DPYMNN_EXPR_API=ON -DPYMNN_NUMPY_USABLE=ON -DPYMNN_BUILD_TEST=OFF"
if ($train_api) {
$CMAKE_ARGS = "$CMAKE_ARGS -DPYMNN_TRAIN_API=ON"
}
$CMAKE_ARGS = "$CMAKE_ARGS -Dmnn_path=$mnn_path -Dpython_path=$python_path -Dnumpy_path=$numpy_path"
Remove-Item pymnn_build -Recurse -ErrorAction Ignore
mkdir pymnn_build
pushd pymnn_build
##### Debug/MT ####
#Remove-Item CMakeCache.txt -ErrorAction Ignore
#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON ../pymnn"
#Retry "ninja" 2
#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\MT
#cp mnnpybridge.dll $PACKAGE_LIB_PATH\Debug\MT
#cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\MT
#rm mnnpybridge.*
function exist([String]$build_type, [String]$lib_type, [String]$crt_type) {
function _exist([String]$lib) {
$lib_dir = "$lib/lib/$arch/$build_type/$lib_type/$crt_type"
return $((Test-Path -Path $lib_dir) -and ((Get-ChildItem -Path "$lib_dir/*" -Include "*.lib").Count -ne 0))
}
return $((_exist $mnn_path) -and (_exist $python_path) -and (_exist $numpy_path))
}
##### Debug/MD ####
#Remove-Item CMakeCache.txt -ErrorAction Ignore
#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF ../pymnn"
#Retry "ninja" 2
#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\MD
#cp mnnpybridge.dll $PACKAGE_LIB_PATH\Debug\MD
#cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\MD
#rm mnnpybridge.*
function log([String]$msg) {
echo "================================"
echo "Build MNNPyBridge $msg"
echo "================================"
}
##### Debug/Static ####
#Remove-Item CMakeCache.txt -ErrorAction Ignore
#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn"
#Retry "ninja" 2
#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\Static
#rm mnnpybridge.*
##### Debug/Dynamic/MT ####
if (exist Debug Dynamic MT) {
log "Debug/Dynamic/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON ../pymnn"
cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\Dynamic\MT
rm mnnpybridge.*
}
##### Release/MT ####
#Remove-Item CMakeCache.txt -ErrorAction Ignore
#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON ../pymnn"
#Retry "ninja" 2
#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\MT
#cp mnnpybridge.dll $PACKAGE_LIB_PATH\Release\MT
#cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\MT
#rm mnnpybridge.*
##### Debug/Dynamic/MD ####
if (exist Debug Dynamic MD) {
log "Debug/Dynamic/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug ../pymnn"
cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\Dynamic\MD
rm mnnpybridge.*
}
##### Release/MD ####
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF ../pymnn"
Retry "ninja" 2
cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\MD
cp mnnpybridge.dll $PACKAGE_LIB_PATH\Release\MD
cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\MD
rm mnnpybridge.*
##### Debug/Static/MT ####
if (exist Debug Static MT) {
log "Debug/Static/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn"
cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\Static\MT
rm mnnpybridge.*
}
##### Release/Static ####
#Remove-Item CMakeCache.txt -ErrorAction Ignore
#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn"
#Retry "ninja" 2
#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\Static
##### Debug/Static/MD ####
if (exist Debug Static MD) {
log "Debug/Static/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn"
cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\Static\MD
rm mnnpybridge.*
}
##### Release/Dynamic/MT ####
if (exist Release Dynamic MT) {
log "Release/Dynamic/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON ../pymnn"
cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MT
rm mnnpybridge.*
}
##### Release/Dynamic/MD ####
if (exist Release Dynamic MD) {
log "Release/Dynamic/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release ../pymnn"
cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MD
#cp mnnpybridge_test.exe $TEST_TOOL_PATH
#cp $mnn_path/lib/$arch/Release/MD/MNN.dll $TEST_TOOL_PATH
#cp $python_path/lib/$arch/Release/MD/python.dll $TEST_TOOL_PATH
#cp $numpy_path/lib/$arch/Release/MD/numpy_python.dll $TEST_TOOL_PATH
rm mnnpybridge.*
}
##### Release/Static/MT ####
if (exist Release Static MT) {
log "Release/Static/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn"
cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\Static\MT
rm mnnpybridge.*
}
##### Release/Static/MD ####
if (exist Release Static MD) {
log "Release/Static/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn"
cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\Static\MD
rm mnnpybridge.*
}
popd

View File

@ -1,49 +1,47 @@
# MNN
# |-- Debug
# | |--- MD
# | |--- MT
# | |--- Static
# |
# |-- Release
# |--- MD
# |--- MT
# |--- Static
# |-- include
# |-- lib
# |-- Debug
# | |--- Dynamic
# | | |--- MD
# | | |--- MT
# | |
# | |--- Static
# | |--- MD
# | |--- MT
# |
# |-- Release
# |--- Dynamic
# | |--- MD
# | |--- MT
# |
# |--- Static
# |--- MD
# |--- MT
#
Param(
[Parameter(Mandatory=$true)][String]$path,
[String]$backends
[String]$backends,
[Switch]$x86
)
# build process may failed because of lnk1181, but be success when run again
# Run expr, return if success, otherwise try again until try_times
function Retry([String]$expr, [Int]$try_times) {
$cnt = 0
do {
$cnt++
try {
Invoke-Expression $expr
return
} catch { }
} while($cnt -lt $try_times)
throw "Failed: $expr"
}
$erroractionpreference = "stop"
Remove-Item $path -Recurse -ErrorAction Ignore
mkdir -p $path
New-Item -Path $path -ItemType Directory -ErrorAction Ignore
$PACKAGE_PATH = $(Resolve-Path $path).Path
$PACKAGE_LIB_PATH = "$PACKAGE_PATH/lib/$(If ($x86) {"x86"} Else {"x64"})"
Remove-Item -Path $PACKAGE_LIB_PATH -Recurse -ErrorAction Ignore
mkdir -p $PACKAGE_LIB_PATH
#clear and create package directory
powershell ./schema/generate.ps1
pushd $PACKAGE_PATH
mkdir -p Debug\MD
mkdir -p Debug\MT
mkdir -p Debug\Static
mkdir -p Release\MD
mkdir -p Release\MT
mkdir -p Release\Static
Remove-Item -Path $PACKAGE_PATH/include -Recurse -ErrorAction Ignore
cp -r include $PACKAGE_PATH
cp -r tools/cv/include/cv $PACKAGE_PATH/include
pushd $PACKAGE_LIB_PATH
mkdir -p Debug\Dynamic\MD, Debug\Dynamic\MT, Debug\Static\MD, Debug\Static\MT, Release\Dynamic\MD, Release\Dynamic\MT, Release\Static\MD, Release\Static\MT
popd
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON"
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_AVX512=ON"
if ($backends -ne $null) {
Foreach ($backend in $backends.Split(",")) {
if ($backend -eq "opencl") {
@ -58,53 +56,83 @@ Remove-Item build -Recurse -ErrorAction Ignore
mkdir build
pushd build
##### Debug/MT ####
function log([String]$msg) {
echo "================================"
echo "Build MNN (CPU $backends) $msg"
echo "================================"
}
# Build according to cmake_cmd; exit 1 when any error occurs
function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja MNN") {
Invoke-Expression $cmake_cmd
# The build may fail because of LNK1181 but succeed when run again
$try_times = 2
if ($LastExitCode -eq 0) {
For ($cnt = 0; $cnt -lt $try_times; $cnt++) {
try {
Invoke-Expression $ninja_cmd
if ($LastExitCode -eq 0) {
return
}
} catch {}
}
}
popd
exit 1
}
##### Debug/Dynamic/MT ####
log "Debug/Dynamic/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON .."
Retry "ninja" 2
cp MNN.lib $PACKAGE_PATH\Debug\MT
cp MNN.dll $PACKAGE_PATH\Debug\MT
cp MNN.pdb $PACKAGE_PATH\Debug\MT
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON .."
cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Debug\Dynamic\MT
rm MNN.*
##### Debug/MD ####
##### Debug/Dynamic/MD ####
log "Debug/Dynamic/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF .."
Retry "ninja" 2
cp MNN.lib $PACKAGE_PATH\Debug\MD
cp MNN.dll $PACKAGE_PATH\Debug\MD
cp MNN.pdb $PACKAGE_PATH\Debug\MD
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF .."
cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Debug\Dynamic\MD
rm MNN.*
##### Debug/Static ####
##### Debug/Static/MT ####
log "Debug/Static/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .."
Retry "ninja" 2
cp MNN.lib $PACKAGE_PATH\Debug\Static
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF .."
cp MNN.lib $PACKAGE_LIB_PATH\Debug\Static\MT
rm MNN.*
##### Release/MT ####
##### Debug/Static/MD ####
log "Debug/Static/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON .."
Retry "ninja" 2
cp MNN.lib $PACKAGE_PATH\Release\MT
cp MNN.dll $PACKAGE_PATH\Release\MT
cp MNN.pdb $PACKAGE_PATH\Release\MT
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .."
cp MNN.lib $PACKAGE_LIB_PATH\Debug\Static\MD
rm MNN.*
##### Release/MD ####
##### Release/Dynamic/MT ####
log "Release/Dynamic/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF .."
Retry "ninja" 2
cp MNN.lib $PACKAGE_PATH\Release\MD
cp MNN.dll $PACKAGE_PATH\Release\MD
cp MNN.pdb $PACKAGE_PATH\Release\MD
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON .."
cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MT
rm MNN.*
##### Release/Static ####
##### Release/Dynamic/MD ####
log "Release/Dynamic/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .."
Retry "ninja" 2
cp MNN.lib $PACKAGE_PATH\Release\Static
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF .."
cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MD
rm MNN.*
##### Release/Static/MT ####
log "Release/Static/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF .."
cp MNN.lib $PACKAGE_LIB_PATH\Release\Static\MT
##### Release/Static/MD ####
log "Release/Static/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .."
cp MNN.lib $PACKAGE_LIB_PATH\Release\Static\MD
popd

View File

@ -1,5 +1,6 @@
Param(
[Parameter(Mandatory=$true)][String]$path,
[Switch]$dynamic_link,
[String]$backends,
[Switch]$build_all,
[Switch]$build_train, # MNN_BUILD_TRAIN
@ -23,20 +24,6 @@ if ($build_all) {
$build_demo = $true
}
# build process may failed because of lnk1181, but be success when run again
# Run expr, return if success, otherwise try again until try_times
function Retry([String]$expr, [Int]$try_times) {
$cnt = 0
do {
$cnt++
try {
Invoke-Expression $expr
return
} catch { }
} while($cnt -lt $try_times)
throw "Failed: $expr"
}
$erroractionpreference = "stop"
Remove-Item $path -Recurse -ErrorAction Ignore
mkdir -p $path
@ -44,7 +31,12 @@ $TOOLS_PATH = $(Resolve-Path $path).Path
powershell ./schema/generate.ps1
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF"
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_AVX512=ON"
if ($dynamic_link) {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_SHARED_LIBS=ON"
} else {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON"
}
if ($build_train) {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_TRAIN=ON"
}
@ -59,6 +51,11 @@ if ($build_evaluation) {
}
if ($build_converter) {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_CONVERTER=ON"
if ($dynamic_link) {
$CMAKE_ARGS = "$CMAKE_ARGS -Dprotobuf_BUILD_SHARED_LIBS=ON"
} else {
$CMAKE_ARGS = "$CMAKE_ARGS -Dprotobuf_BUILD_SHARED_LIBS=OFF"
}
}
if ($build_benchmark) {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_BENCHMARK=ON"
@ -83,37 +80,37 @@ Remove-Item build -Recurse -ErrorAction Ignore
mkdir build
pushd build
# Build according to cmake_cmd; exit 1 when any error occurs
function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja") {
Invoke-Expression $cmake_cmd
# The build may fail because of LNK1181 but succeed when run again
$try_times = 2
if ($LastExitCode -eq 0) {
For ($cnt = 0; $cnt -lt $try_times; $cnt++) {
try {
Invoke-Expression $ninja_cmd
if ($LastExitCode -eq 0) {
return
}
} catch {}
}
}
popd
exit 1
}
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS .."
Retry "ninja" 2
Build "cmake -G Ninja $CMAKE_ARGS .."
$PRODUCTS = ""
if ($build_train) {
$PRODUCTS = "$PRODUCTS transformer.out.exe train.out.exe rawDataTransform.out.exe dataTransformer.out.exe runTrainDemo.out.exe"
}
if ($build_tools) {
$PRODUCTS = "$PRODUCTS MNNV2Basic.out.exe mobilenetTest.out.exe backendTest.out.exe testModel.out.exe testModelWithDescrisbe.out.exe getPerformance.out.exe checkInvalidValue.out.exe timeProfile.out.exe"
}
if ($build_quantools) {
$PRODUCTS = "$PRODUCTS quantized.out.exe quantized_model_optimize.out.exe"
}
if ($build_evaluation) {
$PRODUCTS = "$PRODUCTS classficationTopkEval.out.exe"
}
if ($build_converter) {
$PRODUCTS = "$PRODUCTS MNNDump2Json.exe MNNConvert.exe"
}
if ($build_benchmark) {
$PRODUCTS = "$PRODUCTS benchmark.out.exe benchmarkExprModels.out.exe"
}
if ($build_test) {
$PRODUCTS = "$PRODUCTS run_test.out.exe"
}
if ($build_demo) {
$PRODUCTS = "$PRODUCTS pictureRecognition.out.exe pictureRotate.out.exe multiPose.out.exe segment.out.exe expressDemo.out.exe transformerDemo.out.exe rasterDemo.out.exe"
$PRODUCTS = $(Get-ChildItem -Path . -Filter "*.exe" -Name)
if ($dynamic_link) {
$PRODUCTS = "$PRODUCTS MNN.dll"
if ($build_converter) {
$PRODUCTS = "$PRODUCTS ./3rd_party/protobuf/cmake/libprotobuf.dll"
}
}
Foreach ($PRODUCT in $PRODUCTS.Split(" ")) {
Foreach ($PRODUCT in $PRODUCTS.Trim().Split()) {
Invoke-Expression "cp $PRODUCT $TOOLS_PATH"
}

View File

@ -6,25 +6,28 @@ Param(
[Switch]$x86
)
# build process may failed because of lnk1181, but be success when run again
# Run expr, return if success, otherwise try again until try_times
function Retry([String]$expr, [Int]$try_times) {
$cnt = 0
do {
$cnt++
try {
Invoke-Expression $expr
return
} catch { }
} while($cnt -lt $try_times)
throw "Failed: $expr"
# Build according to cmake_cmd; exit 1 when any error occurs
function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja") {
Invoke-Expression $cmake_cmd
# The build may fail because of LNK1181 but succeed when run again
$try_times = 2
if ($LastExitCode -eq 0) {
For ($cnt = 0; $cnt -lt $try_times; $cnt++) {
try {
Invoke-Expression $ninja_cmd
if ($LastExitCode -eq 0) {
return
}
} catch {}
}
}
exit 1
}
$erroractionpreference = "stop"
$python_versions = $pyenvs.Split(",")
Remove-Item $path -Recurse -ErrorAction Ignore
mkdir -p $path
New-Item -Path $path -ItemType Directory -ErrorAction Ignore
$PACKAGE_PATH = $(Resolve-Path $path).Path
$ARGS = "--version $version"
if ($x86) {
@ -37,7 +40,7 @@ powershell ./schema/generate.ps1
Remove-Item pymnn_build -Recurse -ErrorAction Ignore
mkdir pymnn_build
pushd pymnn_build
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON "
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_AVX512=ON"
if ($backends -ne $null) {
Foreach($backend in $backends.Split(",")) {
if ($backend -eq "opencl") {
@ -47,8 +50,7 @@ if ($backends -ne $null) {
}
}
}
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS .."
Retry "ninja MNN MNNTrain MNNConvert" 2
Build "cmake -G Ninja $CMAKE_ARGS .." "ninja MNN MNNTrain MNNConvert MNNOpenCV"
popd
pushd pymnn/pip_package
@ -59,12 +61,15 @@ mkdir dist
mkdir build
if ($pyenvs -eq $null) {
Retry "python build_wheel.py $ARGS" 2
Invoke-Expression "python build_wheel.py $ARGS"
} else {
Foreach ($env in $pyenvs.Split(",")) {
Invoke-Expression "conda activate $env"
Retry "python build_wheel.py $ARGS" 2
Invoke-Expression "conda deactivate"
Invoke-Expression "python build_wheel.py $ARGS"
conda deactivate
if ($LastExitCode -ne 0) {
exit 1
}
}
}

View File

@ -748,6 +748,7 @@
EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38924643D310062C7A3 /* Arm82Backend.cpp */; };
EBECA3A124643D4E0062C7A3 /* MNNAsmGlobal.h in Headers */ = {isa = PBXBuildFile; fileRef = EBECA3A024643D4E0062C7A3 /* MNNAsmGlobal.h */; };
EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */ = {isa = PBXBuildFile; fileRef = EBECA3A324643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
F41497D7278D8A21004A363A /* RuntimeAttr.hpp in Headers */ = {isa = PBXBuildFile; fileRef = F41497D6278D8A21004A363A /* RuntimeAttr.hpp */; };
F4FB5AD7274E6CC100EAF0C1 /* MNNAESCipher.h in Headers */ = {isa = PBXBuildFile; fileRef = F4FB5AAF274E6CC100EAF0C1 /* MNNAESCipher.h */; };
F4FB5AD8274E6CC100EAF0C1 /* ModelAuth.mm in Sources */ = {isa = PBXBuildFile; fileRef = F4FB5AB0274E6CC100EAF0C1 /* ModelAuth.mm */; };
F4FB5AD9274E6CC100EAF0C1 /* MNNAESCipher.m in Sources */ = {isa = PBXBuildFile; fileRef = F4FB5AB1274E6CC100EAF0C1 /* MNNAESCipher.m */; };
@ -1542,6 +1543,7 @@
EBECA38924643D310062C7A3 /* Arm82Backend.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Backend.cpp; path = ../arm82/Arm82Backend.cpp; sourceTree = "<group>"; };
EBECA3A024643D4E0062C7A3 /* MNNAsmGlobal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = MNNAsmGlobal.h; path = ../arm82/asm/MNNAsmGlobal.h; sourceTree = "<group>"; };
EBECA3A324643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNQuantizeFP16_UNIT4.S; path = ../arm82/asm/arm64/MNNQuantizeFP16_UNIT4.S; sourceTree = "<group>"; };
F41497D6278D8A21004A363A /* RuntimeAttr.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = RuntimeAttr.hpp; sourceTree = "<group>"; };
F4FB5AAF274E6CC100EAF0C1 /* MNNAESCipher.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MNNAESCipher.h; sourceTree = "<group>"; };
F4FB5AB0274E6CC100EAF0C1 /* ModelAuth.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = ModelAuth.mm; sourceTree = "<group>"; };
F4FB5AB1274E6CC100EAF0C1 /* MNNAESCipher.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = MNNAESCipher.m; sourceTree = "<group>"; };
@ -1679,6 +1681,7 @@
48593FB423A89B2F0069452A /* express */ = {
isa = PBXGroup;
children = (
F41497D6278D8A21004A363A /* RuntimeAttr.hpp */,
489D7AC42550FF9F00AD896A /* ExecutorScope.cpp */,
48C84B6F250F711600EE7666 /* module */,
48FA474C23AA136300172C3B /* MergeOptimizer.cpp */,
@ -2951,6 +2954,7 @@
92FF04B423AA0BFB00AC97F6 /* MNNMemoryUtils.h in Headers */,
4D9A937426255BDA00F9B43C /* CoreMLReduction.hpp in Headers */,
48C84B8B250F711700EE7666 /* PipelineModule.hpp in Headers */,
F41497D7278D8A21004A363A /* RuntimeAttr.hpp in Headers */,
92FF04C123AA0BFB00AC97F6 /* Backend.hpp in Headers */,
489D7A812550FDC900AD896A /* MetalPooling.hpp in Headers */,
92FF02A623AA0B5A00AC97F6 /* CPUQuantizedMaxPool.hpp in Headers */,
@ -2985,6 +2989,7 @@
buildConfigurationList = 0F1465BF1FA18D1000F9860A /* Build configuration list for PBXNativeTarget "MNN" */;
buildPhases = (
0F1465B41FA18D1000F9860A /* Headers */,
F48DED4627742886004B8DB0 /* ShellScript */,
0F1465B21FA18D1000F9860A /* Sources */,
0F1465B31FA18D1000F9860A /* Frameworks */,
0F1465B51FA18D1000F9860A /* Resources */,
@ -3091,6 +3096,23 @@
shellPath = /bin/sh;
shellScript = "\necho \"==========\"\necho ${TARGET_NAME}\necho ${PROJECT_FILE_PATH}\necho ${TARGET_BUILD_DIR}\n\ntouch ${TARGET_BUILD_DIR}/MNN.framework/mnn.metallib\ncp ${TARGET_BUILD_DIR}/MNN.framework/mnn.metallib ${TARGET_BUILD_DIR}/Playground.app/\n";
};
F48DED4627742886004B8DB0 /* ShellScript */ = {
isa = PBXShellScriptBuildPhase;
buildActionMask = 2147483647;
files = (
);
inputFileListPaths = (
);
inputPaths = (
);
outputFileListPaths = (
);
outputPaths = (
);
runOnlyForDeploymentPostprocessing = 0;
shellPath = /bin/sh;
shellScript = "# Type a script or drag a script file from your workspace to insert its path.\nMNN_REVISION=`git rev-parse HEAD`\necho \"#define MNN_REVISION \\\"${MNN_REVISION}\\\"\" > ${SRCROOT}/../../include/MNN/VCS.h\n";
};
/* End PBXShellScriptBuildPhase section */
/* Begin PBXSourcesBuildPhase section */
@ -3808,7 +3830,7 @@
CODE_SIGN_STYLE = Automatic;
DEAD_CODE_STRIPPING = YES;
DEFINES_MODULE = YES;
DEVELOPMENT_TEAM = 6T3QR3X696;
DEVELOPMENT_TEAM = UMNWSVYR5X;
DYLIB_COMPATIBILITY_VERSION = 1;
DYLIB_CURRENT_VERSION = 1;
DYLIB_INSTALL_NAME_BASE = "@rpath";
@ -3854,7 +3876,7 @@
METAL_LIBRARY_FILE_BASE = mnn;
ONLY_ACTIVE_ARCH = YES;
OTHER_CFLAGS = "";
PRODUCT_BUNDLE_IDENTIFIER = com.zhaodewang.MNN.yyavdsavds;
PRODUCT_BUNDLE_IDENTIFIER = com.alibaba.MNN;
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
PROVISIONING_PROFILE_SPECIFIER = "";
"PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = "";
@ -3875,7 +3897,7 @@
CODE_SIGN_STYLE = Automatic;
DEAD_CODE_STRIPPING = YES;
DEFINES_MODULE = YES;
DEVELOPMENT_TEAM = 6G7464HHUS;
DEVELOPMENT_TEAM = UMNWSVYR5X;
DYLIB_COMPATIBILITY_VERSION = 1;
DYLIB_CURRENT_VERSION = 1;
DYLIB_INSTALL_NAME_BASE = "@rpath";
@ -3919,7 +3941,7 @@
MACH_O_TYPE = staticlib;
METAL_LIBRARY_FILE_BASE = mnn;
OTHER_CFLAGS = "";
PRODUCT_BUNDLE_IDENTIFIER = com.zhaodewang.MNN.yyavdsavds;
PRODUCT_BUNDLE_IDENTIFIER = com.alibaba.MNN;
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
PROVISIONING_PROFILE_SPECIFIER = "";
"PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = "";
@ -3938,7 +3960,7 @@
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage;
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = 6G7464HHUS;
DEVELOPMENT_TEAM = UMNWSVYR5X;
GCC_ENABLE_CPP_EXCEPTIONS = NO;
GCC_ENABLE_CPP_RTTI = NO;
HEADER_SEARCH_PATHS = (
@ -3963,7 +3985,7 @@
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage;
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = 6G7464HHUS;
DEVELOPMENT_TEAM = UMNWSVYR5X;
GCC_ENABLE_CPP_EXCEPTIONS = NO;
GCC_ENABLE_CPP_RTTI = NO;
HEADER_SEARCH_PATHS = (

View File

@ -9,37 +9,50 @@
#import "AppDelegate.h"
#import "MNNTestSuite.h"
#include <MNN/MNNForwardType.h>
#include <MNN/Interpreter.hpp>
#import <MNN/expr/Executor.hpp>
#import "benchmark.h"
@implementation AppDelegate
- (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
#define UNITTEST
#ifdef UNITTEST
// unittest
{
MNN::BackendConfig config;
// If want to test metal, change MNN_FORWARD_CPU to MNN_FORWARD_METAL
MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 1);
int precisionInTestUtil =
getTestPrecision(MNN_FORWARD_CPU, config.precision, MNN::Express::Executor::getGlobalExecutor()->getCurrentRuntimeStatus(MNN::STATUS_SUPPORT_FP16));
MNNTestSuite::runAll(precisionInTestUtil);
}
#endif
#ifdef BENCHMARK
// benchmark
{
auto bundle = CFBundleGetMainBundle();
auto url = CFBundleCopyBundleURL(bundle);
auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle);
CFRelease(url);
auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8);
auto res = std::string(cstring) + "/models";
CFRelease(string);
iosBenchAll(res.c_str());
}
#endif
//#define UNITTEST
//#ifdef UNITTEST
// // unittest
// {
// MNN::BackendConfig config;
// // If want to test metal, change MNN_FORWARD_CPU to MNN_FORWARD_METAL
// MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 1);
// int precisionInTestUtil =
// getTestPrecision(MNN_FORWARD_CPU, config.precision, MNN::Express::Executor::getGlobalExecutor()->getCurrentRuntimeStatus(MNN::STATUS_SUPPORT_FP16));
// MNNTestSuite::runAll(precisionInTestUtil);
// }
//#endif
//#ifdef BENCHMARK
// // benchmark
// {
// auto bundle = CFBundleGetMainBundle();
// auto url = CFBundleCopyBundleURL(bundle);
// auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle);
// CFRelease(url);
// auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8);
// auto res = std::string(cstring) + "/models";
// CFRelease(string);
// iosBenchAll(res.c_str());
// }
//#endif
auto bundle = CFBundleGetMainBundle();
auto url = CFBundleCopyBundleURL(bundle);
auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle);
CFRelease(url);
auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8);
auto res = std::string(cstring) + "/models/mobilenet_v2_auth.mnn";
MNN::Interpreter* interpreter = MNN::Interpreter::createFromFile(res.c_str());
MNN::ScheduleConfig config;
interpreter->createSession(config);
return YES;
}

View File

@ -3,6 +3,7 @@
cmake_minimum_required(VERSION 3.4.1)
project(mnnpybridge)
# python_path / numpy_path / mnn_path
option(DEPEND_AAPL_FMWK "use dependency library .framework instead of traditional .a/.dylib" OFF)
option(MNN_BUILD_SHARED_LIBS "MNN build shared or static lib" ON)
option(MNN_WIN_RUNTIME_MT "MNN use /MT on Windows dll" OFF)
@ -12,8 +13,17 @@ option(PYMNN_NEW_PYTHON "AliNNPython new version (when PYMNN_RUNTIME_CHECK_VM=OF
option(PYMNN_EXPR_API "MNN expr API be exposed" ON)
option(PYMNN_NUMPY_USABLE "Build based on numpy" ON)
option(PYMNN_TRAIN_API "MNN train API be exposed" OFF)
option(PYMNN_INTERNAL_SERVING "Internal use only." OFF)
if(PYMNN_INTERNAL_SERVING)
file(GLOB_RECURSE SRC ${CMAKE_CURRENT_LIST_DIR}/src/MNN.cc
${CMAKE_CURRENT_LIST_DIR}/src/internal/monitor_service.cc
${CMAKE_CURRENT_LIST_DIR}/src/internal/verify_service.cc
${CMAKE_CURRENT_LIST_DIR}/src/internal/http_util.cc)
else()
file(GLOB_RECURSE SRC ${CMAKE_CURRENT_LIST_DIR}/src/MNN.cc)
endif()
file(GLOB_RECURSE SRC ${CMAKE_CURRENT_LIST_DIR}/src/MNN.cc)
if (MNN_BUILD_SHARED_LIBS)
add_library(mnnpybridge SHARED ${SRC})
else()
@ -39,6 +49,11 @@ if(PYMNN_TRAIN_API)
target_compile_definitions(mnnpybridge PRIVATE PYMNN_TRAIN_API)
endif()
if(PYMNN_INTERNAL_SERVING)
message(STATUS "mnnpybridge define PYMNN_INTERNAL_SERVING")
target_compile_definitions(mnnpybridge PRIVATE PYMNN_INTERNAL_SERVING)
endif()
if(CMAKE_SYSTEM_NAME MATCHES "^Android")
add_definitions(-DMNN_USE_LOGCAT)
endif()
@ -59,8 +74,8 @@ if(MSVC)
endif()
endif ()
endforeach()
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4005 /wd4267")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4005 /wd4267")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4005 /wd4267 /experimental:preprocessor")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4005 /wd4267 /experimental:preprocessor")
SET(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
SET(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
SET(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
@ -73,20 +88,24 @@ endif()
if(PYMNN_TRAIN_API)
set(MNN_DIR ${CMAKE_CURRENT_LIST_DIR}/..)
target_include_directories(mnnpybridge PRIVATE
${MNN_DIR}/tools/train/source/grad ${MNN_DIR}/tools/train/source/optimizer ${MNN_DIR}/tools/train/source/transformer
${MNN_DIR}/tools/train/source/data ${MNN_DIR}/schema/current ${MNN_DIR}/3rd_party/flatbuffers/include)
${MNN_DIR}/tools/train/source/grad ${MNN_DIR}/tools/train/source/optimizer ${MNN_DIR}/tools/train/source/transformer ${MNN_DIR}/tools/train/source/nn
${MNN_DIR}/tools/train/source/data ${MNN_DIR}/schema/current ${MNN_DIR}/3rd_party/flatbuffers/include ${MNN_DIR}/tools/cv/include
${MNN_DIR}/express ${MNN_DIR}/express/module ${MNN_DIR}/tools)
endif()
if(WIN32 OR APPLE OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
set(DEPEND_PATH "${CMAKE_CURRENT_LIST_DIR}/3rd_party")
set(LIB_SUBPATH "")
if(WIN32)
if(NOT MNN_BUILD_SHARED_LIBS)
set(LIB_SUBPATH "Static")
elseif(MNN_WIN_RUNTIME_MT)
set(LIB_SUBPATH "MT")
if (MNN_BUILD_SHARED_LIBS)
set(LIB_SUBPATH "Dynamic")
else()
set(LIB_SUBPATH "MD")
set(LIB_SUBPATH "Static")
endif()
if (MNN_WIN_RUNTIME_MT)
set(LIB_SUBPATH "${LIB_SUBPATH}/MT")
else()
set(LIB_SUBPATH "${LIB_SUBPATH}/MD")
endif()
elseif(APPLE)
if(MNN_BUILD_SHARED_LIBS)
@ -108,34 +127,23 @@ if(WIN32 OR APPLE OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
endif()
endif()
target_include_directories(mnnpybridge PRIVATE ${CMAKE_CURRENT_LIST_DIR}/src ${DEPEND_PATH}/MNN/include)
target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/MNN/lib/${LIB_SUBPATH})
if(APPLE AND DEPEND_AAPL_FMWK)
target_link_libraries(mnnpybridge PRIVATE "-framework MNN")
set_target_properties(mnnpybridge PROPERTIES LINK_FLAGS "-Wl,-F${DEPEND_PATH}/MNN/lib/${LIB_SUBPATH}")
else()
target_link_libraries(mnnpybridge PRIVATE MNN)
find_library(MNN NAMES MNN REQUIRED PATHS ${mnn_path}/lib/${LIB_SUBPATH})
if(NOT DEPEND_AAPL_FMWK)
target_include_directories(mnnpybridge PUBLIC ${mnn_path}/include)
endif()
target_link_libraries(mnnpybridge PUBLIC ${MNN})
if(PYMNN_USE_ALINNPYTHON)
target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/include)
target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/lib/${LIB_SUBPATH})
if(APPLE AND DEPEND_AAPL_FMWK)
target_link_libraries(mnnpybridge PRIVATE "-framework python")
set_target_properties(mnnpybridge PROPERTIES LINK_FLAGS "-Wl,-F${DEPEND_PATH}/AliNNPython/lib/${LIB_SUBPATH}")
else()
target_link_libraries(mnnpybridge PRIVATE python)
endif()
find_library(python NAMES python REQUIRED PATHS ${python_path}/lib/${LIB_SUBPATH})
if(NOT DEPEND_AAPL_FMWK)
target_include_directories(mnnpybridge PUBLIC ${python_path}/include)
endif()
target_link_libraries(mnnpybridge PUBLIC ${python})
if(PYMNN_NUMPY_USABLE)
target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/numpy/include)
target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/numpy/lib/${LIB_SUBPATH})
if(APPLE AND DEPEND_AAPL_FMWK)
target_link_libraries(mnnpybridge PRIVATE "-framework numpy_python")
set_target_properties(mnnpybridge PROPERTIES LINK_FLAGS "-Wl,-F${DEPEND_PATH}/numpy/lib/${LIB_SUBPATH}")
else()
target_link_libraries(mnnpybridge PRIVATE numpy_python)
find_library(numpy NAMES numpy_python REQUIRED PATHS ${numpy_path}/lib/${LIB_SUBPATH})
if(NOT DEPEND_AAPL_FMWK)
target_include_directories(mnnpybridge PUBLIC ${numpy_path}/include)
endif()
target_link_libraries(mnnpybridge PUBLIC ${numpy})
endif()
else()
target_include_directories(mnnpybridge PRIVATE ${MNN_DIR}/pymnn/src ${MNN_DIR}/pymnn/android/src/main/c/include)

View File

@ -13,7 +13,7 @@ def inference():
config['precision'] = 'low'
session = interpreter.createSession()
input_tensor = interpreter.getSessionInput(session)
image = cv2.imread('ILSVRC2012_val_00049999.JPEG')
image = cv2.imread('0000.jpg')
# cv2 reads images in BGR format
image = image[..., ::-1]
# convert to RGB format

View File

@ -1,11 +1,22 @@
from _mnncengine.cv import *
import _mnncengine.cv as _F
import MNN.numpy as _np
import MNN
def __to_int(x):
dtype = x.dtype
if dtype == _np.int32:
return x
return x.astype(_np.int32)
def resize(src, dsize=None, fx=None, fy=None, interpolation=INTER_LINEAR, code = None, mean=[], norm=[]):
if dsize is None and fx is None and fy is None:
        raise ValueError('resize must set dsize or fx, fy.')
if dsize is None: dsize = [0, 0]
if fx is None: fx = 0
if fy is None: fy = 0
if code is None: code = -1
else: code = hash(code)
return _F.resize(src, dsize, fx, fy, interpolation, code, mean, norm)
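# A minimal usage sketch (the file name is illustrative; imread assumes an
# imgcodecs-enabled build):
#   >>> import MNN.cv as cv
#   >>> img = cv.imread('input.jpg')
#   >>> out = cv.resize(img, [224, 224])             # resize to an explicit size
#   >>> out = cv.resize(img, None, fx=0.5, fy=0.5)   # or scale by factors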
def copyTo(src, mask=None, dst=None):
if mask is None: return src.copy()
origin_dtype = src.dtype
@ -45,3 +56,33 @@ def hconcat(src):
return _np.concatenate(src, 1)
def vconcat(src):
return _np.concatenate(src, 0)
def mean(src, mask=None):
if mask is not None:
src = copyTo(src, mask)
res = _np.mean(src, [0, 1])
if res.ndim == 0: size = 0
else: size = res.shape[0]
if size < 4:
res = _np.pad(res, [0, 4 - size])
return res
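# Like OpenCV's Scalar, the result is zero-padded to 4 values; a minimal sketch
# (img is assumed to be an HWC image Var, e.g. from cv.imread):
#   >>> m = cv.mean(img)    # 4-element Var of per-channel means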
def flip(src, flipCode):
h, w, c = src.shape
m = MNN.CVMatrix()
if flipCode < 0:
m.write([-1., 0., w-1., 0., -1., h-1.])
elif flipCode == 0:
m.write([1., 0., 0., 0., -1., h-1.])
else:
m.write([-1., 0., w-1., 0., 1., 0.])
return warpAffine(src, m, [w, h])
ROTATE_90_CLOCKWISE = 0
ROTATE_180 = 1
ROTATE_90_COUNTERCLOCKWISE = 2
def rotate(src, rotateMode):
if rotateMode == ROTATE_90_CLOCKWISE:
return flip(src.transpose([1, 0, 2]), 1)
if rotateMode == ROTATE_180:
return flip(src, -1)
if rotateMode == ROTATE_90_COUNTERCLOCKWISE:
return flip(src.transpose([1, 0, 2]), 0)
return src
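# A minimal sketch of the affine-based flip/rotate helpers above
# (img is assumed to be an HWC image Var, e.g. from cv.imread):
#   >>> flipped = cv.flip(img, 1)                         # flipCode > 0: horizontal flip
#   >>> rotated = cv.rotate(img, cv.ROTATE_90_CLOCKWISE)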

View File

@ -9,23 +9,26 @@ import _mnncengine._expr as _F
_numpy_supported = False
try:
import numpy as np
_numpy_supported = True
_numpy_supported = (type(np.arange(10)) == np.ndarray)
except Exception:
print ("Numpy not found. Using MNN without numpy.")
def scalar(value, dtype=None):
if dtype == _F.int:
value = _Int(value)
elif dtype == _F.float:
value = _Float(value)
if dtype is not None:
if dtype == _F.int or dtype == _F.uint8:
value = _Int(value)
elif dtype == _F.float:
value = _Float(value)
return _F.const([value], [], _F.NCHW, dtype)
if type(value) == type(1):
res = _F.const([value], [], _F.NCHW, _F.int)
return res
return _F.const([value], [], _F.NCHW, _F.int)
elif type(value) == type(1.):
res = _F.const([value], [], _F.NCHW, _F.float)
return res
return _F.const([value], [], _F.NCHW, _F.float)
else:
raise NotImplementedError("not supported data type for creating scalar variable")
def _list_shape_type(object, shape=()):
if isinstance(object, _Sequence) and len(object) == 0:
return [0], _F.float
if not isinstance(object, _Sequence):
if type(object) in (type(1), type(1<<64)):
dst_type = _F.int
@ -54,6 +57,7 @@ def _can_broadcast(src_shape, dst_shape):
return True
def _match_dtype(x, y, dtype=None):
def type_val(x):
if x is None: return -1
if x == _F.double: return 4
if x == _F.float: return 3
if x == _F.int64: return 2
@ -76,15 +80,18 @@ def _to_var(x, dtype=None):
return scalar(x, dtype)
# 2. numpy
if _numpy_supported:
if isinstance(x, np.ndarray): # convert numpy ndarray to MNN var
if x.dtype.kind == 'i':
x = x.astype(np.int32)
x = _F.const(x, x.shape, dtype=_F.int)
elif x.dtype.kine == 'f':
x = x.astype(np.float32)
x = _F.const(x, x.shape, dtype=_F.float)
else:
raise ValueError('Just support i/f dtype numpy.')
try:
if isinstance(x, np.ndarray): # convert numpy ndarray to MNN var
if x.dtype.kind == 'i':
x = x.astype(np.int32)
x = _F.const(x, x.shape, dtype=_F.int)
elif x.dtype.kind == 'f':
x = x.astype(np.float32)
x = _F.const(x, x.shape, dtype=_F.float)
else:
raise ValueError('Just support i/f dtype numpy.')
except:
pass
# 3. Sequence
if isinstance(x, _Sequence) and x:
dst_shape, item_type = _list_shape_type(x)
@ -202,7 +209,7 @@ def floor(x):
>>> expr.floor([-5.1, 4.5])
var([-6., 4.])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.floor(x)
def round(x):
'''
@ -223,7 +230,7 @@ def round(x):
>>> expr.round([-5.1, 4.5])
var([-5., 5.])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.round(x)
def ceil(x):
'''
@ -243,7 +250,7 @@ def ceil(x):
>>> expr.ceil([-4.9, 4.5])
var([-4., 5.])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.ceil(x)
def square(x):
'''
@ -283,7 +290,7 @@ def sqrt(x):
>>> expr.sqrt([9., 4.5])
var([3., 2.1213202])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.sqrt(x)
def rsqrt(x):
'''
@ -303,7 +310,7 @@ def rsqrt(x):
>>> expr.rsqrt([9., 4.5])
var([0.33333334, 0.47140455])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.rsqrt(x)
def exp(x):
'''
@ -323,7 +330,7 @@ def exp(x):
>>> expr.exp([9., 4.5])
var([8102.449, 90.01698])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.exp(x)
def log(x):
'''
@ -343,7 +350,7 @@ def log(x):
>>> expr.log([9., 4.5])
var([2.1972246, 1.5040774])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.log(x)
def sin(x):
'''
@ -363,7 +370,7 @@ def sin(x):
>>> expr.sin([9., 4.5])
var([0.4121185, -0.9775301])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.sin(x)
def sinh(x):
'''
@ -384,7 +391,7 @@ def sinh(x):
>>> expr.sinh([9., 4.5])
var([4051.542, 45.00301])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.sinh(x)
def cos(x):
'''
@ -404,7 +411,7 @@ def cos(x):
>>> expr.cos([9., 4.5])
var([-0.91113025, -0.2107958])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.cos(x)
def cosh(x):
'''
@ -425,7 +432,7 @@ def cosh(x):
>>> expr.cosh([9., 4.5])
var([4051.542, 45.014122])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.cosh(x)
def tan(x):
'''
@ -445,7 +452,7 @@ def tan(x):
>>> expr.tan([9., 4.5])
var([-0.45231566, 4.637332])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.tan(x)
def tanh(x):
'''
@ -466,7 +473,7 @@ def tanh(x):
>>> expr.tanh([9., 4.5])
var([1., 0.9997533])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.tanh(x)
def asin(x):
'''
@ -487,7 +494,7 @@ def asin(x):
>>> expr.asin([9., 0.5])
var([nan, 0.5235988])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.asin(x)
def asinh(x):
'''
@ -508,7 +515,7 @@ def asinh(x):
>>> expr.asinh([9., 0.5])
var([2.893444, 0.4812118])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.asinh(x)
def acos(x):
'''
@ -529,7 +536,7 @@ def acos(x):
>>> expr.asin([9., 0.5])
var([nan, 1.0471975])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.acos(x)
def acosh(x):
'''
@ -550,7 +557,7 @@ def acosh(x):
>>> expr.acosh([9., 0.5])
var([2.887271, nan])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.acosh(x)
def atan(x):
'''
@ -571,7 +578,7 @@ def atan(x):
>>> expr.atan([9., 0.5])
var([1.4601392, 0.4636476])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.atan(x)
def atanh(x):
'''
@ -592,7 +599,7 @@ def atanh(x):
>>> expr.atanh([9., 0.5])
var([1.4601392, 0.4636476])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.atanh(x)
def reciprocal(x):
'''
@ -612,7 +619,7 @@ def reciprocal(x):
>>> expr.reciprocal([9., 0.5])
var([0.11111111, 2.])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.reciprocal(x)
def log1p(x):
'''
@ -632,7 +639,7 @@ def log1p(x):
>>> expr.log1p([9., 0.5])
var([2.3025851, 0.4054651])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.log1p(x)
def gelu(x):
'''
@ -652,7 +659,7 @@ def gelu(x):
>>> expr.gelu([9., 0.5])
var([9., 0.345714])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.gelu(x)
def sigmoid(x):
'''
@ -672,16 +679,16 @@ def sigmoid(x):
>>> expr.sigmoid([9., 0.5])
var([0.9998766, 0.62246716])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.sigmoid(x)
def erf(x):
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.erf(x)
def erfc(x):
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.erfc(x)
def erfinv(x):
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.erfinv(x)
def expm1(x):
'''
@ -701,7 +708,7 @@ def expm1(x):
>>> expr.expm1([9., 0.5])
var([8.1014492e+03, 6.4869785e-01])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.expm1(x)
def add(x, y):
'''
@ -1479,8 +1486,8 @@ def matmul(a, b, transposeA=False, transposeB=False):
var([[0., 1.],
[0., 3.]], dtype=float32)
'''
a = _to_var(a, True)
b = _to_var(b, True)
a = _to_var(a, _F.float)
b = _to_var(b, _F.float)
return _F.matmul(a, b, transposeA, transposeB)
def normalize(x, acrossSpatial, channelShared, eps, scale):
'''
@ -3055,7 +3062,7 @@ def zeros_like(x):
Example:
-------
>>> expr.zeros_like([[1, 2], [3, 4]])
array([[0, 0],
var([[0, 0],
[0, 0]], dtype=int32)
'''
x = _to_var(x)
@ -3078,14 +3085,72 @@ def range(start, limit, delta):
Example:
-------
>>> expr.range(1.0, 7.0, 2.0)
array([1., 3., 5.], dtype=float32)
var([1., 3., 5.], dtype=float32)
'''
start = _to_var(start)
limit = _to_var(limit)
delta = _to_var(delta)
if limit.dtype != start.dtype or delta.dtype != start.dtype:
print(start, limit, delta)
raise RuntimeError("parameter start/limit/delta must use same data type, either all int or all float")
return _F.range(start, limit, delta)
def sort(x, axis=-1, arg=False, descend=False):
'''
sort(x, axis=-1, arg=False, descend=False)
Return the sorted array of ``x``.
Parameters
----------
x : var_like, input value.
    axis : int, the axis along which to sort.
    arg : bool, return the sorting indices (argsort) instead of the sorted values; default is False.
    descend : bool, sort in descending order; default is False.
Returns
-------
sorted_res : Var.
Example:
-------
>>> expr.sort([[5, 0], [1, 3]])
var([[1, 0],
[5, 3]], dtype=int32)
'''
x = _to_var(x)
    # sort modifies its input, so operate on a copy
x = clone(x, True)
return _F.sort(x, axis, arg, descend)
def nms(boxes, scores, max_detections, iou_threshold=-1.0, score_threshold=-1.0):
'''
nms(boxes, scores, max_detections, iou_threshold=-1.0, score_threshold=-1.0)
Return the nms array of ``boxes``.
Parameters
----------
boxes : var_like, input value, shape must be [num, 4].
scores : var_like, input value, shape must be [num].
max_detections : int.
iou_threshold : float, default is 0.
score_threshold : float, default is float_min.
Returns
-------
nms_res : Var.
Example:
-------
>>> expr.nms([[1, 1, 4, 4], [0, 0, 3, 3], [5, 5, 7, 7]], [0.9, 0.5, 0.1], 3, 0.1)
var([0, 2], dtype=int32)
'''
boxes = _to_var(boxes, _F.float)
scores = _to_var(scores, _F.float)
max_detections = _to_int(max_detections)
iou_threshold = _to_float(iou_threshold)
score_threshold = _to_float(score_threshold)
res = _F.nms(boxes, scores, max_detections, iou_threshold, score_threshold)
idx = res >= 0
idx.fix_as_const()
if _F.reduce_any(idx).read_as_tuple()[0] == 0:
return _F.const([], [0], NCHW, _F.int)
return res[idx]
# TODO: detection_post_process
# wrapper for builtin functions end

View File

@ -19,6 +19,16 @@ inf = float('inf')
# helper functions
def __not_impl(*args):
    raise NotImplementedError('MNN.numpy does not implement this function yet.')
def __get_arg(kargs, key, default=None):
if key in kargs: return kargs[key]
return default
def __get_shape(args):
if type(args) not in (tuple, list):
return [args]
elif len(args) == 1 and type(args[0]) in (tuple, list):
return args[0]
else:
return args
def __order_assert(order):
if order is not None and order not in 'CK':
raise RuntimeError("MNN.numpy just support order=\"C|K\"")
@ -89,6 +99,7 @@ def identity(n, dtype=float32):
return eye(n, dtype=dtype)
def full(shape, fill_value, dtype=None, order='C'):
__order_assert(order)
shape = __get_shape(shape)
return _F.fill(_F._to_var(shape), _F.scalar(fill_value, dtype))
def full_like(a, fill_value, dtype=None, order='K', subok=True, shape=None):
dst_dtype, dst_shape = __array_like_type(a, dtype, order, shape)
@ -165,10 +176,14 @@ def __arange_3(start, stop, step=1, dtype=None):
def __arange_1(stop, dtype=None):
return __arange_3(0, stop, 1, dtype)
def arange(*args, **kargs):
if 'dtype' in kargs: dtype=kargs['dtype']
else: dtype = None
if len(args) == 1:
dtype = __get_arg(kargs, 'dtype')
step = __get_arg(kargs, 'step')
stop = __get_arg(kargs, 'stop')
start = __get_arg(kargs, 'start')
if len(args) == 1 and stop is None and step is None:
return __arange_1(args[0], dtype)
if len(args) == 2 and step is not None:
return __arange_3(*args, step=step, dtype=dtype)
if len(args) == 4:
return __arange_3(*args)
return __arange_3(*args, dtype=dtype)
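# A minimal sketch of the accepted call forms (np is MNN.numpy):
#   >>> np.arange(5)              # 0, 1, 2, 3, 4
#   >>> np.arange(1., 7., 2.)     # 1., 3., 5.
#   >>> np.arange(2, 10, step=2)  # 2, 4, 6, 8 (keyword step)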
@ -189,7 +204,26 @@ def geomspace(start, stop, num=50, endpoint=True, dtype=None, axis=0):
base = pow(stop / _F._Float(start), 1./ num)
start = math.log(start, base)
return logspace(start, _F._Float(num), num, endpoint, base, dtype, axis)
def meshgrid(xi, copy=True, sparse=False, indexing='xy'): __not_impl()
def meshgrid(*xi, **kwargs):
copy = __get_arg(kwargs, 'copy', True)
sparse = __get_arg(kwargs, 'sparse', False)
indexing = __get_arg(kwargs, 'indexing', 'xy')
ndim = len(xi)
if indexing not in ['xy', 'ij']:
raise ValueError("Valid values for `indexing` are 'xy' and 'ij'.")
s0 = (1,) * ndim
output = [asanyarray(x).reshape(s0[:i] + (-1,) + s0[i + 1:]) for i, x in enumerate(xi)]
if indexing == 'xy' and ndim > 1:
# switch first and second axis
output[0] = swapaxes(output[0], 0, 1)
output[1] = swapaxes(output[1], 0, 1)
if not sparse:
# Return the full N-D matrix (not only the 1-D vector)
output = broadcast_arrays(*output)
if copy:
output = [x.copy() for x in output]
return output
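# A minimal sketch, mirroring numpy.meshgrid (np is MNN.numpy):
#   >>> x, y = np.meshgrid(np.arange(3), np.arange(2))
#   >>> # with the default 'xy' indexing, x and y are both broadcast to shape (2, 3)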
# 4. Building matrices
def diag(v, k=0):__not_impl()
def diagflat(v, k=0):__not_impl()
@ -212,11 +246,11 @@ def copyto(dst, src, casting='same_kind', where=True):
def shape(a):
return tuple(a.shape)
# 2. Changing array shape
def reshape(a, newshape, order='C'):
__order_assert(order)
def reshape(a, *newshape):
newshape = __get_shape(newshape)
return _F.reshape(a, newshape)
def ravel(a, order='C'):
return reshape(a, [-1], order)
return reshape(a, [-1])
# 3. Transpose-like operations
def moveaxis(a, source, destination):
ndim = a.ndim
@ -431,7 +465,9 @@ right_shift = packbits = unpackbits = binary_repr = base_repr = __not_impl
# String operations [Not Impl]
# Indexing routines
# 1. Generating index arrays
def where(condition, x, y):
def where(condition, x=None, y=None):
if x is None and y is None:
return nonzero(condition)
return _F.select(condition, x, y)
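# A minimal sketch of both call forms (np is MNN.numpy; np.array is assumed to
# behave like its NumPy counterpart):
#   >>> a = np.array([0, 3, 0, 5])
#   >>> np.where(a)              # single argument: behaves like nonzero(a)
#   >>> np.where(a > 0, a, -a)   # three arguments: elementwise select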
def indices(dimensions, dtype=int32, sparse=False):__not_impl()
def ix_(*args):__not_impl()
@ -546,6 +582,7 @@ arccosh = _F.acosh
arctanh = _F.atanh
around = _F.round
round_ = _F.round
round = _F.round
rint = _F.round
fix = _F.round
floor = _F.floor
@ -685,9 +722,12 @@ def pad(array, pad_width, mode='constant'):
return _F.pad(array, pad_width, mode)
# Sorting, searching, and counting
# 1. Sorting
def sort(a, axis=- 1, kind=None, order=None):__not_impl()
def lexsort(keys, axis=-1):__not_impl()
def argsort(a, axis=-1, kind=None, order=None): __not_impl()
def sort(a, axis=- 1, kind=None, order=None):
return _F.sort(a, axis)
def lexsort(keys, axis=-1):
return sort(keys, axis)
def argsort(a, axis=-1, kind=None, order=None):
return _F.sort(a, axis, True)
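# A minimal sketch (np is MNN.numpy; np.array is assumed available):
#   >>> a = np.array([[5, 0], [1, 3]])
#   >>> np.sort(a)       # sorted values along the last axis
#   >>> np.argsort(a)    # indices that would sort along the last axis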
def msort(a): return sort(a, axis=0)
def sort_complex(a): __not_impl()
def partition(a, kth, axis=- 1, kind='introselect', order=None): __not_impl()
@ -704,6 +744,7 @@ def argwhere(a):
mask = not_equal(a, _F.scalar(0, a.dtype))
return _F.where(mask)
def nonzero(a):
res = _F.where(a)
res = argwhere(a)
if a.ndim == 1:
return (ravel(res),)
@ -762,6 +803,13 @@ corrcoef = correlate = cov = __not_impl
histogram = histogram2d = histogramdd = bincount = histogram_bin_edges = digitize = __not_impl
# numpy ndarray functions
def __item(self, idx):
if type(idx) == type(1):
return ravel(self)[idx]
elif type(idx) == tuple:
return self[idx]
else:
raise ValueError('item arg must be int or tuple.')
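# A minimal sketch of the item() accessor registered below (np is MNN.numpy;
# np.array is assumed available):
#   >>> v = np.array([[1, 2], [3, 4]])
#   >>> v.item(3)        # flat index -> 4
#   >>> v.item((1, 0))   # tuple index -> 3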
__override_operator(_F.Var, "all", all)
__override_operator(_F.Var, "any", any)
__override_operator(_F.Var, "argmax", argmax)
@ -793,6 +841,7 @@ __override_operator(_F.Var, "sum", sum)
__override_operator(_F.Var, "swapaxes", swapaxes)
__override_operator(_F.Var, "transpose", transpose)
__override_operator(_F.Var, "var", var)
__override_operator(_F.Var, "item", __item)
from . import random
from . import linalg

View File

@ -15,6 +15,10 @@ USE_TRT=False
if len(sys.argv) > 1 and sys.argv[1] == '-trt':
USE_TRT=True
IS_INTERNAL_BUILD = False
if os.path.isdir('../../schema/private'):
IS_INTERNAL_BUILD = True
def build_deps():
""" build depency """
root_dir = os.path.dirname(os.path.dirname(os.getcwd()))
@ -31,15 +35,16 @@ def build_deps():
elif IS_LINUX:
extra_opts = '-DMNN_TENSORRT=ON \
-DCMAKE_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/ ' if USE_TRT else ' '
extra_opts += ' -DMNN_INTERNAL=ON ' if IS_INTERNAL_BUILD else ' '
os.system('cmake ' + extra_opts +
'-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release\
-DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \
-DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF .. && make MNN MNNTrain MNNConvert MNNOpenCV -j4')
-DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF .. && make MNN MNNTrain MNNConvert -j4')
else:
os.system('cmake -DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release\
-DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_EXPR_SHAPE_EAGER=ON -DMNN_TRAIN_DEBUG=ON\
-DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \
.. && make MNN MNNTrain MNNConvert MNNOpenCV -j4')
.. && make MNN MNNTrain MNNConvert -j4')
################################################################################
# Building dependent libraries
################################################################################

View File

@ -8,6 +8,10 @@ parser.add_argument('--x86', dest='x86', action='store_true', default=False,
help='build wheel for 32bit arch, only usable on windows')
parser.add_argument('--version', dest='version', type=str, required=True,
help='MNN dist version')
parser.add_argument('--serving', dest='serving', action='store_true', default=False,
help='build for internal serving, default False')
parser.add_argument('--env', dest='env', type=str, required=False,
                    help='build environment, e.g. daily/pre/production')
args = parser.parse_args()
import os
@ -23,6 +27,8 @@ if __name__ == '__main__':
comm_args = '--version ' + args.version
if IS_LINUX:
comm_args += ' --plat-name=manylinux1_x86_64'
comm_args += ' --env ' + args.env if args.env else ''
comm_args += ' --serving' if args.serving else ''
if IS_WINDOWS:
os.putenv('DISTUTILS_USE_SDK', '1')
os.putenv('MSSdk', '1')

View File

@ -10,6 +10,10 @@ parser.add_argument('--x86', dest='x86', action='store_true', default=False,
help='build wheel for 32bit arch, only usable on windows')
parser.add_argument('--version', dest='version', type=str, required=True,
help='MNN dist version')
parser.add_argument('--serving', dest='serving', action='store_true', default=False,
help='build for internal serving, default False')
parser.add_argument('--env', dest='env', type=str, required=False,
                    help='build environment, e.g. daily/pre/production')
args, unknown = parser.parse_known_args()
sys.argv = [sys.argv[0]] + unknown
@ -27,7 +31,7 @@ IS_WINDOWS = (platform.system() == 'Windows')
IS_DARWIN = (platform.system() == 'Darwin')
IS_LINUX = (platform.system() == 'Linux')
BUILD_DIR = 'pymnn_build'
BUILD_TYPE = 'RELEASE'
BUILD_TYPE = 'REL_WITH_DEB_INFO'
BUILD_ARCH = 'x64'
if args.x86:
BUILD_ARCH = ''
@ -42,10 +46,12 @@ def report(*args):
package_name = 'MNN'
USE_TRT=check_env_flag('USE_TRT')
IS_INTERNAL_BUILD = False
print ("USE_TRT ", USE_TRT)
if os.path.isdir('../../schema/private'):
IS_INTERNAL_BUILD = True
if USE_TRT:
print("Build Internal NNN with TRT")
package_name = 'MNN_Internal_TRT'
@ -81,16 +87,19 @@ def configure_extension_build():
# extra_link_args = ['/NODEFAULTLIB:LIBCMT.LIB']
# /MD links against DLL runtime
# and matches the flags set for protobuf and ONNX
# /Z7 turns on symbolic debugging information in .obj files
    # /Zi turns on symbolic debugging information in a separate .pdb (which is the same as MNN.pdb)
# /EHa is about native C++ catch support for asynchronous
# structured exception handling (SEH)
# /DNOMINMAX removes builtin min/max functions
# /wdXXXX disables warning no. XXXX
extra_compile_args = ['/MT', '/Z7',
    # Some macros (related to __VA_ARGS__) defined in pymnn/src/util.h cannot be processed correctly
    # because of an MSVC bug; enabling /experimental:preprocessor fixes it (requires Windows SDK >= 10.0.18362.1)
extra_compile_args = ['/MT', '/Zi',
'/EHa', '/DNOMINMAX',
'/wd4267', '/wd4251', '/wd4522', '/wd4522', '/wd4838',
'/wd4305', '/wd4244', '/wd4190', '/wd4101', '/wd4996',
'/wd4275']
'/wd4275', '/experimental:preprocessor']
extra_link_args = []
else:
extra_link_args = []
extra_compile_args = [
@ -115,7 +124,11 @@ def configure_extension_build():
]
if check_env_flag('WERROR'):
extra_compile_args.append('-Werror')
extra_compile_args += ['-DPYMNN_EXPR_API', '-DPYMNN_NUMPY_USABLE', '-DPYMNN_OPENCV_API', '-DPYMNN_IMGCODECS']
if IS_LINUX and IS_INTERNAL_BUILD and args.serving:
extra_compile_args += ['-DPYMNN_INTERNAL_SERVING']
if args.env == 'daily':
extra_compile_args += ['-DPYMNN_INTERNAL_SERVING_DAILY']
root_dir = os.getenv('PROJECT_ROOT', os.path.dirname(os.path.dirname(os.getcwd())))
engine_compile_args = ['-DBUILD_OPTYPE', '-DPYMNN_TRAIN_API']
engine_libraries = []
@ -123,13 +136,21 @@ def configure_extension_build():
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "train")]
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "cv")]
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "tensorrt")]
print(engine_library_dirs)
if USE_TRT:
# Note: TensorRT-5.1.5.0/lib should be set in $LIBRARY_PATH of the build system.
engine_library_dirs += ['/usr/local/cuda/lib64/']
# Logging is enabled on Linux. Add the dependencies.
if IS_LINUX and IS_INTERNAL_BUILD:
engine_library_dirs += ['/usr/include/curl/']
print(engine_library_dirs)
engine_link_args = []
engine_sources = [os.path.join(root_dir, "pymnn", "src", "MNN.cc")]
if IS_LINUX and IS_INTERNAL_BUILD and args.serving:
engine_sources += [os.path.join(root_dir, "pymnn", "src", "internal", "monitor_service.cc")]
engine_sources += [os.path.join(root_dir, "pymnn", "src", "internal", "verify_service.cc")]
engine_sources += [os.path.join(root_dir, "pymnn", "src", "internal", "http_util.cc")]
engine_include_dirs = [os.path.join(root_dir, "include")]
engine_include_dirs += [os.path.join(root_dir, "express")]
engine_include_dirs += [os.path.join(root_dir, "express", "module")]
@ -146,13 +167,19 @@ def configure_extension_build():
engine_include_dirs += [os.path.join(root_dir, "schema", "current")]
engine_include_dirs += [os.path.join(root_dir, "3rd_party",\
"flatbuffers", "include")]
if IS_LINUX and IS_INTERNAL_BUILD and args.serving:
engine_include_dirs += [os.path.join(root_dir, "3rd_party", "rapidjson")]
# cv include
engine_include_dirs += [os.path.join(root_dir, "tools", "cv", "include")]
engine_include_dirs += [np.get_include()]
trt_depend = ['-lTRT_CUDA_PLUGIN', '-lnvinfer', '-lnvparsers', '-lnvinfer_plugin', '-lcudart']
engine_depend = ['-lMNN']
engine_depend = ['-lMNN', '-lMNNOpenCV']
# enable logging & model authentication on linux.
if IS_LINUX and IS_INTERNAL_BUILD:
engine_depend += ['-lcurl', '-lssl', '-lcrypto']
if USE_TRT:
engine_depend += trt_depend
@ -167,6 +194,9 @@ def configure_extension_build():
# Note: TensorRT-5.1.5.0/lib should be set in $LIBRARY_PATH of the build system.
tools_library_dirs += ['/usr/local/cuda/lib64/']
if IS_LINUX and IS_INTERNAL_BUILD:
tools_library_dirs += ['/usr/include/curl/']
tools_link_args = []
tools_sources = [os.path.join(root_dir, "pymnn", "src", "MNNTools.cc")]
tools_sources += [os.path.join(root_dir, "tools", "quantization",\
@ -195,61 +225,67 @@ def configure_extension_build():
tools_include_dirs += [os.path.join(root_dir, "source")]
tools_include_dirs += [np.get_include()]
tools_depend = ['-lMNN', '-lMNNConvertDeps', '-lprotobuf']
# enable logging and model authentication on linux.
if IS_LINUX and IS_INTERNAL_BUILD:
tools_depend += ['-lcurl', '-lssl', '-lcrypto']
if USE_TRT:
tools_depend += trt_depend
engine_extra_link_args = []
tools_extra_link_args = []
if IS_DARWIN:
engine_extra_link_args += ['-Wl,-all_load']
engine_extra_link_args += engine_depend
engine_extra_link_args += ['-Wl,-noall_load']
engine_link_args += ['-Wl,-all_load']
engine_link_args += engine_depend
engine_link_args += ['-Wl,-noall_load']
if IS_LINUX:
engine_extra_link_args += ['-Wl,--whole-archive']
engine_extra_link_args += engine_depend
engine_extra_link_args += ['-fopenmp']
engine_extra_link_args += ['-Wl,--no-whole-archive']
engine_link_args += ['-Wl,--whole-archive']
engine_link_args += engine_depend
engine_link_args += ['-fopenmp']
engine_link_args += ['-Wl,--no-whole-archive']
if IS_WINDOWS:
engine_extra_link_args += ['/WHOLEARCHIVE:MNN.lib']
engine_link_args += ['/WHOLEARCHIVE:MNN.lib']
if IS_DARWIN:
tools_extra_link_args += ['-Wl,-all_load']
tools_extra_link_args += tools_depend
tools_extra_link_args += ['-Wl,-noall_load']
tools_link_args += ['-Wl,-all_load']
tools_link_args += tools_depend
tools_link_args += ['-Wl,-noall_load']
if IS_LINUX:
tools_extra_link_args += ['-Wl,--whole-archive']
tools_extra_link_args += tools_depend
tools_extra_link_args += ['-fopenmp']
tools_extra_link_args += ['-Wl,--no-whole-archive']
tools_extra_link_args += ['-lz']
tools_link_args += ['-Wl,--whole-archive']
tools_link_args += tools_depend
tools_link_args += ['-fopenmp']
tools_link_args += ['-Wl,--no-whole-archive']
tools_link_args += ['-lz']
if IS_WINDOWS:
tools_extra_link_args += ['/WHOLEARCHIVE:MNN.lib']
tools_extra_link_args += ['/WHOLEARCHIVE:MNNConvertDeps.lib']
tools_link_args += ['/WHOLEARCHIVE:MNN.lib']
tools_link_args += ['/WHOLEARCHIVE:MNNConvertDeps.lib']
        tools_link_args += ['libprotobuf.lib'] # using /WHOLEARCHIVE here would cause LNK1241 (version.rc specified)
if BUILD_TYPE == 'DEBUG':
        # Need pythonxx_d.lib, which does not seem to exist in miniconda?
if IS_WINDOWS:
extra_link_args.append('/DEBUG:FULL')
extra_compile_args += ['/DEBUG', '/UNDEBUG', '/DDEBUG', '/Od', '/Ob0', '/MTd']
extra_link_args += ['/DEBUG', '/UNDEBUG', '/DDEBUG', '/Od', '/Ob0', '/MTd']
else:
extra_compile_args += ['-O0', '-g']
extra_link_args += ['-O0', '-g']
if BUILD_TYPE == 'REL_WITH_DEB_INFO':
if IS_WINDOWS:
extra_link_args.append('/DEBUG:FULL')
extra_compile_args += ['/DEBUG']
extra_link_args += ['/DEBUG', '/OPT:REF', '/OPT:ICF']
else:
extra_compile_args += ['-g']
extra_link_args += ['-g']
# compat with py39
def make_relative_rpath(path):
""" make rpath """
if IS_DARWIN:
return '-Wl,-rpath,@loader_path/' + path
return ['-Wl,-rpath,@loader_path/' + path]
elif IS_WINDOWS:
return ''
return []
else:
return '-Wl,-rpath,$ORIGIN/' + path
return ['-Wl,-rpath,$ORIGIN/' + path]
################################################################################
# Declare extensions and package
@ -263,8 +299,8 @@ def configure_extension_build():
extra_compile_args=engine_compile_args + extra_compile_args,\
include_dirs=engine_include_dirs,\
library_dirs=engine_library_dirs,\
extra_link_args=engine_extra_link_args + engine_link_args\
+ [make_relative_rpath('lib')])
extra_link_args=engine_link_args + extra_link_args\
+ make_relative_rpath('lib'))
extensions.append(engine)
tools = Extension("_tools",\
libraries=tools_libraries,\
@ -273,8 +309,8 @@ def configure_extension_build():
extra_compile_args=tools_compile_args + extra_compile_args,\
include_dirs=tools_include_dirs,\
library_dirs=tools_library_dirs,\
extra_link_args=tools_extra_link_args +tools_link_args\
+ [make_relative_rpath('lib')])
extra_link_args=tools_link_args + extra_link_args\
+ make_relative_rpath('lib'))
extensions.append(tools)
# These extensions are built by cmake and copied manually in build_extensions()
# inside the build_ext implementation

View File

@ -19,7 +19,9 @@ static int tls_key_2 = 0;
#include <MNN/expr/ExecutorScope.hpp>
#include <MNN/expr/Module.hpp>
using namespace MNN::Express;
#ifdef PYMNN_OPENCV_API
#include "cv/cv.hpp"
#endif
#endif // PYMNN_EXPR_API
#ifdef BUILD_OPTYPE
@ -64,6 +66,12 @@ using RegularizationMethod = ParameterOptimizer::RegularizationMethod;
#endif
#endif
#ifdef PYMNN_INTERNAL_SERVING
#include <MNN/AutoTime.hpp>
#include "internal/monitor_service.h"
#include "internal/verify_service.h"
#endif
struct MNN_TLSData {
PyObject *PyMNNHalideTypeInt = NULL;
PyObject *PyMNNHalideTypeInt64 = NULL;
@ -187,6 +195,10 @@ static PyObject* PyMNNInterpreter_new(struct _typeobject *type, PyObject *args,
static int PyMNNInterpreter_init(PyMNNInterpreter *self, PyObject *args, PyObject *kwds);
static void PyMNNInterpreter_dealloc(PyMNNInterpreter *);
#ifdef PYMNN_INTERNAL_SERVING
static PyObject* PyMNNInterpreter_createSessionWithToken(PyMNNInterpreter *self, PyObject *args);
#endif
static PyMethodDef PyMNNInterpreter_methods[] = {
{"createRuntime", (PyCFunction)PyMNNInterpreter_createRuntime, METH_VARARGS | METH_STATIC, "create runtime"},
{"createSession", (PyCFunction)PyMNNInterpreter_createSession, METH_VARARGS, "create session"},
@ -205,6 +217,9 @@ static PyMethodDef PyMNNInterpreter_methods[] = {
{"cache", (PyCFunction)PyMNNInterpreter_cache, METH_VARARGS, "cache current net instance"},
{"removeCache", (PyCFunction)PyMNNInterpreter_removeCache, METH_VARARGS, "remove cache with given path"},
{"updateSessionToModel", (PyCFunction)PyMNNInterpreter_updateSessionToModel, METH_VARARGS, "updateSessionToModel"},
#ifdef PYMNN_INTERNAL_SERVING
{"createSessionWithToken", (PyCFunction)PyMNNInterpreter_createSessionWithToken, METH_VARARGS, "create session with token"},
#endif
{NULL} /* Sentinel */
};
@ -681,13 +696,7 @@ static PyObject* PyMNNInterpreter_createRuntime(PyObject* self, PyObject* args)
return res;
}
static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject *args) {
PyMNNInterpreter* instance = (PyMNNInterpreter *)self;
PyObject* dict = NULL, *rtinfo_py = NULL;
if (!PyArg_ParseTuple(args, "|OO", &dict, &rtinfo_py)) {
return NULL;
}
static PyObject* createSession(PyMNNInterpreter *self, PyObject* dict, PyObject *rtinfo_py) {
PyObject *f = importName("MNN", "Session");
if (!f || !PyCallable_Check(f)) {
PyErr_SetString(PyExc_Exception,
@ -715,10 +724,10 @@ static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject
}
Session* s;
if (rtinfo_py == NULL) {
s = instance->interpreter->createSession(config.second.first);
s = self->interpreter->createSession(config.second.first);
} else {
auto runtimeinfo = *(RuntimeInfo*)PyCapsule_GetPointer(rtinfo_py, NULL);
s = instance->interpreter->createSession(config.second.first, runtimeinfo);
s = self->interpreter->createSession(config.second.first, runtimeinfo);
}
if (!s) {
PyErr_SetString(PyExc_Exception,
@ -727,11 +736,54 @@ static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject
}
session->session = s;
session->modelPath = instance->modelPath;
session->modelPath = self->modelPath;
return (PyObject *)session;
}
static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject *args) {
#ifdef PYMNN_INTERNAL_SERVING
PyErr_SetString(PyExc_Exception,
"PyMNNInterpreter_createSession: unsupported interface, should use createSessionWithToken.");
return NULL;
#endif
PyMNNInterpreter* instance = (PyMNNInterpreter *)self;
PyObject* dict = NULL, *rtinfo_py = NULL;
if (!PyArg_ParseTuple(args, "|OO", &dict, &rtinfo_py)) {
return NULL;
}
return createSession(instance, dict, rtinfo_py);
}
#ifdef PYMNN_INTERNAL_SERVING
static PyObject* PyMNNInterpreter_createSessionWithToken(PyMNNInterpreter *self, PyObject *args) {
PyMNNInterpreter* instance = (PyMNNInterpreter *)self;
PyObject* dict = NULL, *rtinfo_py = NULL;
char *token = NULL;
char *scene = NULL;
char *app_key = NULL;
if (!PyArg_ParseTuple(args, "sss|OO", &token, &scene, &app_key, &dict, &rtinfo_py)) {
return NULL;
}
if (!token || !scene || !app_key) {
PyErr_SetString(PyExc_Exception,
"PyMNNInterpreter_createSessionWithToken: input invalid, token, scene or app_key is null.");
return NULL;
}
bool ret = VerifyService::GetInstance().VerifyToken(std::string(token), std::string(scene), std::string(app_key));
if (!ret) {
PyErr_SetString(PyExc_Exception,
"PyMNNNN_load_module_from_file_with_token: check token failed, return null session.");
return NULL;
}
return createSession(instance, dict, rtinfo_py);
}
#endif
static PyObject* PyMNNInterpreter_resizeSession(PyMNNInterpreter *self, PyObject *args) {
PyMNNSession* session = NULL;
if (!PyArg_ParseTuple(args, "O", &session)) {
@ -826,12 +878,27 @@ static PyObject* PyMNNInterpreter_runSession(PyMNNInterpreter *self, PyObject *a
}
ErrorCode r = NO_ERROR;
Py_BEGIN_ALLOW_THREADS
#ifdef PYMNN_INTERNAL_SERVING
Timer timer;
r = self->interpreter->runSession(session->session);
float cost_time = (float)timer.durationInUs() / (float)1000;
MNN::Interpreter::SessionInfoCode info_type = MNN::Interpreter::BACKENDS;
int backendType[MNN_FORWARD_ALL];
self->interpreter->getSessionInfo(session->session, info_type, backendType);
std::string mBizCode = self->interpreter->bizCode() ? self->interpreter->bizCode() : "";
std::string mUuid = self->interpreter->uuid() ? self->interpreter->uuid() : "";
MonitorService::GetInstance().Track(cost_time, std::to_string(*backendType), "RUN_SESSION",
"PyMNNInterpreter_runSession", std::to_string(r), mBizCode, mUuid);
#else
r = self->interpreter->runSession(session->session);
#endif
Py_END_ALLOW_THREADS
return PyLong_FromLong(r);
}
static PyMNNTensor* getTensor() {
PyMNNTensor *tensor = (PyMNNTensor *)PyObject_Call((PyObject*)&PyMNNTensorType, PyTuple_New(0), NULL);
PyMNNTensor *tensor = (PyMNNTensor *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNNTensorType), PyTuple_New(0), NULL);
if (tensor) {
tensor->tensor = nullptr;
}
@ -1222,6 +1289,12 @@ static int PyMNNInterpreter_init(PyMNNInterpreter *self, PyObject *args, PyObjec
return -1;
}
#ifdef PYMNN_INTERNAL_SERVING
    // initialize MonitorService and VerifyService
MonitorService::GetInstance().Start();
VerifyService::GetInstance().Start();
#endif
return 0;
}
@ -1315,7 +1388,7 @@ static PyObject* PyMNNSession_removeCache(PyMNNSession *self, PyObject *args) {
/// MNN Tensor implementation
bool isTensor(PyObject* t) {
return PyObject_IsInstance(t, (PyObject*)&PyMNNTensorType);
return PyObject_IsInstance(t, (PyObject*)PyType_FindTLSType(&PyMNNTensorType));
}
Tensor* toTensor(PyObject* t) {
return ((PyMNNTensor*)t)->tensor;
@ -1337,17 +1410,32 @@ static void PyMNNTensor_dealloc(PyMNNTensor *self) {
static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) {
int argc = PyTuple_Size(args);
PyObject *shape, *dataType, *data = nullptr, *input_tensor = nullptr;
long dimensionType;
PyObject *shape, *dataType, *data = nullptr, *input_tensor = nullptr, *input_var = nullptr;
long dimensionType = -1;
bool parse_res = false;
switch (argc) {
case 0:
            // just return; used in `PyMNNInterpreter_getSessionInputAll`
return 0;
#ifdef PYMNN_EXPR_API
case 1:
parse_res = PyArg_ParseTuple(args, "O", &input_var)
&& isVar(input_var);
break;
case 2:
parse_res = PyArg_ParseTuple(args, "Ol", &input_tensor, &dimensionType)
&& (isTensor(input_tensor) || isVar(input_tensor));
if (isVar(input_tensor)) {
input_var = input_tensor;
input_tensor = nullptr;
}
break;
#else
case 2:
parse_res = PyArg_ParseTuple(args, "Ol", &input_tensor, &dimensionType)
&& isTensor(input_tensor);
break;
#endif
case 3:
parse_res = PyArg_ParseTuple(args, "OOl", &shape, &dataType, &dimensionType)
&& isInts(shape);
@ -1361,11 +1449,35 @@ static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) {
}
if (!parse_res) {
PyMNN_ERROR_LOG("Tensor init require args as belows:\n"
"\t1. (Tensor, DimensionType)\n"
"\t0. (Var)\n"
"\t1. (Tensor/Var, DimensionType)\n"
"\t2. ([int], DataType, DimensionType)\n"
"\t3. ([int], DataType, tuple/ndarray, DimensionType)\n");
return -1;
}
#ifdef PYMNN_EXPR_API
// 0. create Tensor by Var
if (input_var) {
auto var = toVar(input_var);
auto info = var->getInfo();
void* ptr = const_cast<void*>(var->readMap<void>());
Tensor::DimensionType type = Tensor::TENSORFLOW;
if (dimensionType < 0) {
if (info->order == NCHW) type = Tensor::CAFFE;
else if (info->order == NC4HW4) type = Tensor::CAFFE_C4;
} else {
type = static_cast<Tensor::DimensionType>(dimensionType);
}
Tensor *tensor = Tensor::create(info->dim, info->type, ptr, type);
if (!tensor) {
PyMNN_ERROR_LOG("PyMNNTensor_create: Tensor create failed");
return -1;
}
self->tensor = tensor;
self->owner = 2;
return 0;
}
#endif
// 1. create Tensor by Tensor
if (input_tensor) {
Tensor *tensor = new Tensor(toTensor(input_tensor), (Tensor::DimensionType)dimensionType, true);
@ -1809,8 +1921,12 @@ static PyObject* PyMNNCVImageProcess_convert(PyMNNCVImageProcess *self, PyObject
return NULL;
}
if (PyLong_Check(source)) {
ErrorCode ret = self->imageProcess->convert(reinterpret_cast<const uint8_t *>(PyLong_AsLong(source)),
if (isInt(source)) {
auto ptr = PyLong_AsVoidPtr(source);
if (ptr == NULL) {
Py_RETURN_NONE;
}
ErrorCode ret = self->imageProcess->convert(reinterpret_cast<const uint8_t *>(ptr),
iw, ih, stride,
((PyMNNTensor *)dest)->tensor);
return PyLong_FromLong(ret);
@ -1949,46 +2065,70 @@ static PyObject* PyMNNCVImageProcess_setPadding(PyMNNCVImageProcess *self, PyObj
/// MNN CVMatrix implementation
bool isMatrix(PyObject* obj) {
return PyObject_IsInstance(obj, (PyObject*)&PyMNNCVMatrixType);
return PyObject_IsInstance(obj, (PyObject*)PyType_FindTLSType(&PyMNNCVMatrixType));
}
CV::Matrix toMatrix(PyObject* obj) {
return *(((PyMNNCVMatrix*)obj)->matrix);
}
PyObject* toPyObj(CV::Matrix m) {
PyMNNCVMatrix *ret = (PyMNNCVMatrix *)PyObject_Call((PyObject*)&PyMNNCVMatrixType, PyTuple_New(0), NULL);
PyMNNCVMatrix *ret = (PyMNNCVMatrix *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNNCVMatrixType), PyTuple_New(0), NULL);
ret->matrix = new CV::Matrix();
*(ret->matrix) = m;
return (PyObject*)ret;
}
bool isSize(PyObject* obj) {
return (isInts(obj) && toInts(obj).size() == 2);
}
CV::Size toSize(PyObject* obj) {
auto vals = toInts(obj);
MNN_ASSERT(val.size() == 2);
return CV::Size(vals[0], vals[1]);
}
bool isPoint(PyObject* obj) {
return (isFloats(obj) && toFloats(obj).size() == 2);
return (isFloats(obj) && toFloats(obj).size() == 2) ||
(isInts(obj) && toInts(obj).size() == 2);
}
CV::Point toPoint(PyObject* obj) {
auto vals = toFloats(obj);
MNN_ASSERT(val.size() == 2);
CV::Point point;
point.set(vals[0], vals[1]);
if (isFloats(obj)) {
auto vals = toFloats(obj);
        MNN_ASSERT(vals.size() == 2);
point.set(vals[0], vals[1]);
} else if (isInts(obj)) {
auto vals = toInts(obj);
        MNN_ASSERT(vals.size() == 2);
point.set(vals[0], vals[1]);
}
return point;
}
bool isPoints(PyObject* obj) {
return (isFloats(obj) && toFloats(obj).size() % 2 == 0);
return (isFloats(obj) && toFloats(obj).size() % 2 == 0) ||
(isInts(obj) && toInts(obj).size() % 2 == 0) || isVar(obj);
}
std::vector<CV::Point> toPoints(PyObject* obj) {
auto vals = toFloats(obj);
MNN_ASSERT(val.size() % 2 == 0);
std::vector<CV::Point> points(vals.size() / 2);
for (int i = 0; i < points.size(); i++) {
points[i].set(vals[i*2], vals[i*2+1]);
if (isFloats(obj)) {
auto vals = toFloats(obj);
MNN_ASSERT(vals.size() % 2 == 0);
std::vector<CV::Point> points(vals.size() / 2);
for (int i = 0; i < points.size(); i++) {
points[i].set(vals[i*2], vals[i*2+1]);
}
return points;
}
return points;
if (isInts(obj)) {
auto vals = toInts(obj);
MNN_ASSERT(vals.size() % 2 == 0);
std::vector<CV::Point> points(vals.size() / 2);
for (int i = 0; i < points.size(); i++) {
points[i].set(vals[i*2], vals[i*2+1]);
}
return points;
}
if (isVar(obj)) {
auto vals = toVar(obj);
auto size = vals->getInfo()->size;
MNN_ASSERT(size % 2 == 0);
std::vector<CV::Point> points(size / 2);
auto ptr = vals->readMap<float>();
for (int i = 0; i < points.size(); i++) {
points[i].set(ptr[i*2], ptr[i*2+1]);
}
return points;
}
return {};
}
PyObject* toPyObj(std::vector<CV::Point> _points) {
std::vector<float> points(_points.size() * 2);
@ -2494,7 +2634,7 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) {
PyErr_SetString(PyExc_Exception, "initMNN.expr: PyType_Ready PyMNNVarType failed");
ERROR_RETURN
}
PyModule_AddObject(expr_module, "Var", (PyObject *)&PyMNNVarType);
PyModule_AddObject(expr_module, "Var", (PyObject *)PyType_FindTLSType(&PyMNNVarType));
// def enum
def_data_format(expr_module);
def_dtype(expr_module);
@ -2547,6 +2687,7 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) {
def_ThresholdTypes(cv_module);
def_RetrievalModes(cv_module);
def_ContourApproximationModes(cv_module);
def_LineTypes(cv_module);
// add methods of cv
constexpr int cv_method_num = sizeof(PyMNNCV_methods) / sizeof(PyMethodDef);
for (int i = 0; i < cv_method_num; i++) {
@ -2571,6 +2712,10 @@ void loadMNN() {
WeImport_AppendInittab(MOD_NAME, MOD_INIT_FUNC);
});
}
void* memoryToVar(const void* ptr, int h, int w, int c, int type) {
auto var = Express::_Const(ptr, {h, w, c}, NHWC, dtype2htype(static_cast<DType>(type)));
return reinterpret_cast<void*>(toPyObj(var));
}
static auto registerMNN = []() {
loadMNN();
return true;

View File

@ -17,4 +17,12 @@
#define PYMNN_PUBLIC
#endif // WIN32
extern "C" PYMNN_PUBLIC void loadMNN();
// memoryToVar's type define
#define TypeFloat 1
#define TypeDouble 2
#define TypeInt 3
#define TypeUint8 4
#define TypeInt8 6
#define TypeInt64 9
extern "C" PYMNN_PUBLIC void loadMNN();
extern "C" PYMNN_PUBLIC void* memoryToVar(void* ptr, int h, int w, int c, int type);

View File

@ -99,10 +99,22 @@ def_enum(ContourApproximationModes, CV::ContourApproximationModes,
CV::CHAIN_APPROX_TC89_L1, "CHAIN_APPROX_TC89_L1",
CV::CHAIN_APPROX_TC89_KCOS, "CHAIN_APPROX_TC89_KCOS"
)
def_enum(LineTypes, CV::LineTypes,
CV::FILLED, "FILLED",
CV::LINE_4, "LINE_4",
CV::LINE_8, "LINE_8",
CV::LINE_AA, "LINE_AA"
)
// helper functions
INTS default_size = {0, 0}, default_param = {};
bool isSize(PyObject* obj);
CV::Size toSize(PyObject* obj);
bool isSize(PyObject* obj) {
return (isInts(obj) && toInts(obj).size() == 2);
}
CV::Size toSize(PyObject* obj) {
auto vals = toInts(obj);
    MNN_ASSERT(vals.size() == 2);
return CV::Size(vals[0], vals[1]);
}
bool isPoint(PyObject* obj);
CV::Point toPoint(PyObject* obj);
bool isPoints(PyObject* obj);
@ -378,24 +390,28 @@ static PyObject* PyMNNCV_invertAffineTransform(PyObject *self, PyObject *args) {
}
PyMNN_ERROR("invertAffineTransform require args: (Matrix)");
}
std::vector<float> default_floats = {};
static PyObject* PyMNNCV_resize(PyObject *self, PyObject *args) {
PyObject *src, *dsize, *interpolation = toPyObj(CV::INTER_LINEAR);
PyObject *src, *dsize, *interpolation = toPyObj(CV::INTER_LINEAR),
*mean = toPyObj(default_floats), *norm = toPyObj(default_floats);
float fx = 0, fy = 0;
if (PyArg_ParseTuple(args, "OO|ffO", &src, &dsize, &fx, &fy, &interpolation) &&
isVar(src) && isSize(dsize) && isInterpolationFlags(interpolation)) {
return toPyObj(CV::resize(toVar(src), toSize(dsize), fx, fy, toEnum<CV::InterpolationFlags>(interpolation)));
int code = -1;
if (PyArg_ParseTuple(args, "OO|ffOiOO", &src, &dsize, &fx, &fy, &interpolation, &code, &mean, &norm) &&
isVar(src) && isSize(dsize) && isInterpolationFlags(interpolation) && isFloats(mean) && isFloats(norm)) {
return toPyObj(CV::resize(toVar(src), toSize(dsize), fx, fy, toEnum<CV::InterpolationFlags>(interpolation), code, toFloats(mean), toFloats(norm)));
}
PyMNN_ERROR("resize require args: (Var, [int], |float, float, InterpolationFlags)");
PyMNN_ERROR("resize require args: (Var, [int], |float, float, InterpolationFlags, int, [float], [float])");
}
static PyObject* PyMNNCV_warpAffine(PyObject *self, PyObject *args) {
PyObject *src, *M, *dsize, *flag = toPyObj(CV::INTER_LINEAR), *borderMode = toPyObj(CV::BORDER_CONSTANT);
int borderValue = 0;
if (PyArg_ParseTuple(args, "OOO|OOi", &src, &M, &dsize, &flag, &borderMode, &borderValue) &&
isVar(src) && isMatrix(M) && isSize(dsize) && isInterpolationFlags(flag) && isBorderTypes(borderMode)) {
PyObject *src, *M, *dsize, *flag = toPyObj(CV::INTER_LINEAR), *borderMode = toPyObj(CV::BORDER_CONSTANT),
*mean = toPyObj(default_floats), *norm = toPyObj(default_floats);
int borderValue = 0, code = -1;
if (PyArg_ParseTuple(args, "OOO|OOiiOO", &src, &M, &dsize, &flag, &borderMode, &borderValue, &code, &mean, &norm) &&
isVar(src) && isMatrix(M) && isSize(dsize) && isInterpolationFlags(flag) && isBorderTypes(borderMode) && isFloats(mean) && isFloats(norm)) {
return toPyObj(CV::warpAffine(toVar(src), toMatrix(M), toSize(dsize),
toEnum<CV::InterpolationFlags>(flag), toEnum<CV::BorderTypes>(borderMode), borderValue));
toEnum<CV::InterpolationFlags>(flag), toEnum<CV::BorderTypes>(borderMode), borderValue, code, toFloats(mean), toFloats(norm)));
}
PyMNN_ERROR("warpAffine require args: (Var, Matrix, [int], |InterpolationFlags, BorderTypes, int)");
PyMNN_ERROR("warpAffine require args: (Var, Matrix, [int], |InterpolationFlags, BorderTypes, int, int, [float], [float])");
}
static PyObject* PyMNNCV_warpPerspective(PyObject *self, PyObject *args) {
PyObject *src, *M, *dsize, *flag = toPyObj(CV::INTER_LINEAR), *borderMode = toPyObj(CV::BORDER_CONSTANT);
@ -433,7 +449,7 @@ static PyObject* PyMNNCV_findContours(PyObject *self, PyObject *args) {
auto contours = CV::findContours(toVar(image), toEnum<CV::RetrievalModes>(mode),
toEnum<CV::ContourApproximationModes>(method), toPoint(offset));
PyObject* obj = PyTuple_New(2);
PyTuple_SetItem(obj, 0, toPyObj<std::vector<CV::Point>, toPyObj>(contours));
PyTuple_SetItem(obj, 0, toPyObj<VARP, toPyObj>(contours));
PyTuple_SetItem(obj, 1, toPyObj("no hierarchy"));
return obj;
}
@ -442,24 +458,29 @@ static PyObject* PyMNNCV_findContours(PyObject *self, PyObject *args) {
static PyObject* PyMNNCV_contourArea(PyObject *self, PyObject *args) {
PyObject *points;
int oriented = 0;
if (PyArg_ParseTuple(args, "O|i", &points, &oriented) && isPoints(points)) {
float area = CV::contourArea(toPoints(points), oriented);
return toPyObj(area);
if (PyArg_ParseTuple(args, "O|i", &points, &oriented) && isVar(points)) {
float res = CV::contourArea(toVar(points), oriented);
return toPyObj(res);
}
PyMNN_ERROR("contourArea require args: ([float], |bool)");
PyMNN_ERROR("contourArea require args: (Var, |bool)");
}
static PyObject* PyMNNCV_convexHull(PyObject *self, PyObject *args) {
PyObject *points;
int clockwise = 0, returnPoints = 1;
if (PyArg_ParseTuple(args, "O|ii", &points, &clockwise, &returnPoints) && isPoints(points)) {
return toPyObj(CV::convexHull(toPoints(points), clockwise, returnPoints));
if (PyArg_ParseTuple(args, "O|ii", &points, &clockwise, &returnPoints) && isVar(points)) {
auto res = CV::convexHull(toVar(points), clockwise, returnPoints);
if (returnPoints) {
int npoints = res.size() / 2;
return toPyObj(Express::_Const(res.data(), { npoints, 1, 2 }, NHWC, halide_type_of<int>()));
}
return toPyObj(res);
}
PyMNN_ERROR("convexHull require args: ([float], |bool, bool)");
PyMNN_ERROR("convexHull require args: (Var, |bool, bool)");
}
static PyObject* PyMNNCV_minAreaRect(PyObject *self, PyObject *args) {
PyObject *points;
if (PyArg_ParseTuple(args, "O", &points) && isPoints(points)) {
auto rect = CV::minAreaRect(toPoints(points));
if (PyArg_ParseTuple(args, "O", &points) && isVar(points)) {
auto rect = CV::minAreaRect(toVar(points));
PyObject* center = PyTuple_New(2);
PyTuple_SetItem(center, 0, toPyObj(rect.center.x));
PyTuple_SetItem(center, 1, toPyObj(rect.center.y));
@ -472,16 +493,16 @@ static PyObject* PyMNNCV_minAreaRect(PyObject *self, PyObject *args) {
PyTuple_SetItem(obj, 2, toPyObj(rect.angle));
return obj;
}
PyMNN_ERROR("minAreaRect require args: ([float])");
PyMNN_ERROR("minAreaRect require args: (Var)");
}
static PyObject* PyMNNCV_boundingRect(PyObject *self, PyObject *args) {
PyObject *points;
if (PyArg_ParseTuple(args, "O", &points) && isPoints(points)) {
auto rect = CV::boundingRect(toPoints(points));
if (PyArg_ParseTuple(args, "O", &points) && isVar(points)) {
auto rect = CV::boundingRect(toVar(points));
std::vector<int> res { rect.x, rect.y, rect.width, rect.height };
return toPyObj(res);
}
PyMNN_ERROR("boundingRect require args: ([float])");
PyMNN_ERROR("boundingRect require args: (Var)");
}
static PyObject* PyMNNCV_connectedComponentsWithStats(PyObject *self, PyObject *args) {
PyObject *image;
@ -518,17 +539,106 @@ static PyObject* PyMNNCV_boxPoints(PyObject *self, PyObject *args) {
error_:
PyMNN_ERROR("boxPoints require args: [(float, (float, float), (float, float))])");
}
// draw
static bool isColor(PyObject* obj) {
return (isInts(obj) && (toInts(obj).size() == 3 || toInts(obj).size() == 4));
}
CV::Scalar toColor(PyObject* obj) {
auto vals = toInts(obj);
if (vals.size() == 3) {
return CV::Scalar(vals[0], vals[1], vals[2]);
}
if (vals.size() == 4) {
return CV::Scalar(vals[0], vals[1], vals[2], vals[3]);
}
return CV::Scalar(255, 255, 255);
}
static PyObject* PyMNNCV_line(PyObject *self, PyObject *args) {
PyObject *img, *pt1, *pt2, *color, *linetype = toPyObj(CV::LINE_8);
int thickness = 1, shift = 0;
if (PyArg_ParseTuple(args, "OOOO|iOi", &img, &pt1, &pt2, &color, &thickness, &linetype, &shift)
&& isVar(img) && isPoint(pt1) && isPoint(pt2) && isColor(color) && isLineTypes(linetype)) {
auto image = toVar(img);
CV::line(image, toPoint(pt1), toPoint(pt2), toColor(color),
thickness, toEnum<CV::LineTypes>(linetype), shift);
Py_RETURN_NONE;
}
PyMNN_ERROR("line require args: (Var, Point, Point, Color, |int, LineType, int)");
}
static PyObject* PyMNNCV_arrowedLine(PyObject *self, PyObject *args) {
PyObject *img, *pt1, *pt2, *color, *linetype = toPyObj(CV::LINE_8);
int thickness = 1, shift = 0;
float tipLength = 0.1;
if (PyArg_ParseTuple(args, "OOOO|iOif", &img, &pt1, &pt2, &color, &thickness, &linetype, &shift, &tipLength)
&& isVar(img) && isPoint(pt1) && isPoint(pt2) && isColor(color) && isLineTypes(linetype)) {
auto image = toVar(img);
CV::arrowedLine(image, toPoint(pt1), toPoint(pt2), toColor(color),
thickness, toEnum<CV::LineTypes>(linetype), shift, tipLength);
Py_RETURN_NONE;
}
PyMNN_ERROR("arrowedLine require args: (Var, Point, Point, Color, |int, LineType, int, float)");
}
static PyObject* PyMNNCV_circle(PyObject *self, PyObject *args) {
PyObject *img, *center, *color, *linetype = toPyObj(CV::LINE_8);
int radius, thickness = 1, shift = 0;
if (PyArg_ParseTuple(args, "OOiO|iOi", &img, &center, &radius, &color, &thickness, &linetype, &shift)
&& isVar(img) && isPoint(center) && isColor(color) && isLineTypes(linetype)) {
auto image = toVar(img);
CV::circle(image, toPoint(center), radius, toColor(color),
thickness, toEnum<CV::LineTypes>(linetype), shift);
Py_RETURN_NONE;
}
PyMNN_ERROR("circle require args: (Var, Point, int, Color, |int, LineType, int)");
}
static PyObject* PyMNNCV_rectangle(PyObject *self, PyObject *args) {
PyObject *img, *pt1, *pt2, *color, *linetype = toPyObj(CV::LINE_8);
int thickness = 1, shift = 0;
if (PyArg_ParseTuple(args, "OOOO|iOi", &img, &pt1, &pt2, &color, &thickness, &linetype, &shift)
&& isVar(img) && isPoint(pt1) && isPoint(pt2) && isColor(color) && isLineTypes(linetype)) {
auto image = toVar(img);
CV::rectangle(image, toPoint(pt1), toPoint(pt2), toColor(color),
thickness, toEnum<CV::LineTypes>(linetype), shift);
Py_RETURN_NONE;
}
PyMNN_ERROR("rectangle require args: (Var, Point, Point, Color, |int, LineType, int)");
}
static PyObject* PyMNNCV_drawContours(PyObject *self, PyObject *args) {
PyObject *img, *contours, *color, *linetype = toPyObj(CV::LINE_8);
int contourIdx, thickness = 1;
if (PyArg_ParseTuple(args, "OOiO|iO", &img, &contours, &contourIdx, &color, &thickness, &linetype)
&& isVar(img) && isVec<isPoints>(contours) && isColor(color) && isLineTypes(linetype)) {
auto image = toVar(img);
CV::drawContours(image, toVec<std::vector<CV::Point>, toPoints>(contours), contourIdx, toColor(color),
thickness, toEnum<CV::LineTypes>(linetype));
Py_RETURN_NONE;
}
PyMNN_ERROR("drawContours require args: (Var, [Points], int, Color, |int, LineType)");
}
static PyObject* PyMNNCV_fillPoly(PyObject *self, PyObject *args) {
PyObject *img, *contours, *color, *linetype = toPyObj(CV::LINE_8), *offset = toPyObj(std::vector<float>{0, 0});
int shift = 0;
if (PyArg_ParseTuple(args, "OOO|OiO", &img, &contours, &color, &linetype, &shift, &offset)
&& isVar(img) && isVec<isPoints>(contours) && isColor(color) && isLineTypes(linetype) && isPoint(offset)) {
auto image = toVar(img);
CV::fillPoly(image, toVec<std::vector<CV::Point>, toPoints>(contours), toColor(color),
toEnum<CV::LineTypes>(linetype), shift, toPoint(offset));
Py_RETURN_NONE;
}
PyMNN_ERROR("fillPoly require args: (Var, [Points], Color, |LineType, int, Point)");
}
static PyMethodDef PyMNNCV_methods[] = {
register_methods(CV,
#ifdef PYMNN_IMGCODECS
register_methods(CV,
// imgcodecs
haveImageReader, "haveImageReader",
haveImageWriter, "haveImageWriter",
imdecode, "imdecode",
imencode, "imencode",
imread, "imread",
imwrite, "imwrite",
imwrite, "imwrite"
)
#endif
register_methods(CV,
// color
cvtColor, "cvtColor.",
cvtColorTwoPlane, "cvtColorTwoPlane.",
@ -569,6 +679,13 @@ static PyMethodDef PyMNNCV_methods[] = {
minAreaRect, "minAreaRect",
boundingRect, "boundingRect",
connectedComponentsWithStats, "connectedComponentsWithStats",
boxPoints, "boxPoints"
boxPoints, "boxPoints",
// draw
line, "line",
arrowedLine, "arrowedLine",
circle, "circle",
rectangle, "rectangle",
drawContours, "drawContours",
fillPoly, "fillPoly"
)
};

View File

@ -63,6 +63,7 @@ def_enum(PrecisionMode, PrecisionMode,
typedef struct {
PyObject_HEAD
VARP* var;
int iter_index;
} PyMNNVar;
static PyObject* PyMNNVar_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
static void PyMNNVar_dealloc(PyMNNVar *self);
@ -137,6 +138,9 @@ static PyObject* PyMNNVar_negative(PyObject*);
static PyObject* PyMNNVar_absolute(PyObject*);
static Py_ssize_t PyMNNVar_length(PyObject*);
static PyObject* PyMNNVar_subscript(PyObject*, PyObject*);
static int PyMNNVar_ass_subscript(PyObject*, PyObject*, PyObject*);
static PyObject* PyMNNVar_iter(PyObject*);
static PyObject* PyMNNVar_iternext(PyObject*);
#if PY_MAJOR_VERSION >= 3
static PyNumberMethods PyMNNVar_as_number = {
PyMNNVar_add, /*nb_add*/
@ -220,9 +224,9 @@ static PyNumberMethods PyMNNVar_as_number = {
};
#endif
static PyMappingMethods PyMNNVar_as_mapping = {
PyMNNVar_length, /*mp_length*/
PyMNNVar_subscript, /*mp_subscript*/
0, /*mp_ass_subscript*/
PyMNNVar_length, /*mp_length*/
PyMNNVar_subscript, /*mp_subscript*/
PyMNNVar_ass_subscript, /*mp_ass_subscript*/
};
PyObject *PyMNNVar_richcompare(PyObject *self, PyObject *other, int op);
static PyTypeObject PyMNNVarType = {
@ -256,8 +260,8 @@ static PyTypeObject PyMNNVarType = {
0, /*tp_clear*/
&PyMNNVar_richcompare, /*tp_richcompare*/
0, /*tp_weaklistoffset*/
0, /*tp_iter*/
0, /*tp_iternext*/
&PyMNNVar_iter, /*tp_iter*/
&PyMNNVar_iternext, /*tp_iternext*/
PyMNNVar_methods, /*tp_methods*/
0, /*tp_members*/
PyMNNVar_getsetters, /*tp_getset*/
@ -272,7 +276,7 @@ static PyTypeObject PyMNNVarType = {
};
// helper functions
static PyMNNVar* getVar() {
PyMNNVar *var = (PyMNNVar *)PyObject_Call((PyObject*)&PyMNNVarType, PyTuple_New(0), NULL);
PyMNNVar *var = (PyMNNVar *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNNVarType), PyTuple_New(0), NULL);
var->var = new VARP;
return var;
}
@ -284,7 +288,7 @@ static PyObject* toPyObj(VARP var) {
static bool isVar(PyObject* var) {
return isInt(var) || isInts(var) ||
isFloat(var) || isFloats(var) ||
PyObject_IsInstance(var, (PyObject*)&PyMNNVarType);
Py_TYPE(var) == PyType_FindTLSType(&PyMNNVarType);
}
static bool isVars(PyObject* var) {
return isVec<isVar>(var);
@ -353,21 +357,30 @@ std::pair<VARP, VARP> toVarPair(PyObject* l, PyObject* r, bool fp = false) {
PyObject *PyMNNVar_richcompare(PyObject *l, PyObject *r, int op) {
auto lr = toVarPair(l, r);
auto vl = lr.first, vr = lr.second;
VARP res;
switch (op) {
case Py_LT:
return toPyObj(Express::_Less(vl, vr));
res = Express::_Less(vl, vr);
break;
case Py_LE:
return toPyObj(Express::_LessEqual(vl, vr));
res = Express::_LessEqual(vl, vr);
break;
case Py_EQ:
return toPyObj(Express::_Equal(vl, vr));
res = Express::_Equal(vl, vr);
break;
case Py_NE:
return toPyObj(Express::_NotEqual(vl, vr));
res = Express::_NotEqual(vl, vr);
break;
case Py_GT:
return toPyObj(Express::_Greater(vl, vr));
res = Express::_Greater(vl, vr);
break;
case Py_GE:
return toPyObj(Express::_GreaterEqual(vl, vr));
res = Express::_GreaterEqual(vl, vr);
break;
default:
Py_RETURN_NONE;
}
Py_RETURN_NONE;
return toPyObj(res);
}
static PyObject* PyMNNVar_add(PyObject* l, PyObject* r) {
auto lr = toVarPair(l, r);
@ -413,11 +426,10 @@ static Py_ssize_t PyMNNVar_length(PyObject* x) {
}
return size;
}
static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) {
std::vector<int> begin, end, strides;
int new_axis_mask = 0, shrink_axis_mask = 0,
begin_mask = 0, end_mask = 0,
ellipsis_mask = 0, index = 0;
static void dealSlice(PyObject* slice, std::vector<int>& begin, std::vector<int>& end, std::vector<int>& strides,
int& new_axis_mask, int& shrink_axis_mask, int& begin_mask, int& end_mask, int& ellipsis_mask) {
int index = 0;
auto dealItem = [&](PyObject* item) {
if (PySlice_Check(item)) {
Py_ssize_t startl = 0, stopl = 0, stepl = 1;
@ -437,7 +449,7 @@ static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) {
if ((step == 1 && start == 0) || (step == -1 && start == -1)) {
begin_mask |= (1 << index);
}
if ((step == 1 && stop == -1) || (step == -1 && stop == 0)) {
if ((step == 1 && stop == -1) || (step == -1 && stop == 0) || PY_SSIZE_T_MAX == stopl) {
end_mask |= (1 << index);
}
}
@ -471,16 +483,136 @@ static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) {
} else {
dealItem(slice);
}
}
static inline bool isIdx(PyObject* slice) {
return Py_TYPE(slice) == PyType_FindTLSType(&PyMNNVarType) || (PyList_Check(slice) && isInts(slice));
}
static bool isBoolIdx(VARP idx, int reqSize) {
auto size = idx->getInfo()->size;
bool isbool = (size == reqSize);
if (isbool) {
auto ptr = idx->readMap<int>();
for (int i = 0; i < size; i++) {
if (ptr[i] != 0 && ptr[i] != 1) {
return false;
}
}
}
return isbool;
}
static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) {
    // gather: 1. boolean (0/1) mask gather; 2. integer index gather
if (isIdx(slice)) {
auto val = toVar(x);
auto idx = toVar(slice);
if (val->getInfo()->size > 1 && isBoolIdx(idx, val->getInfo()->size)) {
// 0-1 gather -> idx gather
idx = Express::_Where(idx);
val = Express::_GatherND(val, idx);
val = Express::_Reshape(val, {-1});
return toPyObj(val);
}
auto r = Express::_Gather(val, idx);
r->readMap<void>();
return toPyObj(r);
}
std::vector<int> begin, end, strides;
int new_axis_mask = 0, shrink_axis_mask = 0, begin_mask = 0, end_mask = 0, ellipsis_mask = 0;
dealSlice(slice, begin, end, strides, new_axis_mask, shrink_axis_mask, begin_mask, end_mask, ellipsis_mask);
int size_ = static_cast<int>(begin.size());
auto begin_ = Express::_Const(begin.data(), {size_}, NHWC, halide_type_of<int>());
auto end_ = Express::_Const(end.data(), {size_}, NHWC, halide_type_of<int>());
auto strides_ = Express::_Const(strides.data(), {size_}, NHWC, halide_type_of<int>());
return toPyObj(Express::_StridedSlice(toVar(x), begin_, end_, strides_, begin_mask, end_mask,
ellipsis_mask, new_axis_mask, shrink_axis_mask));
auto res = Express::_StridedSlice(toVar(x), begin_, end_, strides_, begin_mask, end_mask,
ellipsis_mask, new_axis_mask, shrink_axis_mask);
auto info = res->getInfo();
if (!info) {
PyMNN_ERROR("subscript: unable to get variable info");
}
// to scalar
if (info->dim.empty()) {
auto dtype = info->type;
if (dtype == halide_type_of<float>()) {
return toPyObj(res->readMap<float>()[0]);
}
if (dtype == halide_type_of<int>()) {
return toPyObj(res->readMap<int>()[0]);
}
if (dtype == halide_type_of<uint8_t>()) {
return toPyObj(res->readMap<uint8_t>()[0]);
}
if (dtype == halide_type_of<double>()) {
return toPyObj((float)res->readMap<double>()[0]);
}
}
return toPyObj(res);
}
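A hedged sketch of what the new __getitem__ paths accept, mirroring the indexing tests added further down: integer/Var index gather, 0/1 boolean-mask gather, and unwrapping of 0-d results into Python scalars.

import MNN.numpy as mp

x = mp.array([[4., 6., 5.], [6., 3., 3.]])
x[0, 1]             # 0-d strided-slice result is unwrapped to a Python scalar
x[::-1]             # plain strided slice; negative step sets the begin/end masks
x[x > 2]            # 0/1 mask Var: _Where turns it into indices, then _GatherND
x[mp.array([1])]    # integer-index Var (or int list) goes through _Gather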
static int PyMNNVar_ass_subscript(PyObject* x, PyObject* slice, PyObject* y) {
if (!isVar(x) || !isVar(y)) {
PyMNN_ERROR_LOG("ass_subscript require args: (Var, int/Var, int/float/Var)");
return -1;
}
auto var = toVar(x);
auto val = toVar(y);
auto varInfo = var->getInfo();
if (isIdx(slice)) {
auto idx = toVar(slice);
if (isBoolIdx(idx, varInfo->size)) {
idx = Express::_Where(idx);
}
auto idxDim = idx->getInfo()->dim;
int scatterNum = idxDim[0], scatterDim = 1;
if (idxDim.size() < 2) {
idx = Express::_Unsqueeze(idx, {-1});
} else {
scatterDim = idxDim[1];
}
// val broadcast_to [scatterNum, (scatterDim < varDim.size() ? varDim[scatterDim:] : 1)]
auto varDim = varInfo->dim;
std::vector<int> valDim(1, scatterNum);
if (scatterDim >= varDim.size()) {
valDim.push_back(1);
} else {
for (int i = scatterDim; i < varDim.size(); i++) {
valDim.push_back(varDim[i]);
}
}
val = Express::_BroadcastTo(val, _Const(valDim.data(), {static_cast<int>(valDim.size())}, NCHW, halide_type_of<int32_t>()));
*(((PyMNNVar*)x)->var) = Express::_ScatterNd(idx, val, Express::_Shape(var), var);
return 0;
}
std::vector<int> begin, end, strides;
int new_axis_mask = 0, shrink_axis_mask = 0, begin_mask = 0, end_mask = 0, ellipsis_mask = 0;
dealSlice(slice, begin, end, strides, new_axis_mask, shrink_axis_mask, begin_mask, end_mask, ellipsis_mask);
int size_ = static_cast<int>(begin.size());
auto begin_ = Express::_Const(begin.data(), {size_}, NHWC, halide_type_of<int>());
auto end_ = Express::_Const(end.data(), {size_}, NHWC, halide_type_of<int>());
auto strides_ = Express::_Const(strides.data(), {size_}, NHWC, halide_type_of<int>());
*(((PyMNNVar*)x)->var) = Express::_StridedSliceWrite(var, begin_, end_, strides_, val, begin_mask, end_mask,
ellipsis_mask, new_axis_mask, shrink_axis_mask);
return 0;
}
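And a corresponding hedged sketch for the new __setitem__ path: index assignment is lowered to _ScatterNd after broadcasting the value, slice assignment to _StridedSliceWrite.

x = mp.zeros([3, 3])
x[mp.array([1, 2])] = 5.0        # index write: value broadcast, then _ScatterNd
x[0] = mp.array([7., 8., 9.])    # slice write: lowered to _StridedSliceWrite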
static PyObject* PyMNNVar_iter(PyObject *self) {
auto var = toVar(self);
if (var->getInfo()->dim.empty()) {
PyMNN_ERROR("iteration over a 0-d array");
}
Py_INCREF(self);
return self;
}
static PyObject* PyMNNVar_iternext(PyObject *self) {
auto idx = ((PyMNNVar*)self)->iter_index++;
auto var = toVar(self);
    auto count = var->getInfo()->dim[0];
    if (idx >= count) return NULL;
return toPyObj(Express::_Gather(var, Express::_Scalar<int>(idx)));
}
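With tp_iter/tp_iternext wired up, a Var is now iterable along its first dimension (hedged sketch):

x = mp.array([[1, 2], [3, 4]])
for row in x:        # each step gathers row iter_index until dim[0] is reached
    print(row)       # row is itself a Var of shape [2]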
// PyMNNVar basic functions impl
static PyObject* PyMNNVar_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
PyMNNVar* self = (PyMNNVar *)type->tp_alloc(type, 0);
self->iter_index = 0;
self->var = nullptr;
return (PyObject*)self;
}
@ -505,7 +637,7 @@ static PyObject* PyMNNVar_getshape(PyMNNVar *self, void *closure) {
if (self->var) {
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("getshape: unable to get variable info");
}
shape = toPyObj(info->dim);
}
@ -524,7 +656,7 @@ static PyObject* PyMNNVar_getdata_format(PyMNNVar *self, void *closure) {
if (self->var) {
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("getdata_format: unable to get variable info");
}
return toPyObj(info->order);
}
@ -534,7 +666,7 @@ static PyObject* PyMNNVar_getdtype(PyMNNVar *self, void *closure) {
if (self->var) {
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("getdtype: unable to get variable info");
}
return toPyObj(htype2dtype(info->type));
}
@ -544,7 +676,7 @@ static PyObject* PyMNNVar_getsize(PyMNNVar *self, void *closure) {
if (self->var) {
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("getsize: unable to get variable info");
}
return toPyObj(info->size);
}
@ -564,7 +696,7 @@ PyObject *ndim = NULL;
if (self->var) {
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("getndim: unable to get variable info");
}
ndim = toPyObj((int)info->dim.size());
}
@ -685,13 +817,16 @@ static PyObject* PyMNNVar_resize(PyMNNVar *self, PyObject *args) {
static PyObject* PyMNNVar_read(PyMNNVar *self, PyObject *args) {
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("read: unable to get variable info");
}
auto dtype = htype2dtype(info->type);
auto shape = info->dim;
int64_t total_length = info->size;
auto readptr = [self](DType dtype, INTS shape, int64_t total_length) {
void *dataPtr = (void *) (*(self->var))->readMap<void>();
if (nullptr == dataPtr) {
PyMNN_ERROR("call to readMap meet a error");
}
std::vector<npy_intp> npy_dims;
for(const auto dim : shape) {
npy_dims.push_back(dim);
@ -710,9 +845,6 @@ static PyObject* PyMNNVar_read(PyMNNVar *self, PyObject *args) {
default:
PyMNN_ERROR("does not support this dtype");
}
if (nullptr == dataPtr) {
PyMNN_ERROR("call to readMap meet a error");
}
};
auto data = readptr(dtype, shape, total_length);
(*(self->var))->unMap();
@ -722,13 +854,16 @@ static PyObject* PyMNNVar_read(PyMNNVar *self, PyObject *args) {
static PyObject* PyMNNVar_read_as_tuple(PyMNNVar *self, PyObject *args) {
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("read_as_tuple: unable to get variable info");
}
auto dtype = htype2dtype(info->type);
auto shape = info->dim;
size_t total_length = info->size;
auto readptr = [self](DType dtype, INTS shape, size_t total_length) {
void *dataPtr = (void *) (*(self->var))->readMap<void>();
if (nullptr == dataPtr) {
PyMNN_ERROR("call to readMap meet a error");
}
auto obj = PyTuple_New(total_length);
if(DType_FLOAT == dtype) {
auto data = (float*)dataPtr;
@ -766,7 +901,7 @@ static PyObject* PyMNNVar_write(PyMNNVar *self, PyObject *args) {
}
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("write: unable to get variable info");
}
auto dtype = htype2dtype(info->type);
int64_t total_length = info->size;
@ -1042,11 +1177,15 @@ static PyObject* PyMNNExpr_const(PyObject *self, PyObject *args, PyObject *kwarg
total_length *= shape[i];
}
}
auto data = toPtr(value, dtype, total_length);
auto ret = getVar();
if(data) {
*(ret->var) = _Const((const void*)data, shape, data_format, dtype2htype(dtype));
free(data);
if (total_length > 0) {
auto data = toPtr(value, dtype, total_length);
if(data) {
*(ret->var) = _Const((const void*)data, shape, data_format, dtype2htype(dtype));
free(data);
}
} else {
*(ret->var) = _Const(nullptr, shape, data_format, dtype2htype(dtype));
}
return (PyObject *)ret;
}
@ -1332,6 +1471,32 @@ static PyObject* PyMNNExpr_randomuniform(PyObject *self, PyObject *args) {
}
PyMNN_ERROR("randomuniform require args: (Var, dtype, |float, float, int, int)");
}
static PyObject* PyMNNExpr_sort(PyObject *self, PyObject *args) {
PyObject *x;
    int axis = -1, arg = 0, descend = 0;
if (PyArg_ParseTuple(args, "O|iii", &x, &axis, &arg, &descend) && isVar(x)) {
return toPyObj(Express::_Sort(toVar(x), axis, arg, descend));
}
PyMNN_ERROR("sort require args: (Var, |int, bool, bool)");
}
static PyObject* PyMNNExpr_raster(PyObject *self, PyObject *args) {
PyObject *var, *region, *shape;
if (PyArg_ParseTuple(args, "OOO", &var, &region, &shape) &&
isVars(var) && isInts(region) && isInts(shape)) {
return toPyObj(Express::_Raster(toVars(var), toInts(region), toInts(shape)));
}
PyMNN_ERROR("raster require args: ([Var], [int], [int])");
}
static PyObject* PyMNNExpr_nms(PyObject *self, PyObject *args) {
PyObject *boxes, *scores;
int max_detections;
float iou_threshold = -1.0, score_threshold = -1.0;
if (PyArg_ParseTuple(args, "OOi|ff", &boxes, &scores, &max_detections, &iou_threshold, &score_threshold) &&
isVar(boxes) && isVar(scores)) {
return toPyObj(Express::_Nms(toVar(boxes), toVar(scores), max_detections, iou_threshold, score_threshold));
}
PyMNN_ERROR("nms require args: (Var, Var, |float, float)");
}
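A hedged usage sketch for the three newly bound exprs; the sort and raster calls are copied from the unit tests added below, while the nms boxes/scores are made-up placeholders:

import MNN.expr as expr
import MNN.numpy as mp

expr.sort(mp.array([5, -1, 2, 0]))                            # ascending sort
x = mp.array([[1, 2], [3, 4]])
expr.raster([x], [0, 1, 1, 2, 0, 1, 2, 1, 1, 2, 2], [2, 2])   # raw Raster region (transpose)
boxes  = mp.array([[0., 0., 10., 10.], [1., 1., 11., 11.]])
scores = mp.array([0.9, 0.8])
expr.nms(boxes, scores, 1)                                    # keep at most one box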
static PyObject* PyMNNExpr_detection_post_process(PyObject *self, PyObject *args) {
PyObject *encode_boxes, *class_predictions, *anchors, *centersize_encoding;
int num_classes, max_detections, max_class_per_detection, detections_per_class;
@ -1508,6 +1673,9 @@ static PyMethodDef PyMNNExpr_methods[] = {
zeros_like, "build zeros_like expr",
unstack, "build unstack expr",
range, "build range expr",
sort, "build sort expr",
raster, "build raster expr",
nms, "build nms expr",
detection_post_process, "build detection_post_process expr"
)
};

View File

@ -1,4 +1,10 @@
#include "util.h"
#ifdef PYMNN_INTERNAL_SERVING
#include <MNN/AutoTime.hpp>
#include <MNN/MNNForwardType.h>
#include "internal/monitor_service.h"
#include "internal/verify_service.h"
#endif
// NN Module Start
def_class_start(_Module, Module)
@ -19,6 +25,37 @@ def_class_methods(_Module,
_add_parameter, "add parameter"
)
def_class_end(_Module, Module)
static PyObject* load_module(PyObject *inputs, PyObject *outputs, PyObject *backend, PyObject *memory_mode,
PyObject *power_mode, PyObject *precision_mode, const char* file_name, int dynamic,
int shape_mutable, int rearrange, int thread_num) {
BackendConfig backend_config;
backend_config.memory = toEnum<MemoryMode>(memory_mode);
backend_config.power = toEnum<PowerMode>(power_mode);
backend_config.precision = toEnum<PrecisionMode>(precision_mode);
Module::BackendInfo backend_info;
backend_info.type = toEnum<MNNForwardType>(backend);
backend_info.config = &backend_config;
Module::Config config;
config.dynamic = dynamic;
config.shapeMutable = shape_mutable;
config.rearrange = rearrange;
config.backend = &backend_info;
auto converted_file_name = convertBytesEncodeIfNeed(file_name);
auto m_ptr = Module::load(toStrings(inputs), toStrings(outputs), converted_file_name.data(), &config);
if (m_ptr == nullptr) {
std::string mnn_errno = "load_module_from_file failed ";
mnn_errno = mnn_errno + std::string(file_name);
PyErr_SetString(PyExc_Exception, mnn_errno.c_str());
}
return toPyObj(m_ptr);
}
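The helper above is now shared by both loaders. From Python the call that eventually reaches it looks roughly like this (hedged: the exact wrapper signature lives in the pymnn package, and the tensor names/model path are placeholders):

import MNN.nn as nn
import MNN.numpy as mp

net = nn.load_module_from_file(["input"], ["output"], "model.mnn")
out = net.forward(mp.zeros([1, 3, 224, 224]))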
static PyObject* PyMNN_Module_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
PyMNN_Module *self = (PyMNN_Module *)type->tp_alloc(type, 0);
self->ptr = Module::createEmpty({});
@ -50,10 +87,31 @@ static PyObject* PyMNN_Module_forward(PyMNN_Module *self, PyObject *args) {
Py_RETURN_NONE;
}
if (isVars(input)) {
#ifdef PYMNN_INTERNAL_SERVING
int status = 0;
Timer timer;
auto vars = self->ptr->onForward(toVars(input));
if (vars.empty()) {
PyMNN_ERROR("module onForward occur error.");
status = -1;
}
(void) MonitorService::GetInstance().EventTrack(self->ptr, timer, status, "PyMNN_Module_forward");
return toPyObj<VARP, toPyObj>(vars);
#else
return toPyObj<VARP, toPyObj>(self->ptr->onForward(toVars(input)));
#endif
}
if (isVar(input)) {
#ifdef PYMNN_INTERNAL_SERVING
int status = 0;
Timer timer;
auto var = self->ptr->forward(toVar(input));
(void) MonitorService::GetInstance().EventTrack(self->ptr, timer, status, "PyMNN_Module_forward");
return toPyObj(var);
#else
return toPyObj(self->ptr->forward(toVar(input)));
#endif
}
PyMNN_ERROR("PyMNN_Module_forward: args must be Var/[Var].");
}
@ -62,8 +120,22 @@ static PyObject* PyMNN_Module_onForward(PyMNN_Module *self, PyObject *args) {
if (!PyArg_ParseTuple(args, "O", &inputs)) {
Py_RETURN_NONE;
}
#ifdef PYMNN_INTERNAL_SERVING
int status = 0;
Timer timer;
auto vars = self->ptr->onForward(toVars(inputs));
if (vars.empty()) {
PyMNN_ERROR("module onForward occur error.");
status = -1;
}
(void) MonitorService::GetInstance().EventTrack(self->ptr, timer, status, "PyMNN_Module_onForward");
return toPyObj<VARP, toPyObj>(vars);
#else
return toPyObj<VARP, toPyObj>(self->ptr->onForward(toVars(inputs)));
#endif
}
static PyObject* PyMNN_Module_set_name(PyMNN_Module *self, PyObject *args) {
const char* name;
if (!PyArg_ParseTuple(args, "s", &name)) {
@ -125,6 +197,11 @@ static PyObject* PyMNNNN_load_module(PyObject *self, PyObject *args) {
return toPyObj(m);
}
static PyObject* PyMNNNN_load_module_from_file(PyObject *self, PyObject *args) {
#ifdef PYMNN_INTERNAL_SERVING
PyErr_SetString(PyExc_Exception,
"PyMNNNN_load_module_from_file: unsupported interface, should use load_module_from_file_with_token.");
return NULL;
#endif
PyObject *inputs, *outputs, *backend, *memory_mode, *power_mode, *precision_mode;
const char* file_name;
int dynamic, shape_mutable, rearrange;
@ -135,30 +212,54 @@ static PyObject* PyMNNNN_load_module_from_file(PyObject *self, PyObject *args) {
printf("PyArg_ParseTuple Error\n");
return NULL;
}
BackendConfig backend_config;
backend_config.memory = toEnum<MemoryMode>(memory_mode);
backend_config.power = toEnum<PowerMode>(power_mode);
backend_config.precision = toEnum<PrecisionMode>(precision_mode);
Module::BackendInfo backend_info;
backend_info.type = toEnum<MNNForwardType>(backend);
backend_info.config = &backend_config;
Module::Config config;
config.dynamic = dynamic;
config.shapeMutable = shape_mutable;
config.rearrange = rearrange;
config.backend = &backend_info;
auto converted_file_name = convertBytesEncodeIfNeed(file_name);
auto m_ptr = Module::load(toStrings(inputs), toStrings(outputs), converted_file_name.data(), &config);
if (m_ptr == nullptr) {
std::string mnn_errno = "load_module_from_file failed ";
mnn_errno = mnn_errno + std::string(file_name);
PyErr_SetString(PyExc_Exception, mnn_errno.c_str());
}
return toPyObj(m_ptr);
return load_module(inputs, outputs, backend, memory_mode, power_mode, precision_mode, file_name, dynamic,
shape_mutable, rearrange, thread_num);
}
#ifdef PYMNN_INTERNAL_SERVING
static PyObject* PyMNNNN_load_module_from_file_with_token(PyObject *self, PyObject *args) {
PyObject *inputs, *outputs;
const char* file_name;
PyObject *backend = toPyObj(MNN_FORWARD_CPU);
PyObject *memory_mode = toPyObj(MemoryMode::Memory_Normal);
    PyObject *power_mode = toPyObj(PowerMode::Power_Normal);
    PyObject *precision_mode = toPyObj(PrecisionMode::Precision_Normal);
int dynamic = 0;
int shape_mutable = 0;
int rearrange = 0;
char *token = NULL;
char *scene = NULL;
char *app_key = NULL;
int thread_num = 1;
if (!PyArg_ParseTuple(args, "OOssss|iiiOOOOi", &inputs, &outputs, &file_name, &token, &scene, &app_key, &dynamic,
&shape_mutable, &rearrange, &backend, &memory_mode, &power_mode, &precision_mode,
&thread_num)) {
printf("PyArg_ParseTuple Error\n");
return NULL;
}
if (!token || !scene || !app_key) {
PyErr_SetString(PyExc_Exception,
"PyMNNNN_load_module_from_file_with_token: input invalid, token, scene or app_key is null.");
return NULL;
}
MonitorService::GetInstance().Start();
VerifyService::GetInstance().Start();
bool ret = VerifyService::GetInstance().VerifyToken(std::string(token), std::string(scene), std::string(app_key));
if (!ret) {
PyErr_SetString(PyExc_Exception,
"PyMNNNN_load_module_from_file_with_token: check token failed, return null module.");
return NULL;
}
return load_module(inputs, outputs, backend, memory_mode, power_mode, precision_mode, file_name, dynamic,
shape_mutable, rearrange, thread_num);
}
#endif
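For PYMNN_INTERNAL_SERVING builds the token-gated variant above is the only supported loader; a hedged sketch of the expected call, with placeholder credentials:

m = nn.load_module_from_file_with_token(["input"], ["output"], "model.mnn",
                                        "token", "scene", "app_key")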
#ifdef PYMNN_TRAIN_API
static PyObject* PyMNNNN_conv(PyObject *self, PyObject *args) {
INTS default_1 = {1, 1}, default_0 = {0, 0};
@ -221,10 +322,18 @@ static PyObject* PyMNNNN_dropout(PyObject *self, PyObject *args) {
}
#endif
static PyMethodDef PyMNNNN_methods[] = {
#ifdef PYMNN_INTERNAL_SERVING
register_methods(NN,
load_module, "load_module([Var], [Var], bool)",
load_module_from_file_with_token, "load_module_from_file_with_token([string], [string], filename, bool, ...)",
load_module_from_file, "load_module_from_file([string], [string], filename, bool, ...)"
)
#else
register_methods(NN,
load_module, "load_module([Var], [Var], bool)",
load_module_from_file, "load_module_from_file([string], [string], filename, bool, ...)"
)
#endif
#ifdef PYMNN_TRAIN_API
register_methods(NN,
conv, "conv Module",
@ -234,4 +343,4 @@ static PyMethodDef PyMNNNN_methods[] = {
)
#endif
};
// NN Module End
// NN Module End

View File

@ -225,13 +225,16 @@ inline int getnpysize(int npy_type) {
return 4;
case NPY_DOUBLE:
return 8;
case NPY_INT:
return 4;
case NPY_INT64:
return 8;
case NPY_UINT8:
return 1;
default:
        // NPY_INT (np.int) and NPY_INT32 (np.int32) may map to different enum values on some platforms.
        // Use `if` instead of extra `case` labels: where NPY_INT equals NPY_INT32, duplicate case values would not compile.
if (npy_type == NPY_INT || npy_type == NPY_INT32) {
return 4;
}
PyMNN_ERROR_LOG("does not support this npy_type");
return 0;
}
@ -249,7 +252,7 @@ inline int getitemsize(int dtype, int npy_type) {
}
return 8;
case DType_INT32:
if(npy_type != NPY_INT) {
if(npy_type != NPY_INT && npy_type != NPY_INT32) {
PyMNN_ERROR_LOG("numpy type does not match");
}
return 4;
@ -383,7 +386,7 @@ static bool isVec(PyObject* obj) {
return Func(PyList_GetItem(obj, 0));
} else return true;
}
return false;
return Func(obj);
}
static inline bool isInts(PyObject* obj) {
return isInt(obj) || isVec<isInt>(obj);
@ -438,6 +441,7 @@ static vector<T> toVec(PyObject* obj) {
}
return values;
}
values.push_back(Func(obj));
return values;
}
static inline std::vector<int> toInts(PyObject* obj) {
@ -586,188 +590,185 @@ static void* toPtr(PyObject *obj, DType dtype, int64_t& total_length, void* data
// just support COND = 0 or 1
#define arg_if(COND, THEN, ELSE) arg_concat(arg_if_, COND)(THEN, ELSE)
#define expand_item_0(...)
#define expand_item_1(macro, context, key, value, ITEMS...) \
#define expand_item_1(macro, context, key, value, ...) \
macro(context, key, value)
#define expand_item_2(macro, context, key, value, ITEMS...) \
#define expand_item_2(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_1(macro, context, ITEMS)
#define expand_item_3(macro, context, key, value, ITEMS...) \
expand_item_1(macro, context, __VA_ARGS__)
#define expand_item_3(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_2(macro, context, ITEMS)
#define expand_item_4(macro, context, key, value, ITEMS...) \
expand_item_2(macro, context, __VA_ARGS__)
#define expand_item_4(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_3(macro, context, ITEMS)
#define expand_item_5(macro, context, key, value, ITEMS...) \
expand_item_3(macro, context, __VA_ARGS__)
#define expand_item_5(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_4(macro, context, ITEMS)
#define expand_item_6(macro, context, key, value, ITEMS...) \
expand_item_4(macro, context, __VA_ARGS__)
#define expand_item_6(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_5(macro, context, ITEMS)
#define expand_item_7(macro, context, key, value, ITEMS...) \
expand_item_5(macro, context, __VA_ARGS__)
#define expand_item_7(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_6(macro, context, ITEMS)
#define expand_item_8(macro, context, key, value, ITEMS...) \
expand_item_6(macro, context, __VA_ARGS__)
#define expand_item_8(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_7(macro, context, ITEMS)
#define expand_item_9(macro, context, key, value, ITEMS...) \
expand_item_7(macro, context, __VA_ARGS__)
#define expand_item_9(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_8(macro, context, ITEMS)
#define expand_item_10(macro, context, key, value, ITEMS...) \
expand_item_8(macro, context, __VA_ARGS__)
#define expand_item_10(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_9(macro, context, ITEMS)
#define expand_item_11(macro, context, key, value, ITEMS...) \
expand_item_9(macro, context, __VA_ARGS__)
#define expand_item_11(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_10(macro, context, ITEMS)
#define expand_item_12(macro, context, key, value, ITEMS...) \
expand_item_10(macro, context, __VA_ARGS__)
#define expand_item_12(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_11(macro, context, ITEMS)
#define expand_item_13(macro, context, key, value, ITEMS...) \
expand_item_11(macro, context, __VA_ARGS__)
#define expand_item_13(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_12(macro, context, ITEMS)
#define expand_item_14(macro, context, key, value, ITEMS...) \
expand_item_12(macro, context, __VA_ARGS__)
#define expand_item_14(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_13(macro, context, ITEMS)
#define expand_item_15(macro, context, key, value, ITEMS...) \
expand_item_13(macro, context, __VA_ARGS__)
#define expand_item_15(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_14(macro, context, ITEMS)
#define expand_item_16(macro, context, key, value, ITEMS...) \
expand_item_14(macro, context, __VA_ARGS__)
#define expand_item_16(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_15(macro, context, ITEMS)
#define expand_item_17(macro, context, key, value, ITEMS...) \
expand_item_15(macro, context, __VA_ARGS__)
#define expand_item_17(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_16(macro, context, ITEMS)
#define expand_item_18(macro, context, key, value, ITEMS...) \
expand_item_16(macro, context, __VA_ARGS__)
#define expand_item_18(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_17(macro, context, ITEMS)
#define expand_item_19(macro, context, key, value, ITEMS...) \
expand_item_17(macro, context, __VA_ARGS__)
#define expand_item_19(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_18(macro, context, ITEMS)
#define expand_item_20(macro, context, key, value, ITEMS...) \
expand_item_18(macro, context, __VA_ARGS__)
#define expand_item_20(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_19(macro, context, ITEMS)
#define expand_item_21(macro, context, key, value, ITEMS...) \
expand_item_19(macro, context, __VA_ARGS__)
#define expand_item_21(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_20(macro, context, ITEMS)
#define expand_item_22(macro, context, key, value, ITEMS...) \
expand_item_20(macro, context, __VA_ARGS__)
#define expand_item_22(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_21(macro, context, ITEMS)
#define expand_item_23(macro, context, key, value, ITEMS...) \
expand_item_21(macro, context, __VA_ARGS__)
#define expand_item_23(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_22(macro, context, ITEMS)
#define expand_item_24(macro, context, key, value, ITEMS...) \
expand_item_22(macro, context, __VA_ARGS__)
#define expand_item_24(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_23(macro, context, ITEMS)
#define expand_item_24(macro, context, key, value, ITEMS...) \
expand_item_23(macro, context, __VA_ARGS__)
#define expand_item_25(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_23(macro, context, ITEMS)
#define expand_item_25(macro, context, key, value, ITEMS...) \
expand_item_24(macro, context, __VA_ARGS__)
#define expand_item_26(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_24(macro, context, ITEMS)
#define expand_item_26(macro, context, key, value, ITEMS...) \
expand_item_25(macro, context, __VA_ARGS__)
#define expand_item_27(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_25(macro, context, ITEMS)
#define expand_item_27(macro, context, key, value, ITEMS...) \
expand_item_26(macro, context, __VA_ARGS__)
#define expand_item_28(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_26(macro, context, ITEMS)
#define expand_item_28(macro, context, key, value, ITEMS...) \
expand_item_27(macro, context, __VA_ARGS__)
#define expand_item_29(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_27(macro, context, ITEMS)
#define expand_item_29(macro, context, key, value, ITEMS...) \
expand_item_28(macro, context, __VA_ARGS__)
#define expand_item_30(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_28(macro, context, ITEMS)
#define expand_item_30(macro, context, key, value, ITEMS...) \
expand_item_29(macro, context, __VA_ARGS__)
#define expand_item_31(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_29(macro, context, ITEMS)
#define expand_item_31(macro, context, key, value, ITEMS...) \
expand_item_30(macro, context, __VA_ARGS__)
#define expand_item_32(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_30(macro, context, ITEMS)
#define expand_item_32(macro, context, key, value, ITEMS...) \
expand_item_31(macro, context, __VA_ARGS__)
#define expand_item_33(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_31(macro, context, ITEMS)
#define expand_item_33(macro, context, key, value, ITEMS...) \
expand_item_32(macro, context, __VA_ARGS__)
#define expand_item_34(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_32(macro, context, ITEMS)
#define expand_item_34(macro, context, key, value, ITEMS...) \
expand_item_33(macro, context, __VA_ARGS__)
#define expand_item_35(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_33(macro, context, ITEMS)
#define expand_item_35(macro, context, key, value, ITEMS...) \
expand_item_34(macro, context, __VA_ARGS__)
#define expand_item_36(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_34(macro, context, ITEMS)
#define expand_item_36(macro, context, key, value, ITEMS...) \
expand_item_35(macro, context, __VA_ARGS__)
#define expand_item_37(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_35(macro, context, ITEMS)
#define expand_item_37(macro, context, key, value, ITEMS...) \
expand_item_36(macro, context, __VA_ARGS__)
#define expand_item_38(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_36(macro, context, ITEMS)
#define expand_item_38(macro, context, key, value, ITEMS...) \
expand_item_37(macro, context, __VA_ARGS__)
#define expand_item_39(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_37(macro, context, ITEMS)
#define expand_item_39(macro, context, key, value, ITEMS...) \
expand_item_38(macro, context, __VA_ARGS__)
#define expand_item_40(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_38(macro, context, ITEMS)
#define expand_item_40(macro, context, key, value, ITEMS...) \
expand_item_39(macro, context, __VA_ARGS__)
#define expand_item_41(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_39(macro, context, ITEMS)
#define expand_item_41(macro, context, key, value, ITEMS...) \
expand_item_40(macro, context, __VA_ARGS__)
#define expand_item_42(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_40(macro, context, ITEMS)
#define expand_item_42(macro, context, key, value, ITEMS...) \
expand_item_41(macro, context, __VA_ARGS__)
#define expand_item_43(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_41(macro, context, ITEMS)
#define expand_item_43(macro, context, key, value, ITEMS...) \
expand_item_42(macro, context, __VA_ARGS__)
#define expand_item_44(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_42(macro, context, ITEMS)
#define expand_item_44(macro, context, key, value, ITEMS...) \
expand_item_43(macro, context, __VA_ARGS__)
#define expand_item_45(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_43(macro, context, ITEMS)
#define expand_item_45(macro, context, key, value, ITEMS...) \
expand_item_44(macro, context, __VA_ARGS__)
#define expand_item_46(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_44(macro, context, ITEMS)
#define expand_item_46(macro, context, key, value, ITEMS...) \
expand_item_45(macro, context, __VA_ARGS__)
#define expand_item_47(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_45(macro, context, ITEMS)
#define expand_item_47(macro, context, key, value, ITEMS...) \
expand_item_46(macro, context, __VA_ARGS__)
#define expand_item_48(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_46(macro, context, ITEMS)
#define expand_item_48(macro, context, key, value, ITEMS...) \
expand_item_47(macro, context, __VA_ARGS__)
#define expand_item_49(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_47(macro, context, ITEMS)
#define expand_item_49(macro, context, key, value, ITEMS...) \
expand_item_48(macro, context, __VA_ARGS__)
#define expand_item_50(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_48(macro, context, ITEMS)
#define expand_item_50(macro, context, key, value, ITEMS...) \
expand_item_49(macro, context, __VA_ARGS__)
#define expand_item_51(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_49(macro, context, ITEMS)
#define expand_item_51(macro, context, key, value, ITEMS...) \
expand_item_50(macro, context, __VA_ARGS__)
#define expand_item_52(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_50(macro, context, ITEMS)
#define expand_item_52(macro, context, key, value, ITEMS...) \
expand_item_51(macro, context, __VA_ARGS__)
#define expand_item_53(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_51(macro, context, ITEMS)
#define expand_item_53(macro, context, key, value, ITEMS...) \
expand_item_52(macro, context, __VA_ARGS__)
#define expand_item_54(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_52(macro, context, ITEMS)
#define expand_item_54(macro, context, key, value, ITEMS...) \
expand_item_53(macro, context, __VA_ARGS__)
#define expand_item_55(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_53(macro, context, ITEMS)
#define expand_item_55(macro, context, key, value, ITEMS...) \
expand_item_54(macro, context, __VA_ARGS__)
#define expand_item_56(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_54(macro, context, ITEMS)
#define expand_item_56(macro, context, key, value, ITEMS...) \
expand_item_55(macro, context, __VA_ARGS__)
#define expand_item_57(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_55(macro, context, ITEMS)
#define expand_item_57(macro, context, key, value, ITEMS...) \
expand_item_56(macro, context, __VA_ARGS__)
#define expand_item_58(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_56(macro, context, ITEMS)
#define expand_item_58(macro, context, key, value, ITEMS...) \
expand_item_57(macro, context, __VA_ARGS__)
#define expand_item_59(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_57(macro, context, ITEMS)
#define expand_item_59(macro, context, key, value, ITEMS...) \
expand_item_58(macro, context, __VA_ARGS__)
#define expand_item_60(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_58(macro, context, ITEMS)
#define expand_item_60(macro, context, key, value, ITEMS...) \
macro(context, key, value) \
expand_item_59(macro, context, ITEMS)
expand_item_59(macro, context, __VA_ARGS__)
#define expand_items(macro, context, ...) \
arg_concat(expand_item_, arg_half_size(__VA_ARGS__))(macro, context, __VA_ARGS__)
//------------------------ macro_utils end -------------------------
@ -790,18 +791,6 @@ static PyObject* PyEnum_new(struct _typeobject *type, PyObject *args, PyObject *
Py_hash_t PyEnum_hash(PyObject* x) {
return static_cast<Py_hash_t>(((PyMNNEnum*)x)->value);
}
PyObject *PyEnum_richcompare(PyObject *self, PyObject *other, int op) {
int l = ((PyMNNEnum*)self)->value, r = ((PyMNNEnum*)other)->value;
switch (op) {
case Py_LT: return toPyObj(l < r);
case Py_LE: return toPyObj(l <= r);
case Py_EQ: return toPyObj(l == r);
case Py_NE: return toPyObj(l != r);
case Py_GT: return toPyObj(l > r);
case Py_GE: return toPyObj(l >= r);
}
Py_RETURN_NONE;
}
static PyObject* toPyEnum(PyObject* type, int val) {
auto args = PyTuple_New(1);
PyTuple_SetItem((PyObject*)args, 0, PyLong_FromLong((long)val));
@ -825,11 +814,11 @@ static T toEnum(PyObject* e) {
PyObject_SetAttrString(scope, value, toPyObj(key)); \
PyDict_SetItemString(dict, value, toPyObj(key));
#define def_enum_repr(NAME, ITEMS...) \
#define def_enum_repr(NAME, ...) \
static PyObject* PyEnum_##NAME##_repr(PyObject *self) { \
std::string str = #NAME "."; \
std::map<int, const char*> items = { \
expand_items(declare_map_item, _, ITEMS) \
expand_items(declare_map_item, _, __VA_ARGS__) \
}; \
int key = ((PyMNNEnum*)self)->value; \
auto iter = items.find(key); \
@ -839,22 +828,23 @@ static PyObject* PyEnum_##NAME##_repr(PyObject *self) { \
#define def_enum_to(NAME, TYPE) \
static PyObject* toPyObj(TYPE value) { \
return toPyEnum((PyObject*)&PyEnum_##NAME, static_cast<int>(value)); \
return toPyEnum((PyObject*)PyType_FindTLSType(&PyEnum_##NAME), static_cast<int>(value)); \
}
#define def_enum_register(NAME, ITEMS...) \
#define def_enum_register(NAME, ...) \
static void def_##NAME(PyObject *scope) { \
if (PyType_Ready(&PyEnum_##NAME) < 0) { \
if (PyType_Ready(PyType_FindTLSType(&PyEnum_##NAME)) < 0) { \
PyErr_SetString(PyExc_Exception, "init " #NAME ": PyType_Ready failed"); \
} \
PyObject* self = (PyObject *)&PyEnum_##NAME; \
PyObject* self = (PyObject *)PyType_FindTLSType(&PyEnum_##NAME); \
PyObject* dict = PyEnum_##NAME.tp_dict; \
PyModule_AddObject(scope, #NAME, self); \
expand_items(register_item, NAME, ITEMS) \
expand_items(register_item, NAME, __VA_ARGS__) \
}
#define def_enum(NAME, TYPE, ITEMS...) \
def_enum_repr(NAME, ITEMS) \
#define def_enum(NAME, TYPE, ...) \
def_enum_repr(NAME, __VA_ARGS__) \
PyObject *PyEnum_##NAME##richcompare(PyObject *self, PyObject *other, int op); \
static PyTypeObject PyEnum_##NAME = { \
PyVarObject_HEAD_INIT(NULL, 0) \
#NAME, /*tp_name*/\
@ -879,7 +869,7 @@ static PyTypeObject PyEnum_##NAME = { \
"PyMNNEnum", /*tp_doc*/\
0, /*tp_traverse*/\
0, /*tp_clear*/\
&PyEnum_richcompare, /*tp_richcompare*/\
&PyEnum_##NAME##richcompare, /*tp_richcompare*/\
0, /*tp_weaklistoffset*/\
0, /*tp_iter*/\
0, /*tp_iternext*/\
@ -895,9 +885,22 @@ static PyTypeObject PyEnum_##NAME = { \
0, /*tp_alloc*/\
PyEnum_new /*tp_new*/\
};\
static inline bool is##NAME(PyObject* obj) { return PyObject_IsInstance(obj, (PyObject*)&PyEnum_##NAME); } \
static inline bool is##NAME(PyObject* obj) { return Py_TYPE(obj) == PyType_FindTLSType(&PyEnum_##NAME); } \
PyObject *PyEnum_##NAME##richcompare(PyObject *self, PyObject *other, int op) { \
if (!is##NAME(other)) Py_RETURN_FALSE; \
int l = ((PyMNNEnum*)self)->value, r = ((PyMNNEnum*)other)->value; \
switch (op) { \
case Py_LT: return toPyObj(l < r); \
case Py_LE: return toPyObj(l <= r); \
case Py_EQ: return toPyObj(l == r); \
case Py_NE: return toPyObj(l != r); \
case Py_GT: return toPyObj(l > r); \
case Py_GE: return toPyObj(l >= r); \
} \
Py_RETURN_FALSE; \
} \
def_enum_to(NAME, TYPE) \
def_enum_register(NAME, ITEMS)
def_enum_register(NAME, __VA_ARGS__)
// ------------------------ enum end --------------------------
// ------------------------ func start ------------------------
#define def_methods(MODULE, NAME) \
@ -996,10 +999,10 @@ static PyObject* PyMNN##SCOPE##_##NAME(PyObject *self, PyObject *args) { \
#define def_class_register(NAME) \
static void def_##NAME(PyObject *scope) { \
if (PyType_Ready(&PyMNN##NAME##Type) < 0) { \
if (PyType_Ready(PyType_FindTLSType(&PyMNN##NAME##Type)) < 0) { \
PyErr_SetString(PyExc_Exception, "init" #NAME ": PyType_Ready PyMNN" #NAME "Type failed"); \
} \
PyObject* self = (PyObject *)&PyMNN##NAME##Type; \
PyObject* self = (PyObject *)PyType_FindTLSType(&PyMNN##NAME##Type); \
PyModule_AddObject(scope, #NAME, self); \
}
@ -1071,7 +1074,7 @@ static PyTypeObject PyMNN##NAME##Type = { \
};\
def_class_register(NAME) \
static PyMNN##NAME* get##NAME() { \
return (PyMNN##NAME *)PyObject_Call((PyObject*)&PyMNN##NAME##Type, PyTuple_New(0), NULL); \
return (PyMNN##NAME *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNN##NAME##Type), PyTuple_New(0), NULL); \
} \
static PyObject* toPyObj(TYPE* x) { \
auto ret = get##NAME(); \

View File

@ -1,3 +1,4 @@
# -*- coding: UTF-8 -*-
import os
import sys
import MNN
@ -10,7 +11,11 @@ def parseConfig(root_dir):
configName = os.path.join(root_dir, 'config.txt')
if not os.path.exists(configName):
return False
config = open(configName, 'rt')
try:
config = open(configName, 'rt', encoding='utf-8')
except:
import io
config = io.open(configName, 'rt', encoding='utf-8')
res = {}
res['model_name'] = os.path.join(root_dir, 'temp.bin')
for line in config.readlines():

View File

@ -465,6 +465,14 @@ class UnitTest(unittest.TestCase):
self.assertEqualVar(expr.range(start, limit, delta), np.arange(0.0, 2.0, 0.3))
def test_depth_to_space(self):
self.assertEqualVar(expr.depth_to_space(self.x, 2), torch.pixel_shuffle(self._x, 2))
def test_sort(self):
x = mp.array([5, -1, 2, 0])
x_ = np.array([5, -1, 2, 0])
self.assertEqualVar(expr.sort(x), np.sort(x_))
def test_raster(self):
x = mp.array([[1, 2], [3, 4]])
x_ = np.array([[1, 2], [3, 4]])
self.assertEqualVar(expr.raster([x], [0, 1, 1, 2, 0, 1, 2, 1, 1, 2, 2], [2, 2]), x_.transpose())
def test_detection_post_process(self):
pass
# test cv
@ -643,6 +651,40 @@ class UnitTest(unittest.TestCase):
x = cv.threshold(self.imgf, 50, 20, cv.THRESH_BINARY)
y = cv2.threshold(self.imgf_, 50, 20, cv2.THRESH_BINARY)[1]
self.assertEqualImg(x, y)
# draw
def test_Draw(self):
x = self.img.copy()
y = self.img_.copy()
# 1. arrowedLine
cv.arrowedLine(x, [10, 10], [40, 40], [255, 0, 0])
cv2.arrowedLine(y, [10, 10], [40, 40], [255, 0, 0])
# 2. line
cv.line(x, [20, 30], [50, 60], [0, 0, 255])
cv2.line(y, [20, 30], [50, 60], [0, 0, 255])
# 3. circle
cv.circle(x, [70, 70], 30, [0, 255, 0])
cv2.circle(y, [70, 70], 30, [0, 255, 0])
# 4. rectangle
cv.rectangle(x, [80, 80], [120, 120], [0, 0, 255])
cv2.rectangle(y, [80, 80], [120, 120], [0, 0, 255])
# get contours
y_ = cv2.cvtColor(y, cv2.COLOR_BGR2GRAY)
y_ = cv2.threshold(y_, 127, 255, cv2.THRESH_BINARY)[1]
c_, _ = cv2.findContours(y_, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
c = []
for a in c_:
ps = []
for b in a:
ps.append(int(b[0,0]))
ps.append(int(b[0,1]))
c.append(ps)
# 5. fillPoly
cv.fillPoly(x, c, [255, 0, 0])
cv2.fillPoly(y, c_, [255, 0, 0])
# 6. drawContours
cv.drawContours(x, c, -1, [0, 0, 255])
cv2.drawContours(y, c_, -1, [0, 0, 255])
self.assertEqualImg(x, y)
# structural
def test_Structural(self):
x = mp.array([[0,0,0,0,0,0,0,0,0,0,0,0,0],
@ -661,17 +703,20 @@ class UnitTest(unittest.TestCase):
contours_, _ = cv2.findContours(x_, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
contour = contours[0]
contour_ = contours_[0]
self.assertEqualPoints(contour, contour_)
self.assertEqualVar(contour, contour_)
self.assertEqual(cv.contourArea(contour), cv2.contourArea(contour_))
hull = cv.convexHull(contour)
hull_ = cv2.convexHull(contour_)
self.assertEqualPoints(hull, hull_)
if version_info.major < 3: hull_ = np.concatenate([hull_[-1::, :], hull_[:-1,:]])
self.assertEqualVar(hull, hull_)
rect = cv.minAreaRect(contour)
rect_ = cv2.minAreaRect(contour_)
self.assertEqual(rect, rect_)
points = cv.boxPoints(rect),
if version_info.major >= 3:
self.assertEqual(rect, rect_)
points = cv.boxPoints(rect)
points_ = cv2.boxPoints(rect_)
self.assertEqualPoints(points, points_)
if version_info.major >= 3:
self.assertEqualVar(points, points_)
self.assertEqual(tuple(cv.boundingRect(contour)), cv2.boundingRect(contour_))
ret, labels, statsv, centroids = cv.connectedComponentsWithStats(x)
ret_, labels_, statsv_, centroids_ = cv2.connectedComponentsWithStats(x_)
@ -689,6 +734,16 @@ class UnitTest(unittest.TestCase):
x = cv.hconcat([self.img, self.img])
y = cv2.hconcat([self.img_, self.img_])
self.assertEqualImg(x, y)
def test_rotate(self):
x = cv.rotate(self.img, cv.ROTATE_90_CLOCKWISE)
y = cv2.rotate(self.img_, cv2.ROTATE_90_CLOCKWISE)
self.assertEqualImg(x, y)
x = cv.rotate(self.img, cv.ROTATE_180)
y = cv2.rotate(self.img_, cv2.ROTATE_180)
self.assertEqualImg(x, y)
x = cv.rotate(self.img, cv.ROTATE_90_COUNTERCLOCKWISE)
y = cv2.rotate(self.img_, cv2.ROTATE_90_COUNTERCLOCKWISE)
self.assertEqualImg(x, y)
# numpy
def test_from_shape_or_value(self):
x = mp.zeros([2, 2])
@ -724,6 +779,9 @@ class UnitTest(unittest.TestCase):
self.assertEqualVar(mp.linspace(2.0, 3.0, num=5, endpoint=False), np.linspace(2.0, 3.0, num=5, endpoint=False))
self.assertEqualVar(mp.logspace(2.0, 3.0, num=4, endpoint=False), np.logspace(2.0, 3.0, num=4, endpoint=False))
self.assertEqualVar(mp.geomspace(1, 1000, num=4, endpoint=False), np.geomspace(1, 1000, num=4, endpoint=False))
x = mp.arange(-5, 5., 0.1)
y = np.arange(-5, 5., 0.1)
self.assertEqualVars(mp.meshgrid(x, x), np.meshgrid(y, y))
def test_changing_array_shape(self):
x = mp.zeros((3, 2))
x_ = np.zeros((3, 2))
@ -916,6 +974,11 @@ class UnitTest(unittest.TestCase):
self.assertEqualShape(mp.random.randn(2,3).shape, np.random.randn(2,3).shape)
self.assertEqualShape(mp.random.rand(3,2).shape, np.random.rand(3,2).shape)
self.assertEqualShape(mp.random.randint(0, 2, [2,3]).shape, np.random.randint(0, 2, [2,3]).shape)
def test_sorting(self):
x = mp.array([[1,0,3], [0,6,5]])
x_ = np.array([[1,0,3], [0,6,5]])
self.assertEqualVar(mp.sort(x), np.sort(x_))
self.assertEqualVar(mp.argsort(x), np.argsort(x_))
def test_searching_counting(self):
x = mp.array([[1,0,3], [0,6,5]])
x_ = np.array([[1,0,3], [0,6,5]])
@ -980,10 +1043,12 @@ class UnitTest(unittest.TestCase):
self.assertAlmostEqual(x.var(), x_.var())
self.assertEqualVar(x.var(0), x_.var(0))
self.assertEqual(len(x), len(x_))
self.assertEqual(x[0,1].read_as_tuple()[0], x_[0,1])
self.assertEqual(x[0,1], x_[0,1])
self.assertEqualVar(x[0], x_[0])
self.assertEqualVar(x[:], x_[:])
self.assertEqualVar(x[:1], x_[:1])
self.assertEqualVar(x[::-1], x_[::-1])
self.assertEqualVar(x[x > 2], x_[x_ > 2])
self.assertEqualVar(x[mp.array([1])], x_[np.array([1])])
if __name__ == '__main__':
unittest.main()

View File

@ -376,13 +376,15 @@ struct ImageProcessParamT : public flatbuffers::NativeTable {
int8_t paddingValue;
std::vector<int32_t> shape;
DataType outputType;
bool draw;
ImageProcessParamT()
: filterType(FilterType_NEAREST),
sourceFormat(ImageFormatType_RGBA),
destFormat(ImageFormatType_RGBA),
wrap(WrapType_CLAMP_TO_EDGE),
paddingValue(0),
outputType(DataType_DT_INVALID) {
outputType(DataType_DT_INVALID),
draw(false) {
}
};
@ -421,6 +423,9 @@ struct ImageProcessParam FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
DataType outputType() const {
return static_cast<DataType>(GetField<int32_t>(22, 0));
}
bool draw() const {
return GetField<uint8_t>(24, 0) != 0;
}
bool Verify(flatbuffers::Verifier &verifier) const {
return VerifyTableStart(verifier) &&
VerifyField<int8_t>(verifier, 4) &&
@ -437,6 +442,7 @@ struct ImageProcessParam FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
VerifyOffset(verifier, 20) &&
verifier.VerifyVector(shape()) &&
VerifyField<int32_t>(verifier, 22) &&
VerifyField<uint8_t>(verifier, 24) &&
verifier.EndTable();
}
ImageProcessParamT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@ -477,6 +483,9 @@ struct ImageProcessParamBuilder {
void add_outputType(DataType outputType) {
fbb_.AddElement<int32_t>(22, static_cast<int32_t>(outputType), 0);
}
void add_draw(bool draw) {
fbb_.AddElement<uint8_t>(24, static_cast<uint8_t>(draw), 0);
}
explicit ImageProcessParamBuilder(flatbuffers::FlatBufferBuilder &_fbb)
: fbb_(_fbb) {
start_ = fbb_.StartTable();
@ -500,7 +509,8 @@ inline flatbuffers::Offset<ImageProcessParam> CreateImageProcessParam(
flatbuffers::Offset<flatbuffers::Vector<float>> transform = 0,
int8_t paddingValue = 0,
flatbuffers::Offset<flatbuffers::Vector<int32_t>> shape = 0,
DataType outputType = DataType_DT_INVALID) {
DataType outputType = DataType_DT_INVALID,
bool draw = false) {
ImageProcessParamBuilder builder_(_fbb);
builder_.add_outputType(outputType);
builder_.add_shape(shape);
@ -509,6 +519,7 @@ inline flatbuffers::Offset<ImageProcessParam> CreateImageProcessParam(
builder_.add_mean(mean);
builder_.add_destFormat(destFormat);
builder_.add_sourceFormat(sourceFormat);
builder_.add_draw(draw);
builder_.add_paddingValue(paddingValue);
builder_.add_wrap(wrap);
builder_.add_filterType(filterType);
@ -597,6 +608,7 @@ inline void ImageProcessParam::UnPackTo(ImageProcessParamT *_o, const flatbuffer
{ auto _e = paddingValue(); _o->paddingValue = _e; };
{ auto _e = shape(); if (_e) { _o->shape.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->shape[_i] = _e->Get(_i); } } };
{ auto _e = outputType(); _o->outputType = _e; };
{ auto _e = draw(); _o->draw = _e; };
}
inline flatbuffers::Offset<ImageProcessParam> ImageProcessParam::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ImageProcessParamT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@ -617,6 +629,7 @@ inline flatbuffers::Offset<ImageProcessParam> CreateImageProcessParam(flatbuffer
auto _paddingValue = _o->paddingValue;
auto _shape = _o->shape.size() ? _fbb.CreateVector(_o->shape) : 0;
auto _outputType = _o->outputType;
auto _draw = _o->draw;
return MNN::CreateImageProcessParam(
_fbb,
_filterType,
@ -628,7 +641,8 @@ inline flatbuffers::Offset<ImageProcessParam> CreateImageProcessParam(flatbuffer
_transform,
_paddingValue,
_shape,
_outputType);
_outputType,
_draw);
}
inline const flatbuffers::TypeTable *SampleModeTypeTable() {
@ -803,7 +817,8 @@ inline const flatbuffers::TypeTable *ImageProcessParamTypeTable() {
{ flatbuffers::ET_FLOAT, 1, -1 },
{ flatbuffers::ET_CHAR, 0, -1 },
{ flatbuffers::ET_INT, 1, -1 },
{ flatbuffers::ET_INT, 0, 3 }
{ flatbuffers::ET_INT, 0, 3 },
{ flatbuffers::ET_BOOL, 0, -1 }
};
static const flatbuffers::TypeFunction type_refs[] = {
FilterTypeTypeTable,
@ -821,10 +836,11 @@ inline const flatbuffers::TypeTable *ImageProcessParamTypeTable() {
"transform",
"paddingValue",
"shape",
"outputType"
"outputType",
"draw"
};
static const flatbuffers::TypeTable tt = {
flatbuffers::ST_TABLE, 10, type_codes, type_refs, nullptr, names
flatbuffers::ST_TABLE, 11, type_codes, type_refs, nullptr, names
};
return &tt;
}

View File

@ -62,4 +62,5 @@ table ImageProcessParam {
paddingValue:byte = 0;
shape:[int]; // shape: [N, C, H, W]
outputType:DataType;
draw:bool = false;
}

View File

@ -170,7 +170,7 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
mPrecisionMode = precision;
mCoreFunctions = MNNGetCoreFunctions();
mInt8CoreFunctions = MNNGetInt8CoreFunctions();
mCache = new CPUResizeCache(this);
mCache = new CPUResizeCache;
}
CPUBackend::~CPUBackend() {

View File

@ -87,6 +87,19 @@ BLITTER CPUImageProcess::choose(ImageFormatType source, ImageFormatType dest) {
return nullptr;
}
BLITTER CPUImageProcess::choose(int channelByteSize) {
switch (channelByteSize) {
case 4:
return MNNC4blitH;
case 3:
return MNNC3blitH;
case 1:
return MNNC1blitH;
default:
return nullptr;
}
}
SAMPLER CPUImageProcess::choose(ImageFormatType format, FilterType type, bool identity) {
if (identity) {
switch (format) {
@ -271,10 +284,21 @@ static std::pair<int, int> _computeClip(CV::Point* points, int iw, int ih, const
}
ErrorCode CPUImageProcess::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto input = inputs[0], output = outputs[0];
ih = input->height();
iw = input->width();
ic = input->channel();
auto input = inputs[0];
if (input->dimensions() == 3) {
ih = input->length(0);
iw = input->length(1);
ic = input->length(2);
} else {
ih = input->height();
iw = input->width();
ic = input->channel();
}
if (draw) {
blitter = choose(ic * inputs[0]->getType().bytes());
return NO_ERROR;
}
auto output = outputs[0];
oh = output->height();
ow = output->width();
oc = output->channel();
@ -321,15 +345,37 @@ ErrorCode CPUImageProcess::onResize(const std::vector<Tensor *> &inputs, const s
ErrorCode CPUImageProcess::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto source = inputs[0]->host<uint8_t>();
auto dest = outputs[0]->host<void>();
void* dest = nullptr;
CV::Point points[2];
int tileCount = UP_DIV(ow, CACHE_SIZE);
auto destBytes = dtype.bytes();
for (int dy = 0; dy < oh; ++dy) {
int tileCount = UP_DIV(ow, CACHE_SIZE);
const int* regions = nullptr;
if (draw) {
        // draw writes in place: the input image is also the destination
        dest = source;
        oh = inputs[1]->length(0);
        ow = iw;
        oc = ic;
        destBytes = inputs[0]->getType().bytes();
        // each region row is blitted in a single tile
        tileCount = 1;
        // the "source" of the blit is the fill color
        samplerDest = inputs[2]->host<uint8_t>();
        // pointer to the (y, xLeft, xRight) region triples
        regions = inputs[1]->host<int>();
} else {
dest = outputs[0]->host<void>();
}
for (int i = 0; i < oh; ++i) {
int dy = draw ? regions[3 * i] : i;
auto dstY = (uint8_t*)dest + dy * destBytes * ow * oc;
for (int tIndex = 0; tIndex < tileCount; ++tIndex) {
int xStart = tIndex * CACHE_SIZE;
int count = std::min(CACHE_SIZE, ow - xStart);
if (draw) {
xStart = regions[3 * i + 1];
count = regions[3 * i + 2] - xStart + 1;
}
auto dstStart = dstY + destBytes * oc * xStart;
if (!blitFloat) {
@ -340,7 +386,7 @@ ErrorCode CPUImageProcess::onExecute(const std::vector<Tensor *> &inputs, const
}
// Sample
{
if (!draw) {
// Compute position
points[0].fX = xStart;
points[0].fY = dy;

View File

@ -23,6 +23,10 @@ typedef void (*SAMPLER)(const unsigned char* source, unsigned char* dest, CV::Po
class CPUImageProcess : public Execution {
public:
CPUImageProcess(CV::ImageProcess::Config config, const CoreFunctions* coreFunctions) : Execution(nullptr), coreFunctions(coreFunctions) {
if (config.draw) {
draw = true;
return;
}
filterType = (FilterType)config.filterType;
wrap = (WrapType)config.wrap;
sourceFormat = (ImageFormatType)config.sourceFormat;
@ -40,6 +44,11 @@ public:
paddingValue = val;
}
CPUImageProcess(Backend *bn, const ImageProcessParam* process) : Execution(bn) {
coreFunctions = static_cast<CPUBackend*>(backend())->functions();
draw = process->draw();
if (draw) {
return;
}
filterType = process->filterType();
wrap = process->wrap();
sourceFormat = process->sourceFormat();
@ -53,12 +62,12 @@ public:
transform.set(i, process->transform()->Get(i));
}
transform.invert(&transformInvert);
coreFunctions = static_cast<CPUBackend*>(backend())->functions();
}
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
BLITTER choose(ImageFormatType source, ImageFormatType dest);
BLITTER choose(int channelByteSize);
BLIT_FLOAT choose(ImageFormatType format, int dstBpp = 0);
SAMPLER choose(ImageFormatType format, FilterType type, bool identity);
private:
@ -78,6 +87,7 @@ private:
std::unique_ptr<uint8_t[]> samplerBuffer, blitBuffer;
uint8_t* samplerDest = nullptr, *blitDest = nullptr;
const CoreFunctions* coreFunctions = nullptr;
bool draw = false;
};
}; // namespace MNN

View File

@ -117,6 +117,9 @@ ErrorCode CPUNonMaxSuppressionV2::onExecute(const std::vector<Tensor*>& inputs,
const auto scores = inputs[1]->host<float>();
NonMaxSuppressionSingleClasssImpl(inputs[0], scores, maxDetections, iouThreshold, scoreThreshold, &selected);
std::copy_n(selected.begin(), selected.size(), outputs[0]->host<int32_t>());
for (int i = selected.size(); i < outputs[0]->elementSize(); i++) {
outputs[0]->host<int32_t>()[i] = -1;
}
return NO_ERROR;
}
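With the padding above, unused slots of the NMS output are set to -1 instead of being left as-is, so a caller can stop at the first negative index (hedged sketch, reusing the boxes/scores placeholders from the earlier expr example):

kept = [i for i in expr.nms(boxes, scores, 10).read_as_tuple() if i >= 0]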

View File

@ -6,11 +6,11 @@
#include "MNN_generated.h"
namespace MNN {
class CPUBackend;
class CPUResizeCache {
// FIXME: Move outside
class MNN_PUBLIC CPUResizeCache {
public:
CPUResizeCache(const CPUBackend* backend) {
mBackend = backend;
CPUResizeCache() {
// Do nothing
}
~ CPUResizeCache() {
// Do nothing
@ -21,7 +21,6 @@ public:
void reset();
private:
std::map<std::pair<const Tensor*, MNN_DATA_FORMAT>, std::shared_ptr<Tensor>> mFormatCache;
const CPUBackend* mBackend;
};
}

View File

@ -45,7 +45,7 @@ void ScatterNdImpl(const Tensor* indices, const Tensor* updates, const Tensor* s
}
if (valid) {
for (int k = 0; k < accNumber; ++k) {
outputPtr[pos + k] += updatesPtr[i * accNumber + k];
outputPtr[pos + k] = updatesPtr[i * accNumber + k];
}
}
}
@ -59,7 +59,12 @@ ErrorCode CPUScatterNd::onExecute(const std::vector<Tensor*>& inputs, const std:
const int outputSize = output->size();
auto outputRawPtr = output->host<int8_t>();
memset(outputRawPtr, 0, outputSize);
if (inputs.size() < 4) {
memset(outputRawPtr, 0, outputSize);
} else {
auto inputRawPtr = inputs[3]->host<int8_t>();
memcpy(outputRawPtr, inputRawPtr, outputSize);
}
auto updatesDataType = updates->getType();
if (updatesDataType == halide_type_of<int32_t>()) {

View File

@ -1065,3 +1065,21 @@ void MNNSamplerNV12Nearest(const unsigned char* source, unsigned char* dest, MNN
auto countC2 = ((count + 1) / 2);
_swapUV(destUV, destUV, countC2);
}
// Draw helpers: repeat a single 3-/4-/1-byte pixel value `count` times along a row.
void MNNC3blitH(const unsigned char* source, unsigned char* dest, size_t count) {
for (int i = 0; i < count; i++) {
memcpy(dest + 3 * i, source, 3);
}
}
void MNNC4blitH(const unsigned char* source, unsigned char* dest, size_t count) {
for (int i = 0; i < count; i++) {
memcpy(dest + 4 * i, source, 4);
}
}
void MNNC1blitH(const unsigned char* source, unsigned char* dest, size_t count) {
for (int i = 0; i < count; i++) {
memcpy(dest + i, source, 1);
}
}

View File

@ -132,4 +132,8 @@ void MNNSamplerNV12Copy(const unsigned char* source, unsigned char* dest, MNN::C
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
void MNNSamplerNV12Nearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
// draw blit
void MNNC1blitH(const unsigned char* source, unsigned char* dest, size_t count);
void MNNC3blitH(const unsigned char* source, unsigned char* dest, size_t count);
void MNNC4blitH(const unsigned char* source, unsigned char* dest, size_t count);
#endif /* ImageProcessFunction_hpp */

View File

@ -1,29 +1,72 @@
# Process asm files on Windows, then substitute *.S.obj for *.S as source files of add_library
# If the MNN_ASSEMBLER env var is not set, *.S files are ignored, which may reduce performance
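# Usage sketch (hedged): point MNN_ASSEMBLER at a GNU-compatible assembler before configuring,
# e.g. set MNN_ASSEMBLER=C:/msys64/usr/bin/as.exe (hypothetical path); otherwise the AT&T-syntax
# *.S kernels are skipped under MSVC, which may reduce performance.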
set(EXTRA_OBJS "")
IF(MSVC AND (DEFINED ENV{MNN_ASSEMBLER}) AND "${CMAKE_SIZEOF_VOID_P}" STREQUAL "8")
set(WIN_USE_ASM ON)
ENDIF()
message(STATUS "WIN_USE_ASM: ${WIN_USE_ASM}")
function (process_asm TARGET_NAME FILE_SRCS)
if(NOT MSVC)
return()
endif()
set(FILE_DESTS "")
foreach(SRC ${${FILE_SRCS}})
get_filename_component(SRC_EXT ${SRC} EXT)
if(NOT ${SRC_EXT} STREQUAL ".S")
list(APPEND FILE_DESTS ${SRC})
continue()
elseif(NOT WIN_USE_ASM)
continue()
endif()
string(REPLACE ${CMAKE_CURRENT_SOURCE_DIR} "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${TARGET_NAME}.dir" DEST ${SRC})
add_custom_command(
OUTPUT ${DEST}.obj
# *.S -> *.S.i: preprocess (define/ifdef macros) with cl.exe
COMMAND "${CMAKE_C_COMPILER}" /DWIN32 /experimental:preprocessor /P /Fi"${DEST}.i" "${SRC}"
# *.S.i -> *.S.obj: use the GNU assembler, which supports AT&T syntax
COMMAND "$ENV{MNN_ASSEMBLER}" -o "${DEST}.obj" "${DEST}.i"
)
list(APPEND EXTRA_OBJS ${DEST}.obj)
endforeach()
set(${FILE_SRCS} ${FILE_DESTS} PARENT_SCOPE)
set(EXTRA_OBJS ${EXTRA_OBJS} PARENT_SCOPE)
endfunction()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)|(i686)")
message(STATUS "${CMAKE_SYSTEM_PROCESSOR}: Open SSE")
target_compile_options(MNNCPU PRIVATE -DMNN_USE_SSE)
option(MNN_AVX512_VNNI "Enable AVX512 VNNI" ON)
FILE(GLOB MNN_X8664_SRC ${CMAKE_CURRENT_LIST_DIR}/*)
if (MSVC)
FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*.cpp)
FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*.cpp)
else()
FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*)
FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*)
message(STATUS "MNN_AVX512:${MNN_AVX512}")
if (MNN_AVX512)
FILE(GLOB MNN_AVX512_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/*)
SET(MNNAVX512_VNNI_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/GemmInt8_VNNI.cpp)
LIST(REMOVE_ITEM MNN_AVX512_SRC ${MNNAVX512_VNNI_SRC})
add_library(MNNAVX512 OBJECT ${MNN_AVX512_SRC})
target_compile_options(MNNAVX512 PRIVATE -DMNN_USE_SSE -DMNN_X86_USE_ASM -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma)
if (MNN_AVX512_VNNI)
target_compile_options(MNNAVX512 PRIVATE -DMNN_AVX512_VNNI)
add_library(MNNAVX512_VNNI OBJECT ${MNNAVX512_VNNI_SRC})
target_compile_options(MNNAVX512_VNNI PRIVATE -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512vnni -DMNN_AVX512_VNNI)
FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*)
FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*)
message(STATUS "MNN_AVX512:${MNN_AVX512}")
if (MNN_AVX512 AND ((NOT MSVC) OR WIN_USE_ASM))
FILE(GLOB MNN_AVX512_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/*)
SET(MNNAVX512_VNNI_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/GemmInt8_VNNI.cpp)
LIST(REMOVE_ITEM MNN_AVX512_SRC ${MNNAVX512_VNNI_SRC})
process_asm(MNNAVX512 MNN_AVX512_SRC)
add_library(MNNAVX512 OBJECT ${MNN_AVX512_SRC})
target_compile_options(MNNAVX512 PRIVATE -DMNN_USE_SSE -DMNN_X86_USE_ASM)
if (MSVC)
target_compile_options(MNNAVX512 PRIVATE /arch:AVX512)
else()
target_compile_options(MNNAVX512 PRIVATE -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma)
endif()
if (MNN_AVX512_VNNI)
target_compile_options(MNNAVX512 PRIVATE -DMNN_AVX512_VNNI)
add_library(MNNAVX512_VNNI OBJECT ${MNNAVX512_VNNI_SRC})
target_compile_options(MNNAVX512_VNNI PRIVATE -DMNN_AVX512_VNNI)
if (MSVC)
target_compile_options(MNNAVX512_VNNI PRIVATE /arch:AVX512)
else()
target_compile_options(MNNAVX512_VNNI PRIVATE -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512vnni)
endif()
endif()
endif()
FILE(GLOB MNN_SSE_SRC ${CMAKE_CURRENT_LIST_DIR}/sse/*)
process_asm(MNNAVX MNN_AVX_SRC)
process_asm(MNNAVXFMA MNN_AVXFMA_SRC)
process_asm(MNNSSE MNN_SSE_SRC)
add_library(MNNX8664 OBJECT ${MNN_X8664_SRC})
add_library(MNNAVX OBJECT ${MNN_AVX_SRC})
add_library(MNNAVXFMA OBJECT ${MNN_AVXFMA_SRC})
@ -34,7 +77,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)
target_compile_options(MNNAVXFMA PRIVATE -DMNN_USE_SSE)
if(MSVC)
target_compile_options(MNNAVX PRIVATE /arch:AVX)
target_compile_options(MNNAVXFMA PRIVATE /arch:AVX)
target_compile_options(MNNAVXFMA PRIVATE /arch:AVX2)
else()
target_compile_options(MNNSSE PRIVATE -msse4.1)
target_compile_options(MNNAVX PRIVATE -mavx2 -DMNN_X86_USE_ASM)
@ -47,7 +90,12 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)
endif()
endif()
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNX8664> $<TARGET_OBJECTS:MNNAVXFMA> $<TARGET_OBJECTS:MNNAVX> $<TARGET_OBJECTS:MNNSSE>)
if (MNN_AVX512)
if (MSVC AND WIN_USE_ASM)
target_compile_options(MNNAVX PRIVATE -DMNN_X86_USE_ASM)
target_compile_options(MNNAVXFMA PRIVATE -DMNN_X86_USE_ASM)
list(APPEND MNN_OBJECTS_TO_LINK ${EXTRA_OBJS})
endif()
if (MNN_AVX512 AND ((NOT MSVC) OR WIN_USE_ASM))
target_compile_options(MNNCPU PRIVATE -DMNN_AVX512)
target_compile_options(MNNX8664 PRIVATE -DMNN_AVX512)
if (MNN_AVX512_VNNI)

View File

@ -24,12 +24,13 @@ asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain
// SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter
// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
movq 48(%rsp), %r10
#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + callq + shadow_space
movq (push_registers_bytes)(%rsp), %r10
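// Only the pushed %rbp and the return address separate %rsp from the caller's 32-byte shadow space: (1 + 1) * 8 + 32 = 48, so the fifth argument (post) is read from 48(%rsp).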
pushq %rdi
pushq %rsi
pushq %r12
@ -41,6 +42,17 @@ movq %r9, %rcx
movq %r10, %r9
pushq %r14
pushq %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
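// The Win64 ABI treats xmm6-xmm15 as callee-saved; they are spilled here (16 bytes each, spaced 128 bytes apart inside the 1280-byte area) and restored before returning.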
#else
pushq %r12
pushq %r13
@ -304,6 +316,17 @@ addq $64, %rsp
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13

View File

@ -24,12 +24,13 @@ asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1
// SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter
// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
movq 48(%rsp), %r10
#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + callq + shadow_space
movq (push_registers_bytes)(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
@ -41,6 +42,17 @@ movq %r9, %rcx
movq %r10, %r9
pushq %r14
pushq %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
@ -190,6 +202,17 @@ addq $64, %rsp
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13

View File

@ -25,15 +25,39 @@ asm_function _AVX_MNNPackedSparseMatMulEpx1EFMA_ASM
// SystemV Auto: rdi: packedParas, rsi: bias, rdx: parameter, rcx: postParameters
// Microsoft x64 Auto: rcx:packedParas, rdx:bias, r8:parameter, r9:postParameters
// all callee save regs:
// %rbx, %rbp, %r12~%r15
// unused para regs: %r8, %r9
// can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %rax
pushq %rbx
pushq %r8
@ -42,7 +66,7 @@ pushq %r12
pushq %r13
pushq %r14
pushq %r15
#endif
movq (%rdi), %rax // %rax C
movq 8(%rdi), %rbx // %rbx A
@ -215,6 +239,27 @@ LoopE24H1:
jmp LoopE24H1
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rsi
popq %rdi
#else
popq %r15
popq %r14
popq %r13
@ -223,6 +268,8 @@ popq %r9
popq %r8
popq %rbx
popq %rax
#endif
popq %rbp
retq

View File

@ -30,10 +30,33 @@ asm_function _AVX_MNNPackedSparseMatMulEpx4EFMA_ASM
// %rbx, %rbp, %r12~%r15
// unused para regs: %r8, %r9
// can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %rax
pushq %rbx
pushq %r8
@ -42,6 +65,7 @@ pushq %r12
pushq %r13
pushq %r14
pushq %r15
#endif
movq (%rdi), %rax // %rax C
movq 8(%rdi), %rbx // %rbx A
@ -216,6 +240,26 @@ LoopE24H4:
jmp LoopE24H4
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rsi
popq %rdi
#else
popq %r15
popq %r14
popq %r13
@ -224,6 +268,8 @@ popq %r9
popq %r8
popq %rbx
popq %rax
#endif
popq %rbp
retq

View File

@ -26,23 +26,29 @@ constexpr int AVX512F32 = 16;
_mm_store_ps(dest + AVX512F32 * packCUnit * ablock + 4 * packCUnit * aSegment + packCUnit * 3, m128_3); \
}
#define STORE_VECTOR_AS_COLUMN(dest, ablock, packCUnit, vacc) \
dest[AVX512F32 * packCUnit * ablock + 0] = vacc[0]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit] = vacc[1]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 2] = vacc[2]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 3] = vacc[3]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 4] = vacc[4]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 5] = vacc[5]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 6] = vacc[6]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 7] = vacc[7]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 8] = vacc[8]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 9] = vacc[9]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 10] = vacc[10]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 11] = vacc[11]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 12] = vacc[12]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 13] = vacc[13]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 14] = vacc[14]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 15] = vacc[15];
inline void STORE_VECTOR_AS_COLUMN(float* dest, size_t ablock, size_t packCUnit, __m512 vacc) {
union {
__m512 v;
float f[16];
} vacc_u;
vacc_u.v = vacc;
dest[AVX512F32 * packCUnit * ablock + 0] = vacc_u.f[0];
dest[AVX512F32 * packCUnit * ablock + packCUnit] = vacc_u.f[1];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 2] = vacc_u.f[2];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 3] = vacc_u.f[3];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 4] = vacc_u.f[4];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 5] = vacc_u.f[5];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 6] = vacc_u.f[6];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 7] = vacc_u.f[7];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 8] = vacc_u.f[8];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 9] = vacc_u.f[9];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 10] = vacc_u.f[10];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 11] = vacc_u.f[11];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 12] = vacc_u.f[12];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 13] = vacc_u.f[13];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 14] = vacc_u.f[14];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 15] = vacc_u.f[15];
}
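// Presumably converted from a macro to an inline function using a union because MSVC does not support subscripting __m512 values (a GCC/Clang extension).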
#define TRANSPOSE4x8_STORE(dest, ablock, aSegment, packCUnit, v0, v3, v6, v9, v12, v15, v18, v21) { \
auto m0 = _mm512_extractf32x4_ps(v0, aSegment); \
@ -125,14 +131,20 @@ constexpr int AVX512F32 = 16;
_mm256_storeu_ps(dest + packCUnit * 7, t7); \
}
#define STORE_M256_VECTOR_AS_COLUMN(dest, packCUnit, vacc) \
dest[0] = vacc[0]; \
dest[packCUnit] = vacc[1]; \
dest[packCUnit * 2] = vacc[2]; \
dest[packCUnit * 3] = vacc[3]; \
dest[packCUnit * 4] = vacc[4]; \
dest[packCUnit * 5] = vacc[5]; \
dest[packCUnit * 6] = vacc[6]; \
dest[packCUnit * 7] = vacc[7];
inline void STORE_M256_VECTOR_AS_COLUMN(float* dest, size_t packCUnit, __m256 vacc) {
union {
__m256 v;
float f[8];
} vacc_u;
vacc_u.v = vacc;
dest[0] = vacc_u.f[0];
dest[packCUnit] = vacc_u.f[1];
dest[packCUnit * 2] = vacc_u.f[2];
dest[packCUnit * 3] = vacc_u.f[3];
dest[packCUnit * 4] = vacc_u.f[4];
dest[packCUnit * 5] = vacc_u.f[5];
dest[packCUnit * 6] = vacc_u.f[6];
dest[packCUnit * 7] = vacc_u.f[7];
}
#endif
#endif

View File

@ -228,9 +228,14 @@ void _AVX512_MNNPackedSparseMatMulEpx1(float* C, const float* A, const float* B,
vacc0 = _mm256_min_ps(vacc0, _mm512_extractf32x8_ps(vmax, 0));
vacc0 = _mm256_max_ps(vacc0, _mm512_extractf32x8_ps(vmin, 0));
union {
__m256 v;
float f[8];
} vacc0_u;
vacc0_u.v = vacc0;
// how to store faster: st4 / transpose
for (auto iStore = 0; iStore < (taileSize & 0x07); iStore++) {
c[packCUnit * iStore] = vacc0[iStore];
c[packCUnit * iStore] = vacc0_u.f[iStore];
}
}
// ie += taileSize;

View File

@ -647,10 +647,15 @@ void _AVX512_MNNPackedSparseMatMulEpx4(float* C, const float* A, const float* B,
vacc0 = _mm_min_ps(vacc0, _mm512_extractf32x4_ps(vmax, 0));
vacc0 = _mm_max_ps(vacc0, _mm512_extractf32x4_ps(vmin, 0));
c[0] = vacc0[0];
c[packCUnit] = vacc0[1];
c[packCUnit * 2] = vacc0[2];
c[+packCUnit * 3] = vacc0[3];
union {
__m128 v;
float f[4];
} vacc0_u;
vacc0_u.v = vacc0;
c[0] = vacc0_u.f[0];
c[packCUnit] = vacc0_u.f[1];
c[packCUnit * 2] = vacc0_u.f[2];
c[+packCUnit * 3] = vacc0_u.f[3];
}
ie += 4;
a += 4;
@ -735,8 +740,13 @@ void _AVX512_MNNPackedSparseMatMulEpx4(float* C, const float* A, const float* B,
vacc0 = _mm_min_ps(vacc0, _mm512_extractf32x4_ps(vmax, 0));
vacc0 = _mm_max_ps(vacc0, _mm512_extractf32x4_ps(vmin, 0));
c[0] = vacc0[0];
c[packCUnit] = vacc0[1];
union {
__m128 v;
float f[4];
} vacc0_u;
vacc0_u.v = vacc0;
c[0] = vacc0_u.f[0];
c[packCUnit] = vacc0_u.f[1];
}
ie += 2;
a += 2;

View File

@ -789,10 +789,15 @@ void _AVX512_MNNPackedSparseMatMulEpx8(float* C, const float* A, const float* B,
vacc0 = _mm_min_ps(vacc0, vmax);
vacc0 = _mm_max_ps(vacc0, vmin);
c[0] = vacc0[0];
c[packCUnit] = vacc0[1];
c[packCUnit * 2] = vacc0[2];
c[packCUnit * 3] = vacc0[3];
union {
__m128 v;
float f[4];
} vacc0_u;
vacc0_u.v = vacc0;
c[0] = vacc0_u.f[0];
c[packCUnit] = vacc0_u.f[1];
c[packCUnit * 2] = vacc0_u.f[2];
c[packCUnit * 3] = vacc0_u.f[3];
}
ie += 4;
a += 4;
@ -877,8 +882,13 @@ void _AVX512_MNNPackedSparseMatMulEpx8(float* C, const float* A, const float* B,
vacc0 = _mm_min_ps(vacc0, vmax);
vacc0 = _mm_max_ps(vacc0, vmin);
c[0] = vacc0[0];
c[packCUnit] = vacc0[1];
union {
__m128 v;
float f[4];
} vacc0_u;
vacc0_u.v = vacc0;
c[0] = vacc0_u.f[0];
c[packCUnit] = vacc0_u.f[1];
}
ie += 2;
a += 2;

View File

@ -11,16 +11,15 @@
.align 4
asm_function _AVX512_MNNGemmFloatUnit16x8
//void _AVX512_MNNGemmFloatUnit16x8(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4)
//void _AVX512_MNNGemmFloatUnit16x8(float* C, const float* A, const float* B, const size_t* parameter)
// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: hC4
// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter
pushq %rbp
movq %rsp, %rbp
pushq %rbx
#ifdef WIN32
movq 48(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
@ -30,12 +29,21 @@ movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
pushq %r14
movq %r8, %r9
#endif
movq 40(%rcx), %r10 // bExtraStride
@ -266,6 +274,17 @@ LoopDz:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r14
popq %r13
popq %r12

View File

@ -11,16 +11,15 @@
.align 4
asm_function _AVX512_MNNGemmFloatUnit32x8
//void _AVX512_MNNGemmFloatUnit32x8(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4)
//void _AVX512_MNNGemmFloatUnit32x8(float* C, const float* A, const float* B, const size_t* parameter)
// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: hC4
// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter
pushq %rbp
movq %rsp, %rbp
pushq %rbx
#ifdef WIN32
movq 48(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
@ -30,12 +29,21 @@ movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
pushq %r14
movq %r8, %r9
#endif
movq 40(%rcx), %r10 // bExtraStride
@ -301,6 +309,17 @@ LoopDz:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r14
popq %r13
popq %r12

View File

@ -11,16 +11,15 @@
.align 4
asm_function _AVX512_MNNGemmFloatUnit48x8
//void _AVX512_MNNGemmFloatUnit48x8(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4)
//void _AVX512_MNNGemmFloatUnit48x8(float* C, const float* A, const float* B, const size_t* parameter)
// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: hC4
// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter
pushq %rbp
movq %rsp, %rbp
pushq %rbx
#ifdef WIN32
movq 48(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
@ -29,11 +28,20 @@ movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
movq %r8, %r9
#endif
movq 40(%rcx), %r10 // bExtraStride
@ -336,10 +344,22 @@ LoopDz:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r13
popq %r12
popq %rsi
popq %rdi
popq %rbx
popq %rbp
#else
popq %r13

View File

@ -14,9 +14,22 @@ asm_function _AVX512_MNNGemmFloatUnit48x8Fused
//void _AVX512_MNNGemmFloatUnit48x8Fused(float* C, const float* A, const float* B, const size_t* parameter, const float* p, const float* bias)
// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: postParameters, r9:bias
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter
// stack: postParameters, bias
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
#define push_registers_bytes ((3 + 1) * 8 + 32) // pushq + callq + shadow_space
movq (push_registers_bytes)(%rsp), %r8 // postParameters
movq (push_registers_bytes + 8)(%rsp), %r9 // bias
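// Three pushes (%rbp, %rdi, %rsi) plus the return address and the 32-byte shadow space give (3 + 1) * 8 + 32 = 64, so postParameters sits at 64(%rsp) and bias at 72(%rsp).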
pushq %rbx
pushq %r12
pushq %r13
@ -24,6 +37,26 @@ pushq %r14
pushq %r15
movq %r8, %r14
movq %r9, %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
movq %r8, %r14
movq %r9, %r15
#endif
movq 40(%rcx), %r10 // bExtraStride
movq 24(%rcx), %r8 // cStride
@ -402,12 +435,33 @@ LoopDz:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rbp
popq %rsi
popq %rdi
#else
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
#endif
popq %rbp
retq

View File

@ -12,8 +12,6 @@
#define AVX512F32 16
#define push_registers_bytes ((9 + 1) * 8) // pushq + callq
// caution: asm version is a sub-loop of _AVX512_MNNPackedSparseMatMulEpx4()
// void _AVX512_MNNPackedSparseMatMulEpx4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter,
// const float* postParameters, const float* bias, unsigned int* NNZMap,
@ -22,8 +20,29 @@ asm_function _AVX512_MNNPackedSparseMatMulEpx4_ASM
// SystemV Auto: rdi: C, rsi: A, rdx:B, rcx: eSize, r8: parameter, r9: postparameter,
// stack: bias, unsigned int* NNZMap, int* dataOffsetMap
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:eSize
// stack: parameter, postParameters, bias, unsigned int* NNZMap, int* dataOffsetMap
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
#define push_registers_bytes_ ((8 + 1) * 8 + 32) // pushq + callq + shadow_space
movq (push_registers_bytes_)(%rsp), %r8 // parameter
movq (push_registers_bytes_ + 8)(%rsp), %r9 // postparameter
#define push_registers_bytes (push_registers_bytes_ + 2 * 8) // pushq + callq + shadow_space + extra
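// Bumping push_registers_bytes by the two stack arguments already consumed (parameter, postParameters) lets the shared loads of bias / NNZMap / dataOffsetMap below reuse the SystemV offsets unchanged.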
#else
pushq %rax
pushq %rbx
pushq %r8
@ -32,7 +51,8 @@ pushq %r12
pushq %r13
pushq %r14
pushq %r15
#define push_registers_bytes ((9 + 1) * 8) // pushq + callq
#endif
movq (%r8), %r10 // eP * sizeof
shrq $(sizeof_value_lg2), %r10
@ -65,8 +85,8 @@ vbroadcastss 8(%r9), %zmm10
vbroadcastss 12(%r9), %zmm11
movq %r10, %r14
shrq $sparse_blockoc_log, %r14
shlq $sparse_blockoc_log, %r14 // h even divid sparse_blockoc
shrq $(sparse_blockoc_log), %r14
shlq $(sparse_blockoc_log), %r14 // round h down to a multiple of sparse_blockoc
movq (push_registers_bytes)(%rsp), %rdx // bias
movq (push_registers_bytes + 8)(%rsp), %rdi // unsigned int* NNZMap,
@ -79,6 +99,20 @@ movq (push_registers_bytes + 16)(%rsp), %rsi // int* dataOffsetMap
// movq %r8, %rdi
// movq %r9, %rsi
#ifdef WIN32
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#endif
movslq (%rsi), %r15
leaq (%rax, %r15, 4), %rax // a = a + diff;
addq $4, %rsi // dataOffsetMap++
@ -90,7 +124,7 @@ je loop_e48h4_end
loop_e48h4:
movq %r8, %r9
movq %r8, %r12
shrq $packC_unit_log, %r9
shrq $(packC_unit_log), %r9
andq $15, %r12 // ih % packC_unit
leaq (%rcx, %r12, sizeof_value), %r12
imulq %r11, %r9 // (ih >> packC_unit_log) * cStride
@ -246,7 +280,7 @@ loop_e48h4:
subq $4, %rsi // dataOffsetMap--
movslq (%rsi), %r15
addq $sparse_blockoc, %r8
addq $(sparse_blockoc), %r8
addq $4, %rdi
negq %r15
leaq (%rax, %r15, sizeof_value), %rax // a = a - diff;
@ -284,7 +318,7 @@ je loop_end
loop_e48h1:
movq %r8, %r9
movq %r8, %r12
shrq $packC_unit_log, %r9
shrq $(packC_unit_log), %r9
andq $15, %r12 // ih % packC_unit
leaq (%rcx, %r12, sizeof_value), %r12
imulq %r11, %r9 // (ih >> packC_unit_log) * cStride
@ -433,15 +467,37 @@ loop_e48h1_end:
loop_end:
popq %r15
popq %r14
popq %r13
popq %r12
popq %r9
popq %r8
popq %rbx
popq %rax
popq %rbp
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rsi
popq %rdi
#else
popq %r15
popq %r14
popq %r13
popq %r12
popq %r9
popq %r8
popq %rbx
popq %rax
#endif
popq %rbp
retq

View File

@ -21,7 +21,6 @@ pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
movq 48(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
@ -31,7 +30,17 @@ movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
@ -179,6 +188,17 @@ Loop:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r14
popq %r13
popq %r12

View File

@ -19,7 +19,8 @@ pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
movq 48(%rsp), %r10
#define push_registers_bytes ((1 + 1) * 8 + 32)
movq (push_registers_bytes)(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
@ -29,6 +30,17 @@ movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
@ -216,6 +228,17 @@ LoopDz:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r13
popq %r12
popq %rsi

View File

@ -19,7 +19,8 @@ pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
movq 48(%rsp), %r10
#define push_registers_bytes ((1 + 1) * 8 + 32)
movq (push_registers_bytes)(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
@ -29,6 +30,17 @@ movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
@ -191,6 +203,17 @@ LoopDz:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r13
popq %r12
popq %rsi

View File

@ -18,12 +18,41 @@ asm_function _AVX_MNNGemmFloatUnitMainFMA_Fused
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
#define push_registers_bytes ((3 + 1) * 8 + 32) // pushq + callq + shadow_space
movq (push_registers_bytes)(%rsp), %r8
movq (push_registers_bytes + 8)(%rsp), %r9
pushq %r12
pushq %r13
pushq %r14
pushq %r15
movq %r8, %r14
movq %r9, %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
pushq %r14
pushq %r15
movq %r8, %r14
movq %r9, %r15
#endif
movq 40(%rcx), %r10 // bExtraStride
movq 24(%rcx), %r8 // cStride
@ -232,10 +261,30 @@ LoopDz:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rsi
popq %rdi
#else
popq %r15
popq %r14
popq %r13
popq %r12
#endif
popq %rbp
retq

View File

@ -25,15 +25,39 @@ asm_function _AVX_MNNPackedSparseMatMulEpx1NFMA_ASM
// SystemV Auto: rdi: packedParas, rsi: bias, rdx: parameter, rcx: postParameters
// Microsoft x64 Auto: rcx:packedParas, rdx:bias, r8:parameter, r9:postParameters
// all callee save regs:
// %rbx, %rbp, %r12~%r15
// unused para regs: %r8, %r9
// can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %rax
pushq %rbx
pushq %r8
@ -42,6 +66,7 @@ pushq %r12
pushq %r13
pushq %r14
pushq %r15
#endif
movq (%rdi), %rax // %rax C
@ -203,6 +228,26 @@ LoopE24H1:
jmp LoopE24H1
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rsi
popq %rdi
#else
popq %r15
popq %r14
popq %r13
@ -211,6 +256,8 @@ popq %r9
popq %r8
popq %rbx
popq %rax
#endif
popq %rbp
retq

View File

@ -25,15 +25,39 @@ asm_function _AVX_MNNPackedSparseMatMulEpx4NFMA_ASM
// SystemV Auto: rdi: packedParas, rsi: bias, rdx: parameter, rcx: postParameters
// Microsoft x64 Auto: rcx:packedParas, rdx:bias, r8:parameter, r9:postParameters
// all callee save regs:
// %rbx, %rbp, %r12~%r15
// unused para regs: %r8, %r9
// can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %rax
pushq %rbx
pushq %r8
@ -42,6 +66,7 @@ pushq %r12
pushq %r13
pushq %r14
pushq %r15
#endif
movq (%rdi), %rax // %rax C
movq 8(%rdi), %rbx // %rbx A
@ -195,6 +220,26 @@ LoopE24H4:
jmp LoopE24H4
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rsi
popq %rdi
#else
popq %r15
popq %r14
popq %r13
@ -203,6 +248,8 @@ popq %r9
popq %r8
popq %rbx
popq %rax
#endif
popq %rbp
retq

View File

@ -56,15 +56,15 @@ message(STATUS "message ${CUDA_NVCC_FLAGS} !!!!!!!!!!!")
if(WIN32)
cuda_add_library(MNN_CUDA STATIC Register.cpp ${MNN_CUDA_SRC})
string(REPLACE "cublas.lib" "cudnn.lib" CUDNN_LIBRARIES ${CUDA_CUBLAS_LIBRARIES})
set(MNN_CUDA_LIBS MNN_CUDA ${CUDNN_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_LIBRARIES} PARENT_SCOPE)
set(MNN_CUDA_LIBS MNN_CUDA ${CUDA_LIBRARIES} PARENT_SCOPE)
else()
cuda_add_library(MNN_Cuda_Main SHARED ${MNN_CUDA_SRC})
set(MNN_CUDA_LIBS MNN_Cuda_Main cudnn cublas PARENT_SCOPE)
set(MNN_CUDA_LIBS MNN_Cuda_Main PARENT_SCOPE)
add_library(MNN_CUDA OBJECT Register.cpp)
endif()
include_directories(
${CMAKE_CURRENT_LIST_DIR}/
${CUDA_INCLUDE_DIRS}
${CMAKE_SOURCE_DIR}/include/
)

View File

@ -14,6 +14,11 @@
#include "core/Macro.h"
#include "shape/SizeComputer.hpp"
#include "core/TensorUtils.hpp"
#include "execution/Raster.cuh"
#include "execution/Transpose.cuh"
#include "execution/MNNCUDADefine.hpp"
// #define MNN_CUDA_COPY_DEBUG
namespace MNN {
namespace CUDA {
@ -30,22 +35,18 @@ public:
// Do nothing
}
virtual ~ CUDARuntimeAllocator() = default;
virtual std::pair<void*, int> onAlloc(int size, int align) override {
virtual std::pair<void*, size_t> onAlloc(size_t size, size_t align) override {
return std::make_pair(mRuntime->alloc(size), 0);
}
virtual void onRelease(std::pair<void*, int> ptr) override {
virtual void onRelease(std::pair<void*, size_t> ptr) override {
mRuntime->free(ptr.first);
}
private:
CUDARuntime* mRuntime;
};
CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, BackendConfig::PowerMode power) {
// Shader precision
if (precision == BackendConfig::Precision_Low) {
mCUDARuntime.reset(new CUDARuntime(true, -1));
} else {
mCUDARuntime.reset(new CUDARuntime(false, -1));
}
// TODO: Search CUDA Device info and use best one
mCUDARuntime.reset(new CUDARuntime(-1));
if (mCUDARuntime.get()) {
if (mCUDARuntime->isCreateError() == true) {
mIsCreateError = true;
@ -54,6 +55,7 @@ CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, B
std::shared_ptr<BufferAllocator::Allocator> allocator(new CUDARuntimeAllocator(mCUDARuntime.get()));
mBufferPool.reset(new BufferAllocator(allocator));
}
mDefaultPrecision = precision;
}
CUDARuntimeWrapper::~CUDARuntimeWrapper() {
// Do nothing
@ -64,7 +66,12 @@ float CUDARuntimeWrapper::onGetMemoryInMB() {
}
Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config) const {
return new CUDABackend(mBufferPool, mCUDARuntime);
auto mode = mDefaultPrecision;
if (nullptr != config) {
mode = config->precision;
}
bool useFp16 = mode == BackendConfig::Precision_Low;
return new CUDABackend(mBufferPool, mCUDARuntime, useFp16);
}
void CUDARuntimeWrapper::onGabageCollect(int level) {
@ -72,11 +79,12 @@ void CUDARuntimeWrapper::onGabageCollect(int level) {
}
CUDABackend::CUDABackend(std::shared_ptr<BufferAllocator> st,
std::shared_ptr<CUDARuntime> rt)
std::shared_ptr<CUDARuntime> rt, bool useFp16AsFp32)
: Backend(MNN_FORWARD_CUDA) {
mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(st.get())));
mStaticBufferPool = st;
mCUDARuntime = rt;
mUseFp16AsFp32 = useFp16AsFp32;
}
CUDABackend::~CUDABackend() {
@ -89,6 +97,9 @@ CUDARuntime* CUDABackend::getCUDARuntime() {
MNN_ASSERT(nullptr != mCUDARuntime.get());
return mCUDARuntime.get();
}
bool CUDABackend::useFp16() const {
return mUseFp16AsFp32;
}
class CUDAMemObj : public Backend::MemObj {
public:
@ -103,12 +114,27 @@ private:
BufferAllocator* mAllocator;
std::pair<void*, int> mPoint;
};
int CUDABackend::getBytes(const Tensor* tensor) const {
auto bytes = tensor->getType().bytes();
if (mUseFp16AsFp32) {
if (halide_type_float == tensor->getType().code) {
bytes = 2;
}
}
return bytes;
}
CPUResizeCache* CUDABackend::getCache() {
return &mCache;
}
Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType storageType) {
#ifdef LOG_VERBOSE
MNN_PRINT("Start CUDABackend::onAcquireBuffer !\n");
#endif
BufferAllocator* allocator = nullptr;
int mallocSize = realSize(nativeTensor) * nativeTensor->getType().bytes();
auto bytes = getBytes(nativeTensor);
size_t mallocSize = realSize(nativeTensor) * bytes;
std::pair<void*, int> buffer;
if (storageType == DYNAMIC_SEPERATE) {
buffer = mBufferPool->alloc(mallocSize, true);
@ -132,13 +158,23 @@ Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType
}
bool CUDABackend::onClearBuffer() {
mCache.reset();
mBufferPool->release(true);
return true;
}
size_t CUDABackend::realSize(const Tensor* tensor) {
auto dim = TensorUtils::getDescribe(tensor)->dimensionFormat;
int pack = 1;
if (dim == MNN_DATA_FORMAT_NC4HW4) {
pack = PACK_NUMBER;
}
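// For NC4HW4 tensors the channel axis (dimension 1) is padded up to a multiple of PACK_NUMBER so the packed device buffer is fully allocated.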
size_t res = 1;
for (int i = 0; i < tensor->dimensions(); ++i) {
res *= tensor->length(i);
size_t l = tensor->length(i);
if (1 == i ) {
l = UP_DIV(l, pack) * pack;
}
res *= l;
}
return res;
}
@ -186,47 +222,332 @@ void CUDABackend::onExecuteBegin() const {
void CUDABackend::onExecuteEnd() const {
}
static void _computeStride(MNN_DATA_FORMAT srcDimensionFormat, int* srcStride, int batch, int plane, int channel, int srcPack) {
if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
srcStride[0] = plane * srcPack;
srcStride[1] = plane * batch * PACK_NUMBER;
srcStride[2] = srcPack;
} else if (srcDimensionFormat == MNN_DATA_FORMAT_NCHW) {
srcStride[0] = channel * plane;
srcStride[1] = plane * PACK_NUMBER;
srcStride[2] = 1;
} else {
srcStride[0] = channel * plane;
srcStride[1] = PACK_NUMBER;
srcStride[2] = channel;
}
}
static void _computeBCA(int& batch, int& plane, int& channel, MNN_DATA_FORMAT srcDimensionFormat, const Tensor* srcTensor) {
if (srcDimensionFormat != MNN_DATA_FORMAT_NHWC) {
batch = srcTensor->length(0);
channel = srcTensor->length(1);
plane = 1;
for (int i=2; i<srcTensor->dimensions(); ++i) {
plane *= srcTensor->length(i);
}
} else {
batch = srcTensor->length(0);
channel = srcTensor->length(srcTensor->dimensions()-1);
plane = 1;
for (int i=1; i<srcTensor->dimensions()-1; ++i) {
plane *= srcTensor->length(i);
}
}
}
static PackInfo _computePackInfo(MNN_DATA_FORMAT srcDimensionFormat, int batch, int plane, int channel) {
PackInfo pack;
pack.inside = plane;
pack.axis = channel;
pack.unit = PACK_NUMBER;
pack.outside = batch;
if (srcDimensionFormat == MNN_DATA_FORMAT_NHWC) {
pack.axisStride = 1;
pack.insideStride = channel;
} else {
pack.axisStride = plane;
pack.insideStride = 1;
}
return pack;
}
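// Hedged reading of PackInfo: outside/axis/inside map to batch/channel/plane, unit is the device pack size, and axisStride/insideStride give the source strides for NHWC- vs NCHW-style layouts.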
void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
auto srcDimensionFormat = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
auto srcDevice = srcTensor->deviceId() != 0;
auto dstDimensionFormat = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
auto srcDevice = srcTensor->deviceId() != 0;
auto dstDevice = dstTensor->deviceId() != 0;
if (srcDevice && srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
srcDimensionFormat = MNN_DATA_FORMAT_NCHW;
MNN_ASSERT(srcDevice || dstDevice);
uint8_t* srcPtr = nullptr;
std::pair<void*, int> tempSrcStorage;
auto bytes = getBytes(srcTensor);
auto type = srcTensor->getType();
#ifdef MNN_CUDA_COPY_DEBUG
MNN_PRINT("CUDA Bn copy: %d -> %d, format %d -> %d, dims: [", srcDevice, dstDevice, srcDimensionFormat, dstDimensionFormat);
for (int i=0; i<srcTensor->dimensions(); ++i) {
MNN_PRINT("%d ", srcTensor->length(i));
}
if (dstDevice && dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
dstDimensionFormat = MNN_DATA_FORMAT_NCHW;
MNN_PRINT("]\n");
#endif
bool directCopy = (srcDimensionFormat == dstDimensionFormat && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) || srcTensor->dimensions() <= 1;
if (mUseFp16AsFp32) {
if ((!srcDevice) || (!dstDevice)) {
if (type.code == halide_type_float) {
directCopy = false;
}
}
}
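// Direct memcpy is only taken when no layout conversion is needed; with fp16 storage a host<->device float copy also needs a conversion, so it falls through to the packing path below.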
auto needSize = realSize(srcTensor) * srcTensor->getType().bytes();
std::shared_ptr<Tensor> srcTempTensor;
std::shared_ptr<Tensor> dstTempTensor;
if (srcTensor->deviceId() != 0 && dstTensor->deviceId() != 0) {
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), needSize,
MNNMemcpyDeviceToDevice, true);
if (directCopy) {
auto gpuSize = realSize(srcTensor) * getBytes(srcTensor);
if (srcDevice && dstDevice) {
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), gpuSize,
MNNMemcpyDeviceToDevice, true);
} else if (srcDevice && (!dstDevice)) {
mCUDARuntime->memcpy((void*)(dstTensor->host<void>()), (void*)(srcTensor->deviceId()), gpuSize,
MNNMemcpyDeviceToHost, true);
} else if ((!srcDevice) && (dstDevice)) {
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->host<void>()), gpuSize,
MNNMemcpyHostToDevice, true);
}
return;
}
if (!srcDevice) {
auto cpuSize = srcTensor->size();
tempSrcStorage = mStaticBufferPool->alloc(cpuSize);
srcPtr = (uint8_t*)tempSrcStorage.first + tempSrcStorage.second;
mCUDARuntime->memcpy(srcPtr, srcTensor->host<void>(), cpuSize, MNNMemcpyHostToDevice,
true);
} else {
srcPtr = (uint8_t*)srcTensor->deviceId();
}
uint8_t* dstPtr = nullptr;
std::pair<void*, int> tempDstStorage;
if (!dstDevice) {
auto cpuSize = dstTensor->size();
tempDstStorage = mStaticBufferPool->alloc(cpuSize);
dstPtr = (uint8_t*)tempDstStorage.first + tempDstStorage.second;
} else {
dstPtr = (uint8_t*)dstTensor->deviceId();
}
if (srcTensor->deviceId() != 0 && dstTensor->deviceId() == 0) {
if(srcDimensionFormat != dstDimensionFormat) {
dstTempTensor.reset(new Tensor(srcTensor, srcTensor->getDimensionType(), true));
mCUDARuntime->memcpy(dstTempTensor->host<void>(), (void*)(srcTensor->deviceId()), needSize, MNNMemcpyDeviceToHost,
true);
MNNCPUCopyBuffer(dstTempTensor.get(), dstTensor);
// Format convert
FuseRegion reg;
int* size = reg.size;
int* srcStride = reg.srcStride;
int* dstStride = reg.dstStride;
int offset[PACK_NUMBER * 8];
int offsetNumber = 0;
auto offsetGpuStorage = mStaticBufferPool->alloc(PACK_NUMBER * 8 * sizeof(int));
auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second;
auto regionStorage = mStaticBufferPool->alloc(sizeof(FuseRegion));
auto regionGpu = (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second);
do {
if (srcTensor->deviceId() != 0 && dstTensor->deviceId() != 0) {
if (srcTensor->dimensions() <= 1 || srcDimensionFormat == dstDimensionFormat) {
auto gpuSize = realSize(srcTensor) * getBytes(srcTensor);
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), gpuSize,
MNNMemcpyDeviceToDevice, true);
} else {
int batch, plane, channel;
_computeBCA(batch, plane, channel, srcDimensionFormat, srcTensor);
PackInfo pack;
auto func = PackBuffer;
if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
pack = _computePackInfo(srcDimensionFormat, batch, plane, channel);
func = PackBuffer;
} else if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
pack = _computePackInfo(dstDimensionFormat, batch, plane, channel);
func = UnpackBuffer;
} else {
FUNC_PRINT(1);
}
func((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), &pack, getBytes(srcTensor), mCUDARuntime.get());
}
break;
}
auto convertFunction = FuseRasterBlitFloatToFloat;
if (mUseFp16AsFp32) {
if (!srcDevice) {
convertFunction = FuseRasterBlitFloatToHalf;
} else {
convertFunction = FuseRasterBlitHalfToFloat;
}
}
if (srcTensor->dimensions() <= 1) {
size[2] = srcTensor->elementSize();
srcStride[2] = 1;
dstStride[2] = 1;
offset[0] = 1;
offset[1] = 1;
offset[2] = size[2];
offset[3] = 0;
offset[4] = 1;
offset[5] = 1;
offset[6] = size[2];
offset[7] = 0;
offsetNumber = 1;
} else {
mCUDARuntime->memcpy(dstTensor->host<void>(), (void*)(srcTensor->deviceId()), needSize, MNNMemcpyDeviceToHost,
true);
// Compute batch, plane, channel
int batch, plane, channel;
_computeBCA(batch, plane, channel, srcDimensionFormat, srcTensor);
if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4 && srcDimensionFormat != MNN_DATA_FORMAT_NC4HW4 && dstDevice) {
PackInfo pack = _computePackInfo(srcDimensionFormat, batch, plane, channel);
if (mUseFp16AsFp32) {
if (type.code == halide_type_float) {
if (dstDevice) {
PackFP32ToFP16(dstPtr, srcPtr, &pack, mCUDARuntime.get());
break;
} else {
PackFP16ToFP32(dstPtr, srcPtr, &pack, mCUDARuntime.get());
break;
}
}
} else {
PackBuffer(dstPtr, srcPtr, &pack, bytes, mCUDARuntime.get());
}
break;
}
if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4 && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4 && srcDevice) {
PackInfo pack = _computePackInfo(dstDimensionFormat, batch, plane, channel);
if (mUseFp16AsFp32) {
if (type.code == halide_type_float) {
if (dstDevice) {
UnpackFP32ToFP16(dstPtr, srcPtr, &pack, mCUDARuntime.get());
break;
} else {
UnpackFP16ToFP32(dstPtr, srcPtr, &pack, mCUDARuntime.get());
break;
}
}
} else {
UnpackBuffer(dstPtr, srcPtr, &pack, bytes, mCUDARuntime.get());
}
break;
}
//MNN_PRINT("host/device: %d -> %d, format %d -> %d, b, p, c: %d - %d - %d\n", srcDevice, dstDevice, srcDimensionFormat, dstDimensionFormat, batch, plane, channel);
// Set region
if (srcDimensionFormat != MNN_DATA_FORMAT_NC4HW4 && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
size[0] = batch;
size[1] = channel;
size[2] = plane;
offsetNumber = 1;
offset[0] = batch;
offset[1] = channel;
offset[2] = plane;
offset[3] = 0;
offset[4] = batch;
offset[5] = channel;
offset[6] = plane;
offset[7] = 0;
if (srcDimensionFormat == MNN_DATA_FORMAT_NHWC) {
srcStride[0] = channel * plane;
srcStride[1] = 1;
srcStride[2] = channel;
} else {
srcStride[0] = channel * plane;
srcStride[1] = plane;
srcStride[2] = 1;
}
if (dstDimensionFormat == MNN_DATA_FORMAT_NHWC) {
dstStride[0] = channel * plane;
dstStride[1] = 1;
dstStride[2] = channel;
} else {
dstStride[0] = channel * plane;
dstStride[1] = plane;
dstStride[2] = 1;
}
} else {
offsetNumber = PACK_NUMBER;
size[0] = batch;
size[1] = UP_DIV(channel, PACK_NUMBER);
size[2] = plane;
int srcPack = 1;
int dstPack = 1;
int srcChannelLimit = channel;
if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
if (srcDevice) {
srcPack = PACK_NUMBER;
srcChannelLimit = UP_DIV(channel, PACK_NUMBER) * PACK_NUMBER;
} else {
srcPack = 4;
srcChannelLimit = UP_DIV(channel, 4) * 4;
}
}
int dstChannelLimit = channel;
if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
if (dstDevice) {
dstPack = PACK_NUMBER;
dstChannelLimit = UP_DIV(channel, PACK_NUMBER) * PACK_NUMBER;
} else {
dstPack = 4;
dstChannelLimit = UP_DIV(channel, 4) * 4;
}
}
// Compute Stride
_computeStride(srcDimensionFormat, srcStride, batch, plane, channel, srcPack);
_computeStride(dstDimensionFormat, dstStride, batch, plane, channel, dstPack);
// Compute Offset
for (int i=0; i<offsetNumber; ++i) {
auto offsetPtr = offset + i * 8;
int channelStart = i;
offsetPtr[0] = batch;
offsetPtr[1] = (srcChannelLimit + PACK_NUMBER - channelStart - 1) / PACK_NUMBER;
offsetPtr[2] = plane;
if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
int sp = i / srcPack;
int sm = i % srcPack;
offsetPtr[3] = sm + sp * srcPack * plane * batch;
} else {
offsetPtr[3] = channelStart * srcStride[1] / PACK_NUMBER;
}
offsetPtr[4] = batch;
offsetPtr[5] = (dstChannelLimit + PACK_NUMBER - channelStart - 1) / PACK_NUMBER;
offsetPtr[6] = plane;
if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
int sp = i / dstPack;
int sm = i % dstPack;
offsetPtr[7] = sm + sp * dstPack * plane * batch;
} else {
offsetPtr[7] = channelStart * dstStride[1] / PACK_NUMBER;
}
}
}
}
reg.fuseNumber = offsetNumber;
mCUDARuntime->memcpy(regionGpu, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
mCUDARuntime->memcpy(offsetGpu, offset, offsetNumber * 8 * sizeof(int), MNNMemcpyHostToDevice, true);
#ifdef MNN_CUDA_COPY_DEBUG
MNN_PRINT("Reg.size: %d - %d - %d\n", reg.size[0], reg.size[1], reg.size[2]);
MNN_PRINT("Reg.srcStride: %d - %d - %d\n", reg.srcStride[0], reg.srcStride[1], reg.srcStride[2]);
MNN_PRINT("Reg.dstStride: %d - %d - %d\n", reg.dstStride[0], reg.dstStride[1], reg.dstStride[2]);
MNN_PRINT("FuseNum: %d\n", reg.fuseNumber);
for (int i=0; i<reg.fuseNumber; ++i) {
auto off = offset + 8 * i;
MNN_PRINT("Src: %d, %d, %d, %d; dst:%d, %d, %d, %d\n", off[0], off[1], off[2], off[3], off[4], off[5], off[6], off[7]);
}
#endif
if (mUseFp16AsFp32) {
if (type.code == halide_type_float) {
convertFunction(dstPtr, srcPtr, regionGpu, offsetGpu, mCUDARuntime.get());
break;
}
}
FuseRasterBlitCommon(dstPtr, srcPtr, regionGpu, offsetGpu, mCUDARuntime.get(), type.bytes());
} while(false);
mStaticBufferPool->free(offsetGpuStorage);
mStaticBufferPool->free(regionStorage);
if (!srcDevice) {
mStaticBufferPool->free(tempSrcStorage);
}
if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) {
if (srcDimensionFormat != dstDimensionFormat) {
srcTempTensor.reset(new Tensor(dstTensor, dstTensor->getDimensionType(), true));
MNNCPUCopyBuffer(srcTensor, srcTempTensor.get());
srcTensor = srcTempTensor.get();
}
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), srcTensor->host<void>(), needSize, MNNMemcpyHostToDevice,
if (!dstDevice) {
auto cpuSize = dstTensor->size();
mCUDARuntime->memcpy(dstTensor->host<void>(), dstPtr, cpuSize, MNNMemcpyDeviceToHost,
true);
mStaticBufferPool->free(tempDstStorage);
}
return;
}

View File

@ -17,6 +17,7 @@
#include "core/Macro.h"
#include "core/ConvolutionCommon.hpp"
#include "core/BufferAllocator.hpp"
#include "backend/cpu/CPUResizeCache.hpp"
namespace MNN {
namespace CUDA {
class MNN_PUBLIC CUDARuntimeWrapper : public Runtime {
@ -37,11 +38,12 @@ private:
std::shared_ptr<BufferAllocator> mBufferPool;
std::shared_ptr<CUDARuntime> mCUDARuntime;
bool mIsCreateError{false};
BackendConfig::PrecisionMode mDefaultPrecision;
};
class CUDABackend : public Backend {
public:
CUDABackend(std::shared_ptr<BufferAllocator> st, std::shared_ptr<CUDARuntime> rt);
CUDABackend(std::shared_ptr<BufferAllocator> st, std::shared_ptr<CUDARuntime> rt, bool useFp16AsFp32);
~CUDABackend();
CUDARuntime *getCUDARuntime();
@ -74,11 +76,15 @@ public:
return mStaticBufferPool.get();
}
static size_t realSize(const Tensor *tensor);
int getBytes(const Tensor* tensor) const;
CPUResizeCache* getCache();
bool useFp16() const;
private:
std::shared_ptr<BufferAllocator> mBufferPool;
std::shared_ptr<BufferAllocator> mStaticBufferPool;
std::shared_ptr<CUDARuntime> mCUDARuntime;
CPUResizeCache mCache;
bool mUseFp16AsFp32 = false;
};
template <class T>

View File

@ -15,17 +15,11 @@
#include <utility>
#include <vector>
#include "core/Macro.h"
// #define MNN_CUDA_USE_BLAS
//#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp>
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)
// #define LOG_VERBOSE
#define CUDNN_VERSION_STR STR(CUDNN_MAJOR) "." STR(CUDNN_MINOR) "." STR(CUDNN_PATCHLEVEL)
#pragma message "compile with cuda " STR(CUDART_VERSION) " "
#pragma message "compile with cuDNN " CUDNN_VERSION_STR " "
static_assert(!(CUDNN_MAJOR == 5 && CUDNN_MINOR == 1), "cuDNN 5.1.x series has bugs. Use 5.0.x instead.");
#undef STR
#undef STR_HELPER
@ -36,7 +30,7 @@ bool CUDARuntime::isCreateError() const {
return mIsCreateError;
}
CUDARuntime::CUDARuntime(bool permitFloat16, int device_id) {
CUDARuntime::CUDARuntime(int device_id) {
#ifdef LOG_VERBOSE
MNN_PRINT("start CUDARuntime !\n");
#endif
@ -49,42 +43,39 @@ CUDARuntime::CUDARuntime(bool permitFloat16, int device_id) {
mDeviceId = id;
cuda_check(cudaGetDeviceProperties(&mProp, id));
MNN_ASSERT(mProp.maxThreadsPerBlock > 0);
#ifdef MNN_CUDA_USE_BLAS
cublas_check(cublasCreate(&mCublasHandle));
// Set stream for cuDNN and cublas handles.
// Note that all cublas scalars (alpha, beta) and scalar results such as dot
// output resides at device side.
cublas_check(cublasSetPointerMode(mCublasHandle, CUBLAS_POINTER_MODE_HOST));
cudnn_check(cudnnCreate(&mCudnnHandle));
#endif
}
CUDARuntime::~CUDARuntime() {
#ifdef LOG_VERBOSE
MNN_PRINT("start ~CUDARuntime !\n");
#endif
#ifdef MNN_CUDA_USE_BLAS
cublas_check(cublasDestroy(mCublasHandle));
cudnn_check(cudnnDestroy(mCudnnHandle));
#endif
#ifdef LOG_VERBOSE
MNN_PRINT("end ~CUDARuntime !\n");
#endif
}
int CUDARuntime::blocks_num(const int total_threads) {
int maxNum = mProp.maxThreadsPerBlock;
if(total_threads / 32 > maxNum) {
mThreadPerBlock = maxNum;
} else if(total_threads / 16 > maxNum) {
mThreadPerBlock = maxNum / 2;
} else if(total_threads / 8 > maxNum) {
mThreadPerBlock = maxNum / 4;
} else if(total_threads / 4 > maxNum) {
mThreadPerBlock = maxNum / 8;
} else {
mThreadPerBlock = 128;
}
size_t CUDARuntime::blocks_num(const size_t total_threads) {
// size_t maxNum = mProp.maxThreadsPerBlock;
// if(total_threads / 32 > maxNum) {
// mThreadPerBlock = maxNum;
// } else if(total_threads / 16 > maxNum) {
// mThreadPerBlock = maxNum / 2;
// } else if(total_threads / 8 > maxNum) {
// mThreadPerBlock = maxNum / 4;
// } else if(total_threads / 4 > maxNum) {
// mThreadPerBlock = maxNum / 8;
// } else {
// mThreadPerBlock = 128;
// }
mThreadPerBlock = 128;
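// Ceiling division: enough blocks of mThreadPerBlock threads to cover total_threads.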
return (total_threads + mThreadPerBlock - 1) / mThreadPerBlock;
}
@ -148,13 +139,4 @@ void CUDARuntime::memcpy(void *dst, const void *src, size_t size_in_bytes, MNNMe
void CUDARuntime::memset(void *dst, int value, size_t size_in_bytes) {
cuda_check(cudaMemset(dst, value, size_in_bytes));
}
cublasHandle_t CUDARuntime::cublas_handle() {
return mCublasHandle;
}
cudnnHandle_t CUDARuntime::cudnn_handle() {
return mCudnnHandle;
}
} // namespace MNN

View File

@ -16,19 +16,14 @@
#include <string>
#include <vector>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cudnn.h>
#include <cusolverDn.h>
#include <sstream>
#include <string>
#include <vector>
#include "Type_generated.h"
#include "core/Macro.h"
#if CUDA_VERSION >= 10010
#include <cublasLt.h>
#endif
typedef enum {
CUDA_FLOAT32 = 0,
@ -49,40 +44,30 @@ typedef enum {
} \
} while (0)
#define cublas_check(_x) \
do { \
cublasStatus_t _err = (_x); \
if (_err != CUBLAS_STATUS_SUCCESS) { \
MNN_CHECK(_err, #_x); \
} \
} while (0)
#define cudnn_check(_x) \
do { \
cudnnStatus_t _err = (_x); \
if (_err != CUDNN_STATUS_SUCCESS) { \
MNN_CHECK(_err, #_x); \
} \
} while (0)
#define cusolver_check(_x) \
do { \
cusolverStatus_t _err = (_x); \
if (_err != CUSOLVER_STATUS_SUCCESS) { \
MNN_CHECK(_err, #_x); \
} \
} while (0)
#define after_kernel_launch() \
do { \
cuda_check(cudaGetLastError()); \
} while (0)
#ifdef DEBUG
#define checkKernelErrors\
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
printf("File:%s Line %d: failed: %s\n", __FILE__, __LINE__,\
cudaGetErrorString(__err)); \
abort(); \
} \
} while (0)
#else
#define checkKernelErrors
#endif
namespace MNN {
class CUDARuntime {
public:
CUDARuntime(bool permitFloat16, int device_id);
CUDARuntime(int device_id);
~CUDARuntime();
CUDARuntime(const CUDARuntime &) = delete;
CUDARuntime &operator=(const CUDARuntime &) = delete;
@ -105,16 +90,14 @@ public:
void memcpy(void *dst, const void *src, size_t size_in_bytes, MNNMemcpyKind_t kind, bool sync = false);
void memset(void *dst, int value, size_t size_in_bytes);
cublasHandle_t cublas_handle();
cudnnHandle_t cudnn_handle();
int threads_num() {
size_t threads_num() {
return mThreadPerBlock;
}
int major_sm() const {
return mProp.major;
}
int blocks_num(const int total_threads);
size_t blocks_num(const size_t total_threads);
const cudaDeviceProp& prop() const {
return mProp;
}
@ -123,15 +106,12 @@ private:
cudaDeviceProp mProp;
int mDeviceId;
cublasHandle_t mCublasHandle;
cudnnHandle_t mCudnnHandle;
bool mIsSupportedFP16 = false;
bool mSupportDotInt8 = false;
bool mSupportDotAccInt8 = false;
float mFlops = 4.0f;
bool mIsCreateError{false};
int mThreadPerBlock = 128;
size_t mThreadPerBlock = 128;
};
} // namespace MNN

View File

@ -1,119 +0,0 @@
#include "BatchMatMulExecution.hpp"
namespace MNN {
namespace CUDA {
template <typename T>
__global__ void add_bias(T *input, T *output, const T* bias, int batch, int e, int h) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch * e * h; index += blockDim.x * gridDim.x) {
int i = index % (e*h);
int b = index / (e*h);
int y = i % h;
output[index] = input[index] + bias[b * h + y];
}
return;
}
BatchMatMulExecution::BatchMatMulExecution(bool transposeA, bool transposeB, Backend *backend) : Execution(backend) {
mTransposeA = transposeA;
mTransposeB = transposeB;
}
BatchMatMulExecution::~ BatchMatMulExecution() {
// do nothing
}
ErrorCode BatchMatMulExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto C = outputs[0];
auto dimensions = C->dimensions();
int batch = 1;
for (int i = 0; i < dimensions - 2; ++i) {
batch *= C->length(i);
}
auto e = C->length(dimensions-2);
auto h = C->length(dimensions-1);
if(inputs.size() > 2) {
mTempOutput.reset(Tensor::createDevice<float>({batch*h*e}));
auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
ErrorCode BatchMatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto blasHandle = runtime->cublas_handle();
const Tensor* A = inputs[0];
const Tensor* B = inputs[1];
auto dimensions = A->dimensions();
int batch = 1;
for (int i = 0; i < dimensions - 2; ++i) {
batch *= A->length(i);
}
auto w0 = inputs[0]->length(dimensions-1);
auto h0 = inputs[0]->length(dimensions-2);
auto C = outputs[0];
auto e = C->length(dimensions-2);
auto h = C->length(dimensions-1);
auto l = w0;
if (mTransposeA) {
l = h0;
}
auto APtr = (const float*)A->deviceId();
auto BPtr = (const float*)B->deviceId();
auto CDestPtr = (float*)C->deviceId();
float alpha = 1.0f;
float beta = 0.0f;
auto tranB = CUBLAS_OP_N;
auto ldB = h;
if (mTransposeB) {
ldB = l;
tranB = CUBLAS_OP_T;
}
auto tranA = CUBLAS_OP_N;
auto ldA = l;
if (mTransposeA) {
ldA = e;
tranA = CUBLAS_OP_T;
}
// [b, e, l] x [b, l, h] -> [b, e, h]
if(inputs.size() == 2) {
auto status = cublasSgemmStridedBatched(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, l*h, APtr, ldA, e*l, &beta, CDestPtr, h, e*h, batch);
cublas_check(status);
//cudaThreadSynchronize();
} else {
auto CPtr = (float*)mTempOutput->deviceId();
auto status = cublasSgemmStridedBatched(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, l*h, APtr, ldA, e*l, &beta, CPtr, h, e*h, batch);
cublas_check(status);
//cudaThreadSynchronize();
//add bias: [b, e, h] + [b, h] -> [b, e, h]
int block_num = runtime->blocks_num(batch*e*h);
int threads_num = runtime->threads_num();
add_bias<<<block_num, threads_num>>>((float*)CPtr, (float*)CDestPtr, (const float*)inputs[2]->deviceId(), batch, e, h);
}
return NO_ERROR;
}
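// cuBLAS is column-major, so the calls above compute the row-major product
// C[e, h] = A[e, l] * B[l, h] by evaluating C^T = B^T * A^T: the operands are passed
// in (B, A) order with m = h, n = e, k = l, and each matrix's batch stride equals its
// element count (l*h, e*l, e*h). A sketch of the same mapping for the non-transposed
// case (assumes a valid cublasHandle_t and contiguous row-major device pointers):
//   float alpha = 1.0f, beta = 0.0f;
//   cublasSgemmStridedBatched(handle,
//       CUBLAS_OP_N, CUBLAS_OP_N,    // no extra transpose when A, B are row-major contiguous
//       h, e, l,                     // m, n, k of the column-major view
//       &alpha,
//       B, h, (long long)l * h,      // leading dimension h, stride per batch l*h
//       A, l, (long long)e * l,      // leading dimension l, stride per batch e*l
//       &beta,
//       C, h, (long long)e * h,      // leading dimension h, stride per batch e*h
//       batch);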
class BatchMatMulCreator : public CUDABackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
auto param = op->main_as_BatchMatMulParam();
return new BatchMatMulExecution(param->adjX(), param->adjY(), backend);
}
};
static CUDACreatorRegister<BatchMatMulCreator> __init(OpType_BatchMatMul);
}
}

View File

@ -1,23 +0,0 @@
#ifndef BatchMatMulExecution_hpp
#define BatchMatMulExecution_hpp
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
namespace MNN {
namespace CUDA {
class BatchMatMulExecution : public Execution {
public:
BatchMatMulExecution(bool transposeA, bool transposeB, Backend *backend);
virtual ~BatchMatMulExecution();
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
std::shared_ptr<Tensor> mTempOutput;
bool mTransposeA;
bool mTransposeB;
};
} // namespace CUDA
} // namespace MNN
#endif

View File

@ -50,11 +50,16 @@ ErrorCode BinaryExecution::onExecute(const std::vector<Tensor *> &inputs, const
int stride0[3] = {0, 0, s0};
int stride1[3] = {0, 0, s1};
int stride2[3] = {0, 0, 1};
auto type = outputs[0]->getType();
if (type.code == halide_type_float) {
// Use Half or float
type.bits = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]) * 8;
}
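// halide_type_t carries a (code, bits) pair; keeping code as float and overwriting
// bits with the backend's per-tensor byte size lets BinaryBlit select the half or
// float kernel from a single type value, e.g. (illustrative values only):
//   halide_type_t t = outputs[0]->getType();   // {halide_type_float, 32}
//   t.bits = backendBytes * 8;                 // 2 bytes -> 16 bits -> half path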
auto computeFunction = [&](Tensor* input0T, Tensor* input1T, Tensor* outputT) {
auto input0 = (uint8_t*)input0T->deviceId();
auto input1 = (uint8_t*)input1T->deviceId();
auto output = (uint8_t*)outputT->deviceId();
BinaryBlit(output, input0, input1, size, stride0, stride1, stride2, outputT->getType(), runtime, mType);
BinaryBlit(output, input0, input1, size, stride0, stride1, stride2, type, runtime, mType);
};
computeFunction(inputs[0], inputs[1], outputs[0]);
for (int i=2; i<inputs.size(); ++i) {

View File

@ -1,61 +1,324 @@
#include "ConvDepthWiseExecution.hpp"
#include "core/ConvolutionCommon.hpp"
#include "Raster.cuh"
#include <float.h>
#include "MNNCUDADefine.hpp"
#include "MNNCUDAFunction.cuh"
namespace MNN {
namespace CUDA {
struct constBuffer {
int pad[2];
int kernelSize[2];
int stride[2];
int dilate[2];
int inputSize[2];
int outputSize[2];
int channel;
int subChannel;
int total;
int activationType;
} uConstant;
#define PACK_NUMBER_C2 (PACK_NUMBER/2)
ConvDepthWiseExecution::ConvDepthWiseExecution(const Op* op, Backend* bn) : Execution(bn) {
#define MNN_CUDA_HALF2_MAX(a, b) \
do { \
(a).x = __hgt((a).x, (b).x) ? (a).x : (b).x; \
(a).y = __hgt((a).y, (b).y) ? (a).y : (b).y; \
} while (0)
#define MNN_CUDA_HALF2_MIN(a, b) \
do { \
(a).x = __hlt((a).x, (b).x) ? (a).x : (b).x; \
(a).y = __hlt((a).y, (b).y) ? (a).y : (b).y; \
} while (0)
__global__ void CONV_DW_HALF(const half2* input, const half2* kernel, const half2* bias, half2 *output, const constBuffer* uConstant) {
half2 maxV = half2(uConstant->maxValue, uConstant->maxValue);
half2 minV = half2(uConstant->minValue, uConstant->minValue);
int iw = uConstant->inputSize[0];
int ih = uConstant->inputSize[1];
int c = uConstant->channel;
int ow = uConstant->outputSize[0];
int oh = uConstant->outputSize[1];
int kw = uConstant->kernelSize[0];
int kh = uConstant->kernelSize[1];
int dw = uConstant->dilate[0];
int dh = uConstant->dilate[1];
int sw = uConstant->stride[0];
int sh = uConstant->stride[1];
int pw = uConstant->pad[0];
int ph = uConstant->pad[1];
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) {
int i = index / PACK_NUMBER_C2;
int zR = index % PACK_NUMBER_C2;
int oz = i / (ow * oh);
int tmp = i % (ow * oh);
int oy = tmp / ow;
int ox = tmp % ow;
int kz = oz / uConstant->batch;
int ix = ox * sw - pw;
int iy = oy * sh - ph;
half2 color = bias[kz * PACK_NUMBER_C2 + zR];
int fxSta = max(0, (UP_DIV(-ix, dw)));
int fySta = max(0, (UP_DIV(-iy, dh)));
int fxEnd = min(kw, UP_DIV(iw - ix, dw));
int fyEnd = min(kh, UP_DIV(ih - iy, dh));
int fx, fy, fz;
for (fy=fySta; fy<fyEnd; ++fy) {
int sy = fy*dh + iy;
for (fx=fxSta; fx<fxEnd; ++fx) {
int sx = fx*dw + ix;
half2 inp = input[0
+ sx * PACK_NUMBER_C2
+ sy * iw * PACK_NUMBER_C2
+ oz * iw * ih * PACK_NUMBER_C2
+ zR
];
half2 ker = kernel[0
+ fx * PACK_NUMBER_C2
+ fy * kw * PACK_NUMBER_C2
+ kz * kw * kh * PACK_NUMBER_C2
+ zR
];
color = __hfma2(inp, ker, color);
}
}
MNN_CUDA_HALF2_MAX(color, minV);
MNN_CUDA_HALF2_MIN(color, maxV);
output[0
+ zR
+ ox * PACK_NUMBER_C2
+ oy * ow * PACK_NUMBER_C2
+ oz * ow * oh * PACK_NUMBER_C2
] = color;
}
}
__global__ void CONV_DW(const float* input, const half* kernel, const half* bias, float *output, const constBuffer* uConstant) {
float maxV = uConstant->maxValue;
float minV = uConstant->minValue;
int iw = uConstant->inputSize[0];
int ih = uConstant->inputSize[1];
int c = uConstant->channel;
int ow = uConstant->outputSize[0];
int oh = uConstant->outputSize[1];
int kw = uConstant->kernelSize[0];
int kh = uConstant->kernelSize[1];
int dw = uConstant->dilate[0];
int dh = uConstant->dilate[1];
int sw = uConstant->stride[0];
int sh = uConstant->stride[1];
int pw = uConstant->pad[0];
int ph = uConstant->pad[1];
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) {
int i = index / PACK_NUMBER;
int zR = index % PACK_NUMBER;
int oz = i / (ow * oh);
int tmp = i % (ow * oh);
int oy = tmp / ow;
int ox = tmp % ow;
int kz = oz / uConstant->batch;
int ix = ox * sw - pw;
int iy = oy * sh - ph;
float color = bias[kz * PACK_NUMBER + zR];
int fxSta = max(0, (UP_DIV(-ix, dw)));
int fySta = max(0, (UP_DIV(-iy, dh)));
int fxEnd = min(kw, UP_DIV(iw - ix, dw));
int fyEnd = min(kh, UP_DIV(ih - iy, dh));
int fx, fy, fz;
for (fy=fySta; fy<fyEnd; ++fy) {
int sy = fy*dh + iy;
for (fx=fxSta; fx<fxEnd; ++fx) {
int sx = fx*dw + ix;
float inp = input[0
+ sx * PACK_NUMBER
+ sy * iw * PACK_NUMBER
+ oz * iw * ih * PACK_NUMBER
+ zR
];
float ker = kernel[0
+ fx * PACK_NUMBER
+ fy * kw * PACK_NUMBER
+ kz * kw * kh * PACK_NUMBER
+ zR
];
color = color + inp * ker;
}
}
color = max(color, minV);
color = min(color, maxV);
output[0
+ zR
+ ox * PACK_NUMBER
+ oy * ow * PACK_NUMBER
+ oz * ow * oh * PACK_NUMBER
] = color;
}
}
__global__ void CONV_DW_OPT(const float* input, const half* kernel, const half* bias, float *output, const constBuffer* uConstant,
DivModFast d_owh,
DivModFast d_ow,
DivModFast d_ob
) {
float maxV = uConstant->maxValue;
float minV = uConstant->minValue;
int iw = uConstant->inputSize[0];
int ih = uConstant->inputSize[1];
int kw = uConstant->kernelSize[0];
int kh = uConstant->kernelSize[1];
int sw = uConstant->stride[0];
int sh = uConstant->stride[1];
int pw = uConstant->pad[0];
int ph = uConstant->pad[1];
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) {
int i = index >> 4;
int zR = index & 15;
int oz, tmp, oy, ox, kz, unuse;
d_owh.divmod(i, oz, tmp);
d_ow.divmod(tmp, oy, ox);
d_ob.divmod(oz, kz, unuse);
int ix = ox * sw - pw;
int iy = oy * sh - ph;
float color = bias[(kz << 4) + zR];
int fxSta = max(0, -ix);
int fySta = max(0, -iy);
int fxEnd = min(kw, iw - ix);
int fyEnd = min(kh, ih - iy);
int fx, fy, fz;
for (fy=fySta; fy<fyEnd; ++fy) {
int sy = fy + iy;
for (fx=fxSta; fx<fxEnd; ++fx) {
int sx = fx + ix;
float inp = input[0
+ ((sx + iw * (sy + oz * ih)) << 4)
+ zR
];
float ker = kernel[0
+ ((fx + kw * (fy + kz * kh)) << 4)
+ zR
];
color = color + inp * ker;
}
}
color = max(color, minV);
color = min(color, maxV);
output[index] = color;
}
return;
}
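// CONV_DW_OPT hard-codes the PACK_NUMBER == 16 case ("index >> 4" / "index & 15") and
// uses DivModFast from MNNCUDAFunction.cuh to turn the remaining divisions by
// runtime constants into multiply-and-shift sequences. A plain-arithmetic sketch of
// the same decomposition (hypothetical helper; assumes divmod(v, q, r) yields
// quotient q and remainder r, matching the plain-division form in CONV_DW above):
static inline void decomposeIndexPlain(int index, int ow, int oh, int batch,
                                       int& zR, int& ox, int& oy, int& kz) {
    int i   = index / 16;       // packed position
    zR      = index % 16;       // lane inside the 16-channel pack
    int oz  = i / (ow * oh);    // d_owh.divmod(i, oz, tmp)
    int tmp = i % (ow * oh);
    oy      = tmp / ow;         // d_ow.divmod(tmp, oy, ox)
    ox      = tmp % ow;
    kz      = oz / batch;       // d_ob.divmod(oz, kz, unuse)
}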
static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op* op, Backend* bn) {
std::shared_ptr<ConvDepthWiseExecution::Resource> res(new ConvDepthWiseExecution::Resource);
auto pool = static_cast<CUDABackend*>(bn)->getStaticBufferPool();
auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime();
auto conv = op->main_as_Convolution2D();
auto convCommon = conv->common();
int kernelX = convCommon->kernelX();
int kernelY = convCommon->kernelY();
int depth = convCommon->outputCount();
int depthC = UP_DIV(depth, PACK_NUMBER);
res->weightTensor.reset(Tensor::createDevice<float>({kernelX * kernelY * depthC * PACK_NUMBER}));
bool success = bn->onAcquireBuffer(res->weightTensor.get(), Backend::STATIC);
if (!success) {
return nullptr;
}
res->mFilter = (void *)res->weightTensor.get()->buffer().device;
FuseRegion reg;
int offset[8 * PACK_NUMBER];
auto regionStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion));
auto offsetGpuStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(offset));
auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second;
//weight host->device
const float* filterDataPtr = nullptr;
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize);
auto tempWeightStorage = pool->alloc(weightSize * sizeof(float));
auto tempWeight = (uint8_t*)tempWeightStorage.first + tempWeightStorage.second;
cuda_check(cudaMemcpy(tempWeight, filterDataPtr, weightSize*sizeof(float), cudaMemcpyHostToDevice));
reg.size[0] = 1;
reg.size[1] = depthC;
reg.size[2] = kernelX * kernelY;
reg.srcStride[0] = 0;
reg.srcStride[1] = PACK_NUMBER * kernelX * kernelY;
reg.srcStride[2] = 1;
reg.dstStride[0] = 0;
reg.dstStride[1] = kernelX * kernelY * PACK_NUMBER;
reg.dstStride[2] = PACK_NUMBER;
reg.fuseNumber = PACK_NUMBER;
for (int v=0; v<PACK_NUMBER; ++v) {
auto off = offset + 8 * v;
// Src
off[0] = 1;
off[1] = (depth + PACK_NUMBER - v - 1) / PACK_NUMBER;
off[2] = reg.size[2];
off[3] = v * kernelX * kernelY;
// Dst
off[4] = 1;
off[5] = depthC;
off[6] = reg.size[2];
off[7] = v;
}
runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
runtime->memcpy(offsetGpu, offset, 8 * PACK_NUMBER * sizeof(int), MNNMemcpyHostToDevice, true);
FuseRasterBlitFloatToHalf((uint8_t*)res->mFilter, (uint8_t*)tempWeight, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
pool->free(tempWeightStorage);
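// The FuseRegion/offset setup above rearranges the raw depthwise weights (stored as
// [depth, kernelY * kernelX] floats) into the packed half layout
// [depthC, kernelY * kernelX, PACK_NUMBER] that CONV_DW expects: each of the
// PACK_NUMBER fused offset entries copies one channel lane, so lane v reads source
// channels v, v + PACK_NUMBER, ... and writes them at inner stride PACK_NUMBER,
// with FuseRasterBlitFloatToHalf doing the float -> half conversion on the fly.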
res->biasTensor.reset(Tensor::createDevice<float>({depthC * PACK_NUMBER}));
success = bn->onAcquireBuffer(res->biasTensor.get(), Backend::STATIC);
res->mBias = (void *)res->biasTensor.get()->buffer().device;
if (!success) {
return nullptr;
}
if(conv->bias() != nullptr) {
auto tempBiasStorage = pool->alloc(depth * sizeof(float));
auto tempBias = (uint8_t*)tempBiasStorage.first + tempBiasStorage.second;
cuda_check(cudaMemcpy(tempBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
reg.size[0] = 1;
reg.size[1] = 1;
reg.size[2] = depthC * PACK_NUMBER;
reg.srcStride[0] = 0;
reg.srcStride[1] = 0;
reg.srcStride[2] = 1;
reg.dstStride[0] = 0;
reg.dstStride[1] = 0;
reg.dstStride[2] = 1;
offset[0] = 1;
offset[1] = 1;
offset[2] = conv->bias()->size();
offset[3] = 0;
offset[4] = 1;
offset[5] = 1;
offset[6] = reg.size[2];
offset[7] = 0;
reg.fuseNumber = 1;
runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true);
FuseRasterBlitFloatToHalf((uint8_t*)res->mBias, (uint8_t*)tempBias, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
pool->free(tempBiasStorage);
}
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(regionStorage);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(offsetGpuStorage);
return res;
}
ConvDepthWiseExecution::ConvDepthWiseExecution(const Op* op, Backend* bn, std::shared_ptr<Resource> resource) : Execution(bn) {
mOp = op;
mResource = resource;
auto pool = static_cast<CUDABackend*>(bn)->getStaticBufferPool();
mConstBuffer = pool->alloc(sizeof(constBuffer));
auto conv = mOp->main_as_Convolution2D();
//weight host->device
if(nullptr != conv->weight()) {
int weightSize = conv->weight()->size();
weightTensor.reset(Tensor::createDevice<float>({weightSize}));
backend()->onAcquireBuffer(weightTensor.get(), Backend::STATIC);
mFilter = (void *)weightTensor.get()->buffer().device;
cuda_check(cudaMemcpy(mFilter, conv->weight()->data(), conv->weight()->size()*sizeof(float), cudaMemcpyHostToDevice));
mBias = nullptr;
if(conv->bias()->size() != 0) {
int biasSize = conv->bias()->size();
biasTensor.reset(Tensor::createDevice<float>({biasSize}));
backend()->onAcquireBuffer(biasTensor.get(), Backend::STATIC);
mBias = (void *)biasTensor.get()->buffer().device;
cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
use_bias_ = true;
}
}
}
ConvDepthWiseExecution::~ ConvDepthWiseExecution() {
auto pool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
pool->free(mConstBuffer);
if (nullptr != weightTensor) {
backend()->onReleaseBuffer(weightTensor.get(), Backend::STATIC);
}
if(use_bias_ && nullptr != biasTensor) {
backend()->onReleaseBuffer(biasTensor.get(), Backend::STATIC);
}
}
ErrorCode ConvDepthWiseExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto pad = ConvolutionCommon::convolutionPad(inputs[0], outputs[0], mOp->main_as_Convolution2D()->common());
auto conv = mOp->main_as_Convolution2D();
auto convCommon = mOp->main_as_Convolution2D()->common();
constBuffer parameters;
int channel = inputs[0]->channel();
int channelDiv = UP_DIV(channel, PACK_NUMBER);
parameters.pad[0] = pad.first;
parameters.pad[1] = pad.second;
parameters.kernelSize[0] = convCommon->kernelX();
@ -66,233 +329,82 @@ ErrorCode ConvDepthWiseExecution::onResize(const std::vector<Tensor *> &inputs,
parameters.dilate[1] = convCommon->dilateY();
parameters.inputSize[0] = inputs[0]->width();
parameters.inputSize[1] = inputs[0]->height();
parameters.channel = inputs[0]->batch() * inputs[0]->channel();
parameters.channel = inputs[0]->batch() * channelDiv;
parameters.outputSize[0] = outputs[0]->width();
parameters.outputSize[1] = outputs[0]->height();
parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0];
parameters.subChannel = inputs[0]->channel();
parameters.activationType = convCommon->relu() ? 1 : (convCommon->relu6() ? 2 : 0);
if (static_cast<CUDABackend*>(backend())->useFp16()) {
parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0] * PACK_NUMBER_C2;
} else {
parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0] * PACK_NUMBER;
parameters.minValue = -FLT_MAX;
parameters.maxValue = FLT_MAX;
}
parameters.batch = inputs[0]->batch();
if (convCommon->relu()) {
parameters.minValue = 0.0f;
}
if (convCommon->relu6()) {
parameters.minValue = 0.0f;
parameters.maxValue = 6.0f;
}
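// The activation is folded into the clamp range read by the kernels: no activation
// keeps the defaults (+/-FLT_MAX for fp32, +/-65504 for fp16, the largest finite
// half), relu clamps to [0, +max], and relu6 clamps to [0, 6].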
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
runtime->memcpy((uint8_t*)mConstBuffer.first + mConstBuffer.second, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice);
mTotalCount = parameters.total;
//printf("%d-%d-%d-%d, %d-%d-%d-%d-%d\n", parameters.kernelSize[0], parameters.kernelSize[1], parameters.stride[0], parameters.stride[1], parameters.inputSize[0], parameters.inputSize[1], channel, parameters.outputSize[0], parameters.outputSize[1]);
return NO_ERROR;
}
__global__ void CONV_DW(const float* input, const float* kernel, const float* bias, float *output, const constBuffer* uConstant) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < uConstant->total; i += blockDim.x * gridDim.x) {
{
int iw = uConstant->inputSize[0];
int ih = uConstant->inputSize[1];
int c = uConstant->channel;
int ow = uConstant->outputSize[0];
int oh = uConstant->outputSize[1];
int kw = uConstant->kernelSize[0];
int kh = uConstant->kernelSize[1];
int dw = uConstant->dilate[0];
int dh = uConstant->dilate[1];
int sw = uConstant->stride[0];
int sh = uConstant->stride[1];
int pw = uConstant->pad[0];
int ph = uConstant->pad[1];
int acttype = uConstant->activationType;
int oz = i / (ow * oh);
int tmp = i % (ow * oh);
int oy = tmp / ow;
int ox = tmp % ow;
int kz = oz % uConstant->subChannel;
int ix = ox * sw - pw;
int iy = oy * sh - ph;
float color = 0.0;
if (bias != nullptr) {
color = bias[kz];
}
int fx, fy, fz;
for (fy=0; fy<kh; ++fy) {
int sy = fy*dh + iy;
if (sy >= ih || sy < 0) {
continue;
}
for (fx=0; fx<kw; ++fx) {
int sx = fx*dw + ix;
if (sx >= iw || sx < 0) {
continue;
}
float inputValue = input[0
+ sx
+ sy * iw
+ oz * iw * ih
];
float k = kernel[0
+ fx
+ fy * kw
+ kz * kw * kh
];
color += k*inputValue;
}
}
color = (acttype==1) ? max(0.0, color) : (acttype==2 ? (min(max(0.0, color), 6.0)) : color);
output[0
+ ox
+ oy * ow
+ oz * ow * oh
] = color;
}
}
return;
}
ErrorCode ConvDepthWiseExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto& prop = runtime->prop();
int threads_num = prop.maxThreadsPerBlock;
int limitThreads = UP_DIV(mTotalCount, prop.multiProcessorCount);
int threads_num = ALIMIN(prop.maxThreadsPerBlock, limitThreads);
int block_num = prop.multiProcessorCount;
auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second;
if (inputs.size() == 1) {
CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)mFilter,
(const float*)mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr));
} else if (inputs.size() == 3) {
CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
(const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
} else {
MNN_ASSERT(inputs.size() == 2);
CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
if (static_cast<CUDABackend*>(backend())->useFp16()) {
if (inputs.size() == 1) {
CONV_DW_HALF<<<block_num, threads_num>>>((const half2*)inputs[0]->deviceId(), (const half2*)mResource->mFilter,
(const half2*)mResource->mBias, (half2*)outputs[0]->deviceId(), (const constBuffer*)(constPtr));
}
return NO_ERROR;
}
return NO_ERROR;
}
__global__ void DECONV_DW(const float* input, const float* kernel, const float* bias, float *output, const constBuffer* uConstant) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < uConstant->total; i += blockDim.x * gridDim.x) {
{
int iw = uConstant->inputSize[0];
int ih = uConstant->inputSize[1];
int c = uConstant->channel;
int ow = uConstant->outputSize[0];
int oh = uConstant->outputSize[1];
int kw = uConstant->kernelSize[0];
int kh = uConstant->kernelSize[1];
int dw = uConstant->dilate[0];
int dh = uConstant->dilate[1];
int sw = uConstant->stride[0];
int sh = uConstant->stride[1];
int pw = uConstant->pad[0];
int ph = uConstant->pad[1];
int oz = i / (ow * oh);
int tmp = i % (ow * oh);
int oy = tmp / ow;
int ox = tmp % ow;
int kz = oz % uConstant->subChannel;
if (inputs.size() == 1) {
// block_num = runtime->blocks_num(mTotalCount);
// threads_num = runtime->threads_num();
if(parameters.dilate[0] == 1 && parameters.dilate[1] == 1) {
const int area = parameters.outputSize[0] * parameters.outputSize[1];
DivModFast d_owh(area);
DivModFast d_ow(parameters.outputSize[0]);
DivModFast d_ob(outputs[0]->batch());
int ix = ox + pw;
int iy = oy + ph;
float color = 0.0;
if (bias != nullptr) {
color = bias[kz];
}
int fx, fy, fz;
for (fy=0; fy<kh; ++fy) {
int sy = iy - fy*dh;
int y = sy / sh;
if (sy % sh == 0 && y >= 0 && y < ih) {
for (int fx=0; fx<kw; ++fx) {
int sx = ix - fx*dw;
int x = sx / sw;
if (sx % sw == 0 && x >= 0 && x < iw) {
float inputValue = input[0
+ x
+ y * iw
+ oz * iw * ih
];
float k = kernel[0
+ fx
+ fy * kw
+ kz * kw * kh
];
color += k*inputValue;
}
}
}
}
output[0
+ ox
+ oy * ow
+ oz * ow * oh
] = color;
CONV_DW_OPT<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter,
(const half*)mResource->mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr),
d_owh, d_ow, d_ob);
} else {
CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter,
(const half*)mResource->mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr));
}
}
return;
}
ErrorCode DeconvDepthWiseExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto convCommon = mOp->main_as_Convolution2D()->common();
auto pad = ConvolutionCommon::convolutionTransposePad(inputs[0], outputs[0], convCommon);
constBuffer parameters;
parameters.pad[0] = pad.first;
parameters.pad[1] = pad.second;
parameters.kernelSize[0] = convCommon->kernelX();
parameters.kernelSize[1] = convCommon->kernelY();
parameters.stride[0] = convCommon->strideX();
parameters.stride[1] = convCommon->strideY();
parameters.dilate[0] = convCommon->dilateX();
parameters.dilate[1] = convCommon->dilateY();
parameters.inputSize[0] = inputs[0]->width();
parameters.inputSize[1] = inputs[0]->height();
parameters.channel = inputs[0]->batch() * inputs[0]->channel();
parameters.outputSize[0] = outputs[0]->width();
parameters.outputSize[1] = outputs[0]->height();
parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0];
parameters.subChannel = inputs[0]->channel();
auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second;
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
runtime->memcpy(constPtr, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice);
mTotalCount = parameters.total;
return NO_ERROR;
}
ErrorCode DeconvDepthWiseExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
int block_num = runtime->blocks_num(mTotalCount);
int threads_num = runtime->threads_num();
auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second;
if (inputs.size() > 2) {
DECONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
(const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
} else {
DECONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
}
return NO_ERROR;
}
class ConvDepthWiseExecutionCreator : public CUDABackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
if (OpType_ConvolutionDepthwise == op->type()) {
return new ConvDepthWiseExecution(op, backend);
}
if (inputs.size() == 1) {
MNN_PRINT("deconv depthwise not support 1 input yet\n");
if (inputs.size() > 1) {
return nullptr;
}
return new DeconvDepthWiseExecution(op, backend);
auto res = _makeResource(op, backend);
if (nullptr == res) {
return nullptr;
}
return new ConvDepthWiseExecution(op, backend, res);
}
};
static CUDACreatorRegister<ConvDepthWiseExecutionCreator> __init(OpType_ConvolutionDepthwise);
static CUDACreatorRegister<ConvDepthWiseExecutionCreator> __init2(OpType_DeconvolutionDepthwise);
}
}

View File

@ -14,9 +14,30 @@
#include "core/Execution.hpp"
namespace MNN {
namespace CUDA {
struct constBuffer {
int pad[2];
int kernelSize[2];
int stride[2];
int dilate[2];
int inputSize[2];
int outputSize[2];
int channel;
int total;
int batch;
float minValue = -65504.0f;
float maxValue = 65504.0f;
} uConstant;
class ConvDepthWiseExecution : public Execution {
public:
ConvDepthWiseExecution(const Op *op, Backend *bn);
struct Resource {
std::shared_ptr<Tensor> weightTensor;
std::shared_ptr<Tensor> biasTensor;
void* mFilter;
void* mBias;
};
ConvDepthWiseExecution(const Op *op, Backend *bn, std::shared_ptr<Resource> resource);
virtual ~ConvDepthWiseExecution();
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
@ -25,17 +46,13 @@ protected:
std::pair<void*, int> mConstBuffer;
const Op *mOp;
int mTotalCount;
void* mFilter;
void* mBias;
std::shared_ptr<Tensor> weightTensor;
std::shared_ptr<Tensor> biasTensor;
bool use_bias_=false;
constBuffer parameters;
std::shared_ptr<Resource> mResource;
};
class DeconvDepthWiseExecution : public ConvDepthWiseExecution {
public:
DeconvDepthWiseExecution(const Op *op, Backend *bn) : ConvDepthWiseExecution(op, bn) {
DeconvDepthWiseExecution(const Op *op, Backend *bn, std::shared_ptr<Resource> resource) : ConvDepthWiseExecution(op, bn, resource) {
// Do nothing
}
virtual ~DeconvDepthWiseExecution() {

View File

@ -7,55 +7,52 @@
//
#include "ConvSingleInputExecution.hpp"
#include "Raster.cuh"
#include "MNNCUDADefine.hpp"
#include "MNNCUDAFunction.cuh"
// 16 / sizeof(int4)
namespace MNN {
namespace CUDA {
__global__ void Im2Col(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const float* A,
__half* AP) {
int eAlign = matmulParam->elhPack[0] * MATMULPACK;
int lAlign = matmulParam->elhPack[1] * MATMULPACK;
int maxCount = eAlign * lAlign;
int kernelCount = param->kernelX * param->kernelY;
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int eIndex = index % eAlign;
int lIndex = index / eAlign;
// Compute for dest
int eU = eIndex / MATMULPACK;
int eR = eIndex % MATMULPACK;
int lU = lIndex / MATMULPACK;
int lR = lIndex % MATMULPACK;
auto dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lU * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR;
if (eIndex >= matmulParam->elh[0] || lIndex >= matmulParam->elh[1]) {
AP[dstOffset] = 0.0;
__global__ void KernelReorder(const float* B, half* BP, int kw, int kh, int ic, int oc, int ocPack) {
int icC4 = UP_DIV(ic, PACK_NUMBER);
int kernelCount = kw * kh;
int l = icC4 * kernelCount * PACK_NUMBER;
int h = oc;
int lDiv = UP_DIV(l, MATMULPACK);
int lAlign = lDiv * MATMULPACK;
int hAlign = UP_DIV(h, ocPack) * ocPack;
int maxCount = hAlign * lAlign;
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
int lR = indexO % MATMULPACK;
int tmp = indexO / MATMULPACK;
int hR = tmp % ocPack;
int tmp2 = tmp / ocPack;
int lC = tmp2 % lDiv;
int hC = tmp2 / lDiv;
half* dst = BP + indexO;
int sH = hC * ocPack + hR;
int sL = lC * MATMULPACK + lR;
if (sH >= oc) {
*dst = 0.0;
continue;
}
// Compute for source
int ox = eIndex % param->ow;
int oy = eIndex / param->ow;
int ob = oy / param->oh;
oy = oy % param->oh;
int sz = lIndex / kernelCount;
int kI = lIndex % kernelCount;
int ksx = kI % param->kernelX;
int ksy = kI / param->kernelX;
int sx = ox * param->strideX + ksx * param->dilateX - param->padX;
int sy = oy * param->strideY + ksy * param->dilateY - param->padY;
if (sx >= 0 && sx < param->iw) {
if (sy >=0 && sy < param->ih) {
__half value = A[sz * param->ih * param->iw + ob * param->iw * param->ih * param->icDiv4 + sy * param->iw + sx];
AP[dstOffset] = value;
continue;
}
int sLR = sL % PACK_NUMBER;
int sLC = sL / PACK_NUMBER;
int iLC = sLC / (kernelCount);
int ik = sLC % kernelCount;
int iz = iLC * PACK_NUMBER + sLR;
if (iz >= ic) {
*dst = 0.0;
continue;
}
AP[dstOffset] = 0.0;
const float* src = B + sH * kernelCount * ic + ik + iz * kernelCount;
*dst = *src;
}
}
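// KernelReorder repacks the raw [oc, ic, kernelY * kernelX] float weights into half
// precision tiles laid out as [UP_DIV(oc, ocPack), UP_DIV(l, MATMULPACK), ocPack,
// MATMULPACK], where the l dimension itself is ordered as
// [icC4, kernelY * kernelX, PACK_NUMBER]; out-of-range oc / ic positions are
// zero-padded so the tensor-core GEMM can consume full 16-wide tiles without
// bounds checks.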
ConvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
mBackend = bn;
auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime();
@ -78,40 +75,91 @@ ConvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize);
mKernelInfo.kernelN = common->outputCount();
mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY;
int icDiv = UP_DIV(mKernelInfo.kernelC, PACK_NUMBER);
MatMulParam param;
int e = 0;
int l = mKernelInfo.kernelX * mKernelInfo.kernelY * mKernelInfo.kernelC;
int l = mKernelInfo.kernelX * mKernelInfo.kernelY * icDiv * MATMULPACK;
int h = mKernelInfo.kernelN;
param.elh[0] = e;
param.elh[1] = l;
param.elh[2] = h;
param.elhPack[0] = UP_DIV(e, 16);
param.elhPack[1] = UP_DIV(l, 16);
param.elhPack[2] = UP_DIV(h, 16);
param.elhPack[0] = UP_DIV(e, MATMULPACK);
param.elhPack[1] = UP_DIV(l, MATMULPACK);
param.elhPack[2] = UP_DIV(h, MATMULPACK);
param.bStride[0] = 0;
param.bStride[1] = 1;
param.bStride[2] = l;
auto gpuParam = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(MatMulParam));
auto tempCacheBuffer = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float));
float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second);
runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice);
runtime->memcpy((uint8_t*)gpuParam.first + gpuParam.second, &param, sizeof(MatMulParam), MNNMemcpyHostToDevice);
FuseRegion reg;
int maxOffsetNumber = 8;
std::vector<int> offset(maxOffsetNumber);
auto regionStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion));
auto offsetGpuStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(int) * maxOffsetNumber);
auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second;
// Reorder weight
weightTensor.reset(Tensor::createDevice<int16_t>({param.elhPack[1] * param.elhPack[2] * (MATMULPACK * MATMULPACK)}));
bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC);
mFilter = (void *)weightTensor.get()->buffer().device;
GemmPrepareRerange(runtime, &param, (const MatMulParam*)((uint8_t*)gpuParam.first + gpuParam.second), nullptr, nullptr, cacheWeight, (__half*)mFilter);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempCacheBuffer);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(gpuParam);
{
auto tempCacheBuffer = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float));
float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second);
runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice);
weightTensor.reset(Tensor::createDevice<int16_t>({param.elhPack[1] * param.elhPack[2] * (MATMULPACK * MATMULPACK)}));
bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC);
mFilter = (void *)weightTensor.get()->buffer().device;
auto& prop = runtime->prop();
int cores = prop.multiProcessorCount;
int threadNumbers = prop.maxThreadsPerBlock;
if (param.elhPack[2] % 2 == 0) {
KernelReorder<<<cores, threadNumbers>>>((float*)cacheWeight, (half*)mFilter,
mKernelInfo.kernelX, mKernelInfo.kernelY, mKernelInfo.kernelC, mKernelInfo.kernelN, 32);
mUsePack = true;
} else {
KernelReorder<<<cores, threadNumbers>>>((float*)cacheWeight, (half*)mFilter,
mKernelInfo.kernelX, mKernelInfo.kernelY, mKernelInfo.kernelC, mKernelInfo.kernelN, MATMULPACK);
}
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempCacheBuffer);
}
// Copy Bias
int biasSize = conv->bias()->size();
biasTensor.reset(Tensor::createDevice<float>({biasSize}));
bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC);
auto tempBiasStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(conv->bias()->size()*sizeof(float));
auto biasTemp = (float*)((uint8_t*)tempBiasStorage.first + tempBiasStorage.second);
cuda_check(cudaMemcpy(biasTemp, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
// FP32 -> FP16
mBias = (void *)biasTensor.get()->buffer().device;
cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
int alignSize = UP_DIV(conv->bias()->size(), PACK_NUMBER) * PACK_NUMBER;
reg.size[0] = 1;
reg.size[1] = 1;
reg.size[2] = alignSize;
reg.srcStride[0] = 0;
reg.srcStride[1] = 0;
reg.srcStride[2] = 1;
reg.dstStride[0] = 0;
reg.dstStride[1] = 0;
reg.dstStride[2] = 1;
offset[0] = 1;
offset[1] = 1;
offset[2] = conv->bias()->size();
offset[3] = 0;
offset[4] = 1;
offset[5] = 1;
offset[6] = reg.size[2];
offset[7] = 0;
reg.fuseNumber = 1;
runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
runtime->memcpy(offsetGpu, offset.data(), 8 * sizeof(int), MNNMemcpyHostToDevice, true);
if (static_cast<CUDABackend*>(bn)->useFp16()) {
FuseRasterBlitFloatToHalf((uint8_t*)mBias, (uint8_t*)biasTemp, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
} else {
FuseRasterBlitCommon((uint8_t*)mBias, (uint8_t*)biasTemp, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime, 4);
}
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(regionStorage);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(offsetGpuStorage);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempBiasStorage);
}
ConvSingleInputExecution::Resource::~Resource() {
@ -146,14 +194,16 @@ bool ConvSingleInputExecution::onClone(Backend* bn, const Op* op, Execution** ds
ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto input = inputs[0], output = outputs[0];
const int UNIT = 1;
const int UNIT = PACK_NUMBER;
auto convCommon = mOp->main_as_Convolution2D()->common();
auto pads = ConvolutionCommon::convolutionPadFull(input, output, mOp->main_as_Convolution2D()->common());
int ic = input->channel();
int icDiv = UP_DIV(ic, PACK_NUMBER);
mIm2ColParamter.dilateX = convCommon->dilateX();
mIm2ColParamter.dilateY = convCommon->dilateY();
mIm2ColParamter.strideX = convCommon->strideX();
mIm2ColParamter.strideY = convCommon->strideY();
mIm2ColParamter.icDiv4 = input->channel();
mIm2ColParamter.icDiv4 = icDiv;
mIm2ColParamter.kernelX = convCommon->kernelX();
mIm2ColParamter.kernelY = convCommon->kernelY();
mIm2ColParamter.padX = std::get<0>(pads);
@ -169,21 +219,21 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,
runtime->memcpy((uint8_t*)mGpuIm2ColParam.first + mGpuIm2ColParam.second, &mIm2ColParamter, sizeof(ConvolutionCommon::Im2ColParameter), MNNMemcpyHostToDevice);
//MNN_PRINT("conv size:%d-%d-%d, %d-%d-%d\n", input->height(), input->width(), input->channel(), output->height(), output->width(), output->channel());
int e = output->height() * output->width() * output->batch();
int l = input->channel() * mIm2ColParamter.kernelX * mIm2ColParamter.kernelY;
int l = icDiv * mIm2ColParamter.kernelX * mIm2ColParamter.kernelY * MATMULPACK;
int h = output->channel();
mMatMulParam.elh[0] = e;
mMatMulParam.elh[1] = l;
mMatMulParam.elh[2] = h;
mMatMulParam.elhPack[0] = UP_DIV(e, 16);
mMatMulParam.elhPack[1] = UP_DIV(l, 16);
mMatMulParam.elhPack[2] = UP_DIV(h, 16);
mMatMulParam.elhPack[0] = UP_DIV(e, MATMULPACK);
mMatMulParam.elhPack[1] = UP_DIV(l, MATMULPACK);
mMatMulParam.elhPack[2] = UP_DIV(h, MATMULPACK);
mMatMulParam.cStride[0] = mIm2ColParamter.ow * mIm2ColParamter.oh * h;
mMatMulParam.cStride[1] = 1;
mMatMulParam.cStride[2] = mIm2ColParamter.ow * mIm2ColParamter.oh;
mMatMulParam.split[0] = 1;
mMatMulParam.split[1] = 1;
mMatMulParam.split[2] = mIm2ColParamter.ow * mIm2ColParamter.oh;
mMatMulParam.minValue = -FLT_MAX;
mMatMulParam.maxValue = FLT_MAX;
if (convCommon->relu()) {
mMatMulParam.minValue = 0.0f;
}
@ -191,12 +241,14 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,
mMatMulParam.minValue = 0.0f;
mMatMulParam.maxValue = 6.0f;
}
//MNN_PRINT("Im2Col temp size:%d!!!\n\n", mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK);
runtime->memcpy((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second, &mMatMulParam, sizeof(MatMulParam), MNNMemcpyHostToDevice);
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
auto buffer = pool->alloc(sizeof(__half) * mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK);
auto buffer = pool->alloc((size_t)sizeof(__half) * (size_t)mMatMulParam.elhPack[0] * (size_t)mMatMulParam.elhPack[1] * (size_t)MATMULPACK * (size_t)MATMULPACK);
mIm2ColBuffer = (__half*)((uint8_t*)buffer.first + buffer.second);
pool->free(buffer);
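// Casting every factor to size_t before multiplying keeps the im2col buffer size
// from overflowing 32-bit int arithmetic: the product is
// elhPack[0] * elhPack[1] * 16 * 16 half elements, which can exceed 2^31 bytes for
// large feature maps.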
return NO_ERROR;
}
@ -204,21 +256,28 @@ ErrorCode ConvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs
//MNN_PRINT("cuda convSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_);
MNN_ASSERT(inputs.size() == 1);
MNN_ASSERT(outputs.size() == 1);
auto input = inputs[0];
auto output = outputs[0];
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(input);
const void *input_addr = (const void*)inputs[0]->deviceId();
const void *filter_addr = mResource->mFilter;
const void *bias_addr = mResource->mBias;
auto bn = backend();
void *output_addr = (void*)outputs[0]->deviceId();
auto& prop = runtime->prop();
int threads_num = prop.maxThreadsPerBlock;
int cores = prop.multiProcessorCount;
auto gpuIm2Col = (const ConvolutionCommon::Im2ColParameter*)((uint8_t*)mGpuIm2ColParam.first + mGpuIm2ColParam.second);
auto gpuMatMul = (const MatMulParam*)((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second);
//runtime->memset(mIm2ColBuffer, 0, mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * sizeof(__half) * (MATMULPACK * MATMULPACK));
Im2Col<<<cores, threads_num>>>(gpuIm2Col, gpuMatMul, (const float*)input_addr, mIm2ColBuffer);
GemmPackedMain(runtime, &mMatMulParam, gpuMatMul, (float*)output_addr, (const __half*)mIm2ColBuffer, (const __half*)filter_addr, (const float*)bias_addr);
// Im2Col func
Im2ColMain(runtime, &mMatMulParam, gpuMatMul, &mIm2ColParamter, gpuIm2Col, (const float*)input_addr, mIm2ColBuffer, bytes);
if (mResource->mUsePack) {
GemmPacked16x32(runtime, &mMatMulParam, gpuMatMul, (float*)output_addr, (const __half*)mIm2ColBuffer, (const __half*)filter_addr, (const half*)bias_addr, bytes);
} else {
//printf("NotPack:%d-%d-%d-%d-%d, %d-%d-%d\n", mIm2ColParamter.icDiv4, mIm2ColParamter.ih, mIm2ColParamter.iw, mIm2ColParamter.oh, mIm2ColParamter.ow, mMatMulParam.elhPack[0], mMatMulParam.elhPack[1], mMatMulParam.elhPack[2]);
GemmPackedFullMain(runtime, &mMatMulParam, gpuMatMul, (float*)output_addr, (const __half*)mIm2ColBuffer, (const __half*)filter_addr, (const half*)bias_addr, bytes);
}
return NO_ERROR;
}

View File

@ -11,7 +11,9 @@
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
#include "TensorCoreGemm.cuh"
#include "TensorCoreGemmPacked.cuh"
#include "ImageColumn.cuh"
namespace MNN {
namespace CUDA {
@ -40,6 +42,7 @@ public:
std::shared_ptr<Tensor> biasTensor;
KernelInfo mKernelInfo;
Backend* mBackend = nullptr;
bool mUsePack = false;
};
ConvSingleInputExecution(Backend* backend, const MNN::Op* op, std::shared_ptr<Resource> res);
virtual ~ConvSingleInputExecution();
@ -58,6 +61,7 @@ private:
std::pair<void*, int> mGpuIm2ColParam;
__half* mIm2ColBuffer;
std::pair<void*, int> mGpuKernelParam;
};
} // namespace CUDA

View File

@ -11,263 +11,302 @@
namespace MNN {
namespace CUDA {
template <typename T>
__global__ void cutPad(const size_t size, const T* input, const int old_height,
const int old_width, const int height, const int width, const int pad_top,
const int pad_left, T* output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
int block_num = pos / (width*height);
int left = pos % (width*height);
const int out_w = left % width;
const int out_h = left / width % height;
__global__ void DeconvInputRerange(const int count,
const InputReorderParameter* param,
const float* Inp,
__half* InpRe
) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
int l = param->l_size;
int h = param->h_size;
int lIndex = i % l;
int hIndex = i / l;
int lU = lIndex / 16;
int lR = lIndex % 16;
int hU = hIndex / 16;
int hR = hIndex % 16;
output[pos] = input[(block_num * old_height + out_h + pad_top) * old_width + out_w + pad_left];
int bIndex = hIndex / param->hw_size;
int hwIndex = hIndex % param->hw_size;
float value = Inp[bIndex * param->ib_stride + lIndex * param->ic_stride + hwIndex];
//inpRe[lIndex * param->oc_stride + bIndex * param->ob_stride + hwIndex] = value;
//__half* dst = InpRe + lU * param->hpack_size * 16 * 16 + hU * 16 * 16 + hR + lR * 16;
__half* dst = InpRe + hU * param->lpack_size * 16 * 16 + lU * 16 * 16 + lR + hR * 16;
dst[0] = value;
}
return;
}
DeconvSingleInputExecution::DeconvSingleInputExecution(Backend* backend, const MNN::Op* op) : Execution(backend), mOp(op) {
//MNN_PRINT("cuda DeconvSingleInput onInit in\n");
template <typename Dtype>
__global__ void Col2Im(const int n, const Dtype* data_col,
const int batch, const int height, const int width, const int channels,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int height_col, const int width_col,
const Dtype* bias, Dtype* data_im) {
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (n); index += blockDim.x * gridDim.x) {
Dtype val = 0;
const int b_im = index / (channels * width * height);
const int chw = index % (channels * width * height);
const int w_im = chw % width + pad_w;
const int h_im = (chw / width) % height + pad_h;
const int c_im = chw / (width * height);
int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
// compute the start and end of the output
const int w_col_start =
(w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start =
(h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// TODO: use LCM of stride and dilation to avoid unnecessary loops
for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
int h_k = (h_im - h_col * stride_h);
int w_k = (w_im - w_col * stride_w);
if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
int data_col_index = ((((c_im * kernel_h + h_k) * kernel_w + w_k) * batch + b_im) *
height_col + h_col) * width_col + w_col;
val += data_col[data_col_index];
}
}
}
if(nullptr != bias) {
val += bias[c_im];
}
data_im[index] = val;
}
}
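// Worked example for the loop bounds above (stride_w = 2, kernel_w = 3,
// dilation_w = 1): kernel_extent_w = 3, and for an output column w_im = 5 (pad
// already added),
//   w_col_start = (5 - 3) / 2 + 1 = 2,  w_col_end = min(5 / 2 + 1, width_col) = 3,
// so only im2col column 2 contributes, through kernel tap w_k = 5 - 2*2 = 1; columns
// 0 and 1 would need taps 5 and 3, which lie outside the 3-wide kernel.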
DeconvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
mBackend = bn;
auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime();
auto conv = op->main_as_Convolution2D();
auto common = conv->common();
mKernelInfo.groups = common->group();
mKernelInfo.kernelX = common->kernelX();
mKernelInfo.kernelY = common->kernelY();
mKernelInfo.padMode = common->padMode();
mKernelInfo.padX = common->padX();
mKernelInfo.padY = common->padY();
if (nullptr != common->pads()) {
mKernelInfo.padX = common->pads()->data()[1];
mKernelInfo.padY = common->pads()->data()[0];
}
pad_left_ = mKernelInfo.padX;
pad_right_ = mKernelInfo.padX;
pad_top_ = mKernelInfo.padY;
pad_bottom_ = mKernelInfo.padY;
mKernelInfo.groups = common->group();
mKernelInfo.strideX = common->strideX();
mKernelInfo.strideY = common->strideY();
mKernelInfo.dilateX = common->dilateX();
mKernelInfo.dilateY = common->dilateY();
mKernelInfo.activationType = common->relu() ? 1 : (common->relu6() ? 2 : 0);
use_relu_ = (mKernelInfo.activationType == 1);
use_relu6_ = (mKernelInfo.activationType == 2);
cudnn_handle_ = nullptr;
input_desc_ = nullptr;
output_desc_ = nullptr;
filter_desc_ = nullptr;
conv_desc_ = nullptr;
padded_desc_ = nullptr;
cudnn_data_type_ = CUDNN_DATA_FLOAT;
cudnn_data_type_len_ = 0;
auto runtime = static_cast<CUDABackend*>(backend)->getCUDARuntime();
cudnn_handle_ = runtime->cudnn_handle();
cudnn_check(cudnnCreateTensorDescriptor(&input_desc_));
cudnn_check(cudnnCreateTensorDescriptor(&output_desc_));
cudnn_check(cudnnCreateTensorDescriptor(&padded_desc_));
cudnn_check(cudnnCreateTensorDescriptor(&bias_desc_));
cudnn_check(cudnnCreateFilterDescriptor(&filter_desc_));
cudnn_check(cudnnCreateConvolutionDescriptor(&conv_desc_));
cudnn_check(cudnnCreateActivationDescriptor(&act_desc_));
//weight host->device
const float* filterDataPtr = nullptr;
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize);
weightTensor.reset(Tensor::createDevice<float>({weightSize}));
backend->onAcquireBuffer(weightTensor.get(), Backend::STATIC);
mKernelInfo.kernelN = common->outputCount();
mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY;
MatMulParam param;
int e = mKernelInfo.kernelN * mKernelInfo.kernelX * mKernelInfo.kernelY;
int l = mKernelInfo.kernelC;
int h = 0;
param.elh[0] = e;
param.elh[1] = l;
param.elh[2] = h;
param.elhPack[0] = UP_DIV(e, 16);
param.elhPack[1] = UP_DIV(l, 16);
param.elhPack[2] = UP_DIV(h, 16);
param.aStride[0] = 1;
param.aStride[1] = e;
param.aStride[2] = 0;
auto gpuParam = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(MatMulParam));
auto tempCacheBuffer = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float));
float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second);
runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice);
runtime->memcpy((uint8_t*)gpuParam.first + gpuParam.second, &param, sizeof(MatMulParam), MNNMemcpyHostToDevice);
// Reorder weight
weightTensor.reset(Tensor::createDevice<int16_t>({param.elhPack[0] * param.elhPack[1] * (MATMULPACK * MATMULPACK)}));
bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC);
mFilter = (void *)weightTensor.get()->buffer().device;
cuda_check(cudaMemcpy(mFilter, filterDataPtr, weightSize*sizeof(float), cudaMemcpyHostToDevice));
GemmPrepareRerange(runtime, &param, (const MatMulParam*)((uint8_t*)gpuParam.first + gpuParam.second), cacheWeight, (__half*)mFilter, nullptr, nullptr, 4);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempCacheBuffer);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(gpuParam);
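// Deconvolution weights are stored as [ic, oc, kh, kw], so the GEMM "A" matrix is
// viewed as e = oc * kh * kw rows by l = ic columns with aStride = {1, e, 0}:
// consecutive e indices are contiguous and moving one input channel jumps a whole
// [oc, kh, kw] slab. GemmPrepareRerange then packs this float matrix into the
// elhPack[0] * elhPack[1] tiles of 16x16 half values that the tensor-core GEMM
// consumes.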
// Copy Bias
int biasSize = conv->bias()->size();
biasTensor.reset(Tensor::createDevice<float>({biasSize}));
bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC);
mBias = (void *)biasTensor.get()->buffer().device;
cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
}
if(conv->bias()->size() != 0) {
int biasSize = conv->bias()->size();
biasTensor.reset(Tensor::createDevice<float>({biasSize}));
backend->onAcquireBuffer(biasTensor.get(), Backend::STATIC);
mBias = (void *)biasTensor.get()->buffer().device;
cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
int bias_size = conv->bias()->size();
int dim_bias[] = {1, bias_size, 1, 1};
int stride_bias[] = {bias_size, 1, 1, 1};
if(cudnn_data_type_ == CUDNN_DATA_FLOAT) {
cudnn_check(cudnnSetTensorNdDescriptor(bias_desc_, CUDNN_DATA_FLOAT, 4, dim_bias, stride_bias));
}
else if(cudnn_data_type_ == CUDNN_DATA_HALF) {
cudnn_check(cudnnSetTensorNdDescriptor(bias_desc_, CUDNN_DATA_HALF, 4, dim_bias, stride_bias));
} else {
MNN_PRINT("only supports fp32/fp16 data type!!!\n");
}
use_bias_ = true;
}
DeconvSingleInputExecution::Resource::~Resource() {
// Do nothing
}
DeconvSingleInputExecution::DeconvSingleInputExecution(Backend* backend, const MNN::Op* op, std::shared_ptr<Resource> res) : Execution(backend), mOp(op) {
mResource = res;
auto runtime = static_cast<CUDABackend*>(backend)->getCUDARuntime();
auto staticPool = static_cast<CUDABackend*>(backend)->getStaticBufferPool();
mGpuMatMulParam = staticPool->alloc(sizeof(MatMulParam));
mGpuCol2ImParam = staticPool->alloc(sizeof(Col2ImParameter));
mGpuInpReorderParam = staticPool->alloc(sizeof(InputReorderParameter));
}
DeconvSingleInputExecution::~DeconvSingleInputExecution() {
cudnn_check(cudnnDestroyConvolutionDescriptor(conv_desc_));
cudnn_check(cudnnDestroyFilterDescriptor(filter_desc_));
cudnn_check(cudnnDestroyTensorDescriptor(padded_desc_));
cudnn_check(cudnnDestroyTensorDescriptor(output_desc_));
cudnn_check(cudnnDestroyTensorDescriptor(input_desc_));
cudnn_check(cudnnDestroyTensorDescriptor(bias_desc_));
cudnn_check(cudnnDestroyActivationDescriptor(act_desc_));
auto staticPool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
staticPool->free(mGpuMatMulParam);
staticPool->free(mGpuCol2ImParam);
staticPool->free(mGpuInpReorderParam);
}
bool DeconvSingleInputExecution::onClone(Backend* bn, const Op* op, Execution** dst) {
if (!mValid) {
return false;
}
if (nullptr == dst) {
return true;
}
auto dstExe = new DeconvSingleInputExecution(bn, op, mResource);
*dst = dstExe;
return true;
}
ErrorCode DeconvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) {
// prepare
//MNN_PRINT("cuda DeconvSingleInput onResize in, pad:%d\n", mKernelInfo.padX);
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto input = inputs[0], output = outputs[0];
const int UNIT = 1;
auto convCommon = mOp->main_as_Convolution2D()->common();
mIOInfo.iw = input->width();
mIOInfo.ih = input->height();
mIOInfo.ic = input->channel();
mIOInfo.ib = input->batch();
mIOInfo.ow = output->width();
mIOInfo.oh = output->height();
mIOInfo.oc = output->channel();
mIOInfo.ob = output->batch();
// Input Rerange Param
mInpReorderParameter.hw_size = input->height() * input->width();
mInpReorderParameter.ic_stride = mInpReorderParameter.hw_size;
mInpReorderParameter.ib_stride = mInpReorderParameter.hw_size * input->channel();
mInpReorderParameter.oc_stride = mInpReorderParameter.ib_stride;
mInpReorderParameter.ob_stride = mInpReorderParameter.hw_size;
mInpReorderParameter.l_size = input->channel();
mInpReorderParameter.h_size = input->batch() * mInpReorderParameter.hw_size;
mInpReorderParameter.lpack_size = UP_DIV(mInpReorderParameter.l_size, 16);
mInpReorderParameter.hpack_size = UP_DIV(mInpReorderParameter.h_size, 16);
mKernelInfo.kernelN = output->channel();
mKernelInfo.kernelC = input->channel() / mKernelInfo.groups;
runtime->memcpy((uint8_t*)mGpuInpReorderParam.first + mGpuInpReorderParam.second, &mInpReorderParameter, sizeof(InputReorderParameter), MNNMemcpyHostToDevice);
std::vector<int> in_shape = {mIOInfo.ib, mIOInfo.ic, mIOInfo.ih, mIOInfo.iw};
std::vector<int> output_shape = {mIOInfo.ob, mIOInfo.oc, mIOInfo.oh, mIOInfo.ow};
std::vector<int> filter_shape = {mKernelInfo.kernelC, mKernelInfo.kernelN, mKernelInfo.kernelY, mKernelInfo.kernelX};//deconv (ic oc kh kw)
// printf("filter:%d %d %d %d\n", filter_shape[0], filter_shape[1], filter_shape[2], filter_shape[3]);
// printf("input:%d %d %d %d\n", in_shape[0], in_shape[1], in_shape[2], in_shape[3]);
// printf("output:%d %d %d %d\n", output_shape[0], output_shape[1], output_shape[2], output_shape[3]);
cudnn_check(cudnnSetTensor4dDescriptor(input_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, in_shape[0],
in_shape[1], in_shape[2], in_shape[3]));
cudnn_check(cudnnSetFilter4dDescriptor(filter_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, filter_shape[0],
filter_shape[1], filter_shape[2], filter_shape[3]));
cudnn_check(cudnnSetTensor4dDescriptor(output_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, output_shape[0],
output_shape[1], output_shape[2], output_shape[3]));
// Col2Im Param
auto pad = ConvolutionCommon::convolutionTransposePad(input, output, mOp->main_as_Convolution2D()->common());
mCol2ImParamter.dilateX = convCommon->dilateX();
mCol2ImParamter.dilateY = convCommon->dilateY();
mCol2ImParamter.strideX = convCommon->strideX();
mCol2ImParamter.strideY = convCommon->strideY();
mCol2ImParamter.ic = input->channel();
mCol2ImParamter.oc = output->channel();
mCol2ImParamter.kernelX = convCommon->kernelX();
mCol2ImParamter.kernelY = convCommon->kernelY();
mCol2ImParamter.padX = pad.first;
mCol2ImParamter.padY = pad.second;
mCol2ImParamter.ih = input->height();
mCol2ImParamter.iw = input->width();
mCol2ImParamter.oh = output->height();
mCol2ImParamter.ow = output->width();
mCol2ImParamter.ob = output->batch();
runtime->memcpy((uint8_t*)mGpuCol2ImParam.first + mGpuCol2ImParam.second, &mCol2ImParamter, sizeof(Col2ImParameter), MNNMemcpyHostToDevice);
// Matmul Param
int e = output->channel() * mCol2ImParamter.kernelX * mCol2ImParamter.kernelY;
int l = input->channel();
int h = input->height() * input->width() * output->batch();
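// GEMM view of the deconvolution: weights reshaped to [oc*kh*kw, ic] multiply the
// re-ordered input [ic, batch*ih*iw], producing a column buffer [oc*kh*kw, batch*ih*iw]
// that Col2Im then scatters into the output tensor.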
mMatMulParam.elh[0] = e;
mMatMulParam.elh[1] = l;
mMatMulParam.elh[2] = h;
mMatMulParam.elhPack[0] = UP_DIV(e, 16);
mMatMulParam.elhPack[1] = UP_DIV(l, 16);
mMatMulParam.elhPack[2] = UP_DIV(h, 16);
mMatMulParam.bStride[0] = 0;
mMatMulParam.bStride[1] = input->height() * input->width();
mMatMulParam.bStride[2] = 1;
mMatMulParam.cStride[0] = h;
mMatMulParam.cStride[1] = 1;
mMatMulParam.cStride[2] = 1;
if (convCommon->relu()) {
mMatMulParam.minValue = 0.0f;
}
if (convCommon->relu6()) {
mMatMulParam.minValue = 0.0f;
mMatMulParam.maxValue = 6.0f;
}
runtime->memcpy((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second, &mMatMulParam, sizeof(MatMulParam), MNNMemcpyHostToDevice);
// Alloc temp cuda memory
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
auto buffer1 = pool->alloc(sizeof(float) * mMatMulParam.elh[0] * mMatMulParam.elh[2]);
auto buffer2 = pool->alloc(sizeof(__half) * mMatMulParam.elhPack[1] * mMatMulParam.elhPack[2] * MATMULPACK * MATMULPACK);
cudnnTensorDescriptor_t input_descriptor_real = nullptr;
mIm2ColBuffer = (float*)((uint8_t*)buffer1.first + buffer1.second);
mInputBuffer = (__half*)((uint8_t*)buffer2.first + buffer2.second);
if (mKernelInfo.padMode == PadMode_SAME) {
int kernelWidthSize = (mKernelInfo.kernelX - 1) * mKernelInfo.dilateX + 1;
int kernelHeightSize = (mKernelInfo.kernelY - 1) * mKernelInfo.dilateY + 1;
int pw = (mIOInfo.iw - 1) * mKernelInfo.strideX + kernelWidthSize - mIOInfo.ow;
int ph = (mIOInfo.ih - 1) * mKernelInfo.strideY + kernelHeightSize - mIOInfo.oh;
pad_left_ = pw/2;
pad_right_ = pw - pad_left_;
pad_top_ = ph/2;
pad_bottom_ = ph - pad_top_;
}
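// Illustrative arithmetic for the SAME-padding branch above (not from the source): with
// iw=4, strideX=2, kernelX=3, dilateX=1 and ow=8, kernelWidthSize = (3-1)*1+1 = 3, so
// pw = (4-1)*2 + 3 - 8 = 1 and the surplus column splits as pad_left_ = 0, pad_right_ = 1.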
pool->free(buffer2);
pool->free(buffer1);
use_pad_ = (pad_left_ != 0 || pad_right_ != 0 || pad_top_ != 0 || pad_bottom_ != 0);
if(use_pad_) {
int totalSize = output_shape[0]*output_shape[1]*(output_shape[2]+pad_top_+pad_bottom_)*(output_shape[3]+pad_left_+pad_right_);
padTensor.reset(Tensor::createDevice<float>({totalSize}));
backend()->onAcquireBuffer(padTensor.get(), Backend::DYNAMIC);
mPadPtr = (void *)padTensor.get()->buffer().device;
//dynamic memory release
backend()->onReleaseBuffer(padTensor.get(), Backend::DYNAMIC);
cudnn_check(cudnnSetTensor4dDescriptor(padded_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, output_shape[0], output_shape[1],
output_shape[2] + pad_top_ + pad_bottom_, output_shape[3] + pad_left_ + pad_right_));
}
input_descriptor_real = use_pad_ ? padded_desc_ : input_desc_;
cudnn_check(cudnnSetConvolution2dDescriptor(conv_desc_, 0, 0, mKernelInfo.strideY, mKernelInfo.strideX,
mKernelInfo.dilateY, mKernelInfo.dilateX, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
if (cudnn_data_type_ == CUDNN_DATA_HALF) {
cudnn_check(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
}
//set group num
cudnn_check(cudnnSetConvolutionGroupCount(conv_desc_, mKernelInfo.groups));
// algorithm
constexpr int requested_algo_count = 1;
int returned_algo_count;
cudnnConvolutionBwdDataAlgoPerf_t perf_results;
cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnn_handle_, filter_desc_, input_descriptor_real, conv_desc_,
output_desc_, requested_algo_count, &returned_algo_count, &perf_results));
conv_bwd_algo_ = perf_results.algo;
// workspace
cudnn_check(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle_, filter_desc_, input_descriptor_real, conv_desc_, output_desc_,
conv_bwd_algo_, &workspace_size_));
if (workspace_size_ != 0) {
int workspaceSize = workspace_size_;
workspaceTensor.reset(Tensor::createDevice<float>({workspaceSize}));
//cudnn does not support workspace memory reuse
backend()->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkSpace = (void *)workspaceTensor.get()->buffer().device;
}
if(use_relu_) {
cudnn_check(cudnnSetActivationDescriptor(act_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0));
} else if(use_relu6_) {
cudnn_check(cudnnSetActivationDescriptor(act_desc_, CUDNN_ACTIVATION_CLIPPED_RELU, CUDNN_NOT_PROPAGATE_NAN, 6.0));
} else {
//do nothing
}
//MNN_PRINT("cuda DeconvSingleInput onResize out\n");
return NO_ERROR;
}
ErrorCode DeconvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) {
//MNN_PRINT("cuda DeconvSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_);
//MNN_PRINT("cuda convSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_);
MNN_ASSERT(inputs.size() == 1);
MNN_ASSERT(outputs.size() == 1);
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
const void *input_addr = (const void*)inputs[0]->deviceId();
const void *filter_addr = mFilter;
const void *bias_addr = mBias;
const void *filter_addr = mResource->mFilter;
const void *bias_addr = mResource->mBias;
void *output_addr = (void*)outputs[0]->deviceId();
void *workspace_addr = nullptr;
if (workspace_size_ != 0) {
workspace_addr = mWorkSpace;
}
const float alpha = 1;
const float beta = 0;
auto gpuInpReorder = (const InputReorderParameter*)((uint8_t*)mGpuInpReorderParam.first + mGpuInpReorderParam.second);
auto gpuCol2Im = (const Col2ImParameter*)((uint8_t*)mGpuCol2ImParam.first + mGpuCol2ImParam.second);
auto gpuMatMul = (const MatMulParam*)((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second);
const int rerangeCount = mInpReorderParameter.ib_stride * inputs[0]->batch();
int inp_block_num = runtime->blocks_num(rerangeCount);
int inp_thread_num = runtime->threads_num();
if(use_pad_) {
cudnn_check(cudnnConvolutionBackwardData(cudnn_handle_, &alpha, filter_desc_, filter_addr, input_desc_, input_addr, conv_desc_,
conv_bwd_algo_, workspace_addr, workspace_size_, &beta, padded_desc_, mPadPtr));
// Do input Rerange
runtime->memset(mInputBuffer, 0, mMatMulParam.elhPack[2] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK * sizeof(__half));
DeconvInputRerange<<<inp_block_num, inp_thread_num>>>(rerangeCount, gpuInpReorder, (const float*)input_addr, mInputBuffer);
std::vector<int> out_shape = {mIOInfo.ob, mIOInfo.oc, mIOInfo.oh, mIOInfo.ow};
// Do Gemm operation
GemmPackedMain(runtime, &mMatMulParam, gpuMatMul, (float*)mIm2ColBuffer, (const half*)filter_addr, (const half*)mInputBuffer, nullptr, bytes, false, false);
int size = out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3];
int block_num = runtime->blocks_num(size);
int threads_num = runtime->threads_num();
// Do Col2Im trans
int height_col = mCol2ImParamter.ih;
int width_col = mCol2ImParamter.iw;
int num_kernels = mCol2ImParamter.ob * mCol2ImParamter.oc * mCol2ImParamter.oh * mCol2ImParamter.ow;
cutPad<<<block_num, threads_num>>>(size, (float*)mPadPtr, out_shape[2]+pad_top_+pad_bottom_, out_shape[3]+pad_left_+pad_right_,
out_shape[2], out_shape[3], pad_top_, pad_left_, (float*)output_addr);
}
else {
cudnn_check(cudnnConvolutionBackwardData(cudnn_handle_, &alpha, filter_desc_, filter_addr, input_desc_, input_addr, conv_desc_,
conv_bwd_algo_, workspace_addr, workspace_size_, &beta, output_desc_, output_addr));
}
int col2im_block_num = runtime->blocks_num(num_kernels);
int col2im_thread_num = runtime->threads_num();
// printf("col2im:%d, %d-%d-%d-%d-%d-%d\n %d-%d-%d-%d-%d-%d\n %d-%d\n", mCol2ImParamter.ob, mCol2ImParamter.oh, mCol2ImParamter.ow, mCol2ImParamter.oc, \
// mCol2ImParamter.ih, mCol2ImParamter.iw, mCol2ImParamter.ic, \
// mCol2ImParamter.padX, mCol2ImParamter.padY, mCol2ImParamter.kernelX, mCol2ImParamter.kernelY, mCol2ImParamter.strideX, mCol2ImParamter.strideY, \
// col2im_block_num, col2im_thread_num);
Col2Im<float><<<col2im_block_num, col2im_thread_num>>>(
num_kernels, (const float*)mIm2ColBuffer, mCol2ImParamter.ob, mCol2ImParamter.oh, mCol2ImParamter.ow, mCol2ImParamter.oc,
mCol2ImParamter.kernelY, mCol2ImParamter.kernelX, mCol2ImParamter.padY, mCol2ImParamter.padX,
mCol2ImParamter.strideY, mCol2ImParamter.strideX, mCol2ImParamter.dilateY, mCol2ImParamter.dilateX,
height_col, width_col, (const float*)bias_addr, (float *)output_addr);
if(use_bias_) {
cudnn_check(cudnnAddTensor(cudnn_handle_, &alpha, bias_desc_, bias_addr, &alpha, output_desc_, output_addr));
}
if(use_relu_ || use_relu6_) {
cudnn_check(cudnnActivationForward(cudnn_handle_, act_desc_, &alpha, output_desc_, output_addr, &beta, output_desc_, output_addr));
}
return NO_ERROR;
}
@ -287,7 +326,8 @@ public:
MNN_PRINT("Deconv inputs size:3 not support\n");
return nullptr;
} else if(inputs.size() == 1) {
return new DeconvSingleInputExecution(backend, op);
std::shared_ptr<DeconvSingleInputExecution::Resource> resource(new DeconvSingleInputExecution::Resource(backend, op));
return new DeconvSingleInputExecution(backend, op, resource);
} else {
MNN_PRINT("Deconv inputs size:%d not support", (int)inputs.size());
return nullptr;
@ -295,7 +335,7 @@ public:
}
};
CUDACreatorRegister<CUDADeconvolutionCreator> __DeConvExecution(OpType_Deconvolution);
//CUDACreatorRegister<CUDADeconvolutionCreator> __DeConvExecution(OpType_Deconvolution);
}// namespace CUDA
}// namespace MNN

View File

@ -11,7 +11,7 @@
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
#include "half.hpp"
#include "TensorCoreGemm.cuh"
namespace MNN {
namespace CUDA {
@ -26,9 +26,6 @@ struct KernelInfo {
int kernelC = 0;
int kernelX = 0;
int kernelY = 0;
PadMode padMode = PadMode_CAFFE;
int padX = 0;
int padY = 0;
int strideX = 0;
int strideY = 0;
int dilateX = 0;
@ -36,59 +33,71 @@ struct KernelInfo {
int activationType = 0;
};//
struct Col2ImParameter {
int padX;
int padY;
int dilateX;
int dilateY;
int strideX;
int strideY;
int kernelX;
int kernelY;
int oc;
int ic;
int iw;
int ih;
int ow;
int oh;
int ob;
};
struct InputReorderParameter {
int ic_stride;
int ib_stride;
int oc_stride;
int ob_stride;
int hw_size;
int l_size;
int h_size;
int lpack_size;
int hpack_size;
};
extern "C"
class DeconvSingleInputExecution : public Execution {
public:
DeconvSingleInputExecution(Backend* backend, const MNN::Op* op);
struct Resource {
Resource(Backend* bn, const MNN::Op* op);
~Resource();
void* mFilter;
void* mBias;
std::shared_ptr<Tensor> weightTensor;
std::shared_ptr<Tensor> biasTensor;
KernelInfo mKernelInfo;
Backend* mBackend = nullptr;
};
DeconvSingleInputExecution(Backend* backend, const MNN::Op* op, std::shared_ptr<Resource> res);
virtual ~DeconvSingleInputExecution();
virtual ErrorCode onResize(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
private:
cudnnHandle_t cudnn_handle_;
cudnnTensorDescriptor_t input_desc_;
cudnnTensorDescriptor_t output_desc_;
cudnnFilterDescriptor_t filter_desc_;
cudnnConvolutionBwdDataAlgo_t conv_bwd_algo_;
cudnnConvolutionDescriptor_t conv_desc_;
cudnnTensorDescriptor_t bias_desc_;
cudnnTensorDescriptor_t padded_desc_;
cudnnActivationDescriptor_t act_desc_;
std::shared_ptr<Resource> mResource;
cudnnDataType_t cudnn_data_type_;
int cudnn_data_type_len_;
bool use_pad_ = false;
int pad_top_ = 0;
int pad_bottom_ = 0;
int pad_left_ = 0;
int pad_right_ = 0;
const Op* mOp = nullptr;
MatMulParam mMatMulParam;
std::pair<void*, int> mGpuMatMulParam;
bool use_bias_ = false;
bool use_relu_ = false;
bool use_relu6_ = false;
Col2ImParameter mCol2ImParamter;
std::pair<void*, int> mGpuCol2ImParam;
void* mPadPtr;
void* mFilter;
void* mBias;
void* mWorkSpace;
std::shared_ptr<Tensor> weightTensor;
std::shared_ptr<Tensor> biasTensor;
std::shared_ptr<Tensor> padTensor;
std::shared_ptr<Tensor> workspaceTensor;
InputReorderParameter mInpReorderParameter;
std::pair<void*, int> mGpuInpReorderParam;
std::shared_ptr<Tensor> mPad;
std::shared_ptr<Tensor> mWorkspaceForward;
size_t input_size_;
size_t filter_size_;
size_t output_size_;
size_t padded_size_;
size_t workspace_size_;
const MNN::Op* mOp;
KernelInfo mKernelInfo;
IOInfo mIOInfo;
std::shared_ptr<Tensor> mTempInput;
float* mIm2ColBuffer;
__half* mInputBuffer;
};
} // namespace CUDA

View File

@ -0,0 +1,705 @@
#include "ImageColumn.cuh"
#include "MNNCUDADefine.hpp"
#include "MNNCUDAFunction.cuh"
#include "Raster.cuh"
#define BLOCK_INT4 2
namespace MNN {
namespace CUDA {
__global__ void Im2Col1x1(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const float* A,
half* AP,
DivModFast eAlignD,
DivModFast owD,
DivModFast ohD
) {
int eAlign = matmulParam->elhPack[0] * MATMULPACK;
int lAlign = matmulParam->elhPack[1];
int maxCount = eAlign * lAlign * BLOCK_INT4;
int kernelCount = 1;
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
int index = indexO >> 1;
int lR = indexO & 1;
int eIndex, lIndex;
eAlignD.divmod(index, lIndex, eIndex);
int eU = eIndex >> 4;
int eR = eIndex & 15;
int dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lIndex * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR * 8;
int4* dst = (int4*)(AP + dstOffset);
if (eIndex >= matmulParam->elh[0]) {
*dst = {0, 0, 0, 0};
continue;
}
// Compute for source
int ox, oy, ob;
owD.divmod(eIndex, oy, ox);
ohD.divmod(oy, ob, oy);
int sz = lIndex;
int sx = ox * param->strideX - param->padX;
int sy = oy * param->strideY - param->padY;
if (sx >= 0 && sx < param->iw) {
if (sy >=0 && sy < param->ih) {
int offset = sz * param->srcZStep + (ob * param->iw * param->ih + sy * param->iw + sx) * PACK_NUMBER + lR * 8;
float2* srcF = (float2*)(A + offset);
half2* dstH = (half2*)dst;
dstH[0] = __float22half2_rn(srcF[0]);
dstH[1] = __float22half2_rn(srcF[1]);
dstH[2] = __float22half2_rn(srcF[2]);
dstH[3] = __float22half2_rn(srcF[3]);
continue;
}
}
*dst = {0, 0, 0, 0};
}
}
__global__ void Im2Col1x1_OPT(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const int maxCount,
const float* A,
half* AP,
DivModFast eAlignD,
DivModFast owD,
DivModFast ohD
) {
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
int index = indexO >> 3;
int lR = indexO & 7;
int eIndex, lIndex;
eAlignD.divmod(index, lIndex, eIndex);
int eU = eIndex >> 4;
int eR = eIndex & 15;
int dstOffset = ((eU * matmulParam->elhPack[1] + lIndex) << 8) + (eR << 4) + (lR << 1);
int offset = lIndex * param->srcZStep + (eIndex << 4) + (lR << 1);
float2* srcF = (float2*)(A + offset);
half2* dstH = (half2*)(AP + dstOffset);
dstH[0] = __float22half2_rn(srcF[0]);
}
}
__global__ void Im2Col(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const float* A,
half* AP) {
int eAlign = matmulParam->elhPack[0] * MATMULPACK;
int lAlign = matmulParam->elhPack[1];
int maxCount = eAlign * lAlign * BLOCK_INT4;
int kernelCount = param->kernelX * param->kernelY;
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
int index = indexO / BLOCK_INT4;
int lR = indexO % BLOCK_INT4;
int eIndex = index % eAlign;
int lIndex = index / eAlign;
int eU = eIndex / MATMULPACK;
int eR = eIndex % MATMULPACK;
int dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lIndex * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR * 8;
int4* dst = (int4*)(AP + dstOffset);
if (eIndex >= matmulParam->elh[0]) {
*dst = {0, 0, 0, 0};
continue;
}
// Compute for source
int ox = eIndex % param->ow;
int oy = eIndex / param->ow;
int ob = oy / param->oh;
oy = oy % param->oh;
int sz = lIndex / kernelCount;
int kI = lIndex % kernelCount;
int ksx = kI % param->kernelX;
int ksy = kI / param->kernelX;
int sx = ox * param->strideX + ksx * param->dilateX - param->padX;
int sy = oy * param->strideY + ksy * param->dilateY - param->padY;
if (sx >= 0 && sx < param->iw) {
if (sy >=0 && sy < param->ih) {
int offset = sz * param->srcZStep + (ob * param->iw * param->ih + sy * param->iw + sx) * PACK_NUMBER + lR * 8;
float2* srcF = (float2*)(A + offset);
half2* dstH = (half2*)dst;
dstH[0] = __float22half2_rn(srcF[0]);
dstH[1] = __float22half2_rn(srcF[1]);
dstH[2] = __float22half2_rn(srcF[2]);
dstH[3] = __float22half2_rn(srcF[3]);
continue;
}
}
*dst = {0, 0, 0, 0};
}
}
__global__ void Im2Col1x1_half(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const half* A,
half* AP,
DivModFast eAlignD,
DivModFast owD,
DivModFast ohD
) {
int eAlign = matmulParam->elhPack[0] * MATMULPACK;
int lAlign = matmulParam->elhPack[1];
int maxCount = eAlign * lAlign * BLOCK_INT4;
int kernelCount = 1;
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
int index = indexO / BLOCK_INT4;
int lR = indexO % BLOCK_INT4;
int eIndex, lIndex;
eAlignD.divmod(index, lIndex, eIndex);
int eU = eIndex / MATMULPACK;
int eR = eIndex % MATMULPACK;
int dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lIndex * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR * 8;
int4* dst = (int4*)(AP + dstOffset);
if (eIndex >= matmulParam->elh[0]) {
*dst = {0, 0, 0, 0};
continue;
}
// Compute for source
int ox, oy, ob;
owD.divmod(eIndex, oy, ox);
ohD.divmod(oy, ob, oy);
int sz = lIndex;
int sx = ox * param->strideX - param->padX;
int sy = oy * param->strideY - param->padY;
if (sx >= 0 && sx < param->iw) {
if (sy >=0 && sy < param->ih) {
int offset = sz * param->srcZStep + (ob * param->iw * param->ih + sy * param->iw + sx) * PACK_NUMBER + lR * 8;
int4* src = (int4*)(A + offset);
*dst = *src;
continue;
}
}
*dst = {0, 0, 0, 0};
}
}
__global__ void Im2Col1x1_half_OPT(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const int maxCount,
const half* A,
half* AP,
DivModFast eAlignD,
DivModFast owD,
DivModFast ohD
) {
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
int index = indexO >> 3;
int lR = indexO & 7;
int eIndex, lIndex;
eAlignD.divmod(index, lIndex, eIndex);
int eU = eIndex >> 4;
int eR = eIndex & 15;
int dstOffset = ((eU * matmulParam->elhPack[1] + lIndex) << 8) + (eR << 4) + (lR << 1);
int offset = lIndex * param->srcZStep + (eIndex << 4) + (lR << 1);
int* srcF = (int*)(A + offset);
int* dstH = (int*)(AP + dstOffset);
dstH[0] = srcF[0];
}
}
__global__ void Im2Col_half(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const int maxCount,
const half* A,
half* AP,
DivModFast d_eA,
DivModFast d_ow,
DivModFast d_oh,
DivModFast d_fxy,
DivModFast d_fx
) {
int eAlign = matmulParam->elhPack[0] << 4;
int lAlign = matmulParam->elhPack[1];
int kernelCount = param->kernelX * param->kernelY;
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
size_t index = indexO >> 1;
size_t lR = indexO & 1;
int eIndex, lIndex;
d_eA.divmod(index, lIndex, eIndex);
size_t eU = eIndex >> 4;
size_t eR = eIndex & 15;
size_t dstOffset = ((((eU * matmulParam->elhPack[1] + lIndex) << 4) + eR) << 4) + (lR << 3);
int4* dst = (int4*)(AP + dstOffset);
if (eIndex >= matmulParam->elh[0]) {
*dst = {0, 0, 0, 0};
continue;
}
// Compute for source
int ox, oby, ob, oy, sz, kI, ksx, ksy;
d_ow.divmod(eIndex, oby, ox);
d_oh.divmod(oby, ob, oy);
d_fxy.divmod(lIndex, sz, kI);
d_fx.divmod(kI, ksy, ksx);
size_t sx = ox * param->strideX + ksx * param->dilateX - param->padX;
size_t sy = oy * param->strideY + ksy * param->dilateY - param->padY;
if (sx >= 0 && sx < param->iw) {
if (sy >=0 && sy < param->ih) {
size_t offset = sz * param->srcZStep + (((ob * param->ih + sy) * param->iw + sx) << 4) + lR * 8;
int4* src = (int4*)(A + offset);
*dst = *src;
continue;
}
}
*dst = {0, 0, 0, 0};
}
}
__global__ void Im2Col_half_OPT(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const size_t maxCount,
const half* A,
half* AP,
DivModFast d_eA,
DivModFast d_ow,
DivModFast d_oh,
DivModFast d_fxy,
DivModFast d_fx
) {
size_t eAlign = matmulParam->elhPack[0] << 4;
size_t lAlign = matmulParam->elhPack[1];
size_t kernelCount = param->kernelX * param->kernelY;
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
size_t index = indexO >> 2;
size_t lR = indexO & 3;
int eIndex, lIndex;
d_eA.divmod(index, lIndex, eIndex);
size_t eU = eIndex >> 4;
size_t eR = eIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex) << 4) + eR) << 4) + (lR << 2);
int2* dst = (int2*)(AP + dstOffset);
if (eIndex >= matmulParam->elh[0]) {
*dst = {0, 0};
continue;
}
// Compute for source
int ox, oby, ob, oy, sz, kI, ksx, ksy;
d_ow.divmod(eIndex, oby, ox);
d_oh.divmod(oby, ob, oy);
d_fxy.divmod(lIndex, sz, kI);
d_fx.divmod(kI, ksy, ksx);
size_t sx = ox * param->strideX + ksx * param->dilateX - param->padX;
size_t sy = oy * param->strideY + ksy * param->dilateY - param->padY;
if (sx >= 0 && sx < param->iw) {
if (sy >=0 && sy < param->ih) {
size_t offset = sz * param->srcZStep + (((ob * param->ih + sy) * param->iw + sx) << 4) + (lR << 2);
int2* src = (int2*)(A + offset);
*dst = *src;
continue;
}
}
*dst = {0, 0};
}
}
__global__ void Im2Col_half_3x3S1D1P1_OPT2(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const size_t maxCount,
const half* A,
half* AP,
DivModFast d_eA,
DivModFast d_ow,
DivModFast d_oh
) {
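// Specialization for 3x3 kernels with stride 1, dilation 1 and pad 1, where the output
// spatial size equals the input's: each thread loads one packed input pixel and scatters it
// into the nine column-buffer slots it contributes to, zero-filling the slots that would
// fall outside the image border (the "corner case" blocks below).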
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
size_t index = indexO >> 3;
size_t lR = indexO & 7;
int eIndex, lIndex;
d_eA.divmod(index, lIndex, eIndex);
int ix, oby, ob, iy;
d_ow.divmod(eIndex, oby, ix);
d_oh.divmod(oby, ob, iy);
size_t sz = lIndex;
size_t offset = sz * param->srcZStep + (((ob * param->ih + iy) * param->iw + ix) << 4) + (lR << 1);
int src = *((int*)(A + offset));
// Pixel (iy-1, ix-1)
if(iy-1 >=0 && ix-1 >=0) {
size_t oeIndex = (ob * param->ih * param->iw + (iy-1) * param->iw + (ix-1));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 8) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy-1 ==0) {
size_t index[3] = {0, 1, 2};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix-1 ==0) {
size_t index[3] = {0, 3, 6};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
// Pixel (iy-1, ix+0)
if(iy-1 >=0) {
size_t oeIndex = (ob * param->ih * param->iw + (iy-1) * param->iw + (ix+0));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 7) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy-1 ==0) {
size_t index[3] = {0, 1, 2};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix ==0) {
size_t index[3] = {0, 3, 6};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix == param->iw-1) {
size_t index[3] = {2, 5, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
// Pixel (iy-1, ix+1)
if(iy-1 >=0 && ix+1 < param->iw) {
size_t oeIndex = (ob * param->ih * param->iw + (iy-1) * param->iw + (ix+1));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 6) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy-1 ==0) {
size_t index[3] = {0, 1, 2};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix+1 == param->iw-1) {
size_t index[3] = {2, 5, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
// Pixel (iy+0, ix-1)
if(ix-1 >=0) {
size_t oeIndex = (ob * param->ih * param->iw + (iy+0) * param->iw + (ix-1));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 5) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy ==0) {
size_t index[3] = {0, 1, 2};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(iy == param->ih-1) {
size_t index[3] = {6, 7, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix-1 ==0) {
size_t index[3] = {0, 3, 6};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
// Pixel (iy, ix)
if(1) {
size_t oeIndex = (ob * param->ih * param->iw + (iy+0) * param->iw + (ix+0));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 4) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy ==0) {
size_t index[3] = {0, 1, 2};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(iy == param->ih-1) {
size_t index[3] = {6, 7, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix ==0) {
size_t index[3] = {0, 3, 6};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix == param->iw-1) {
size_t index[3] = {2, 5, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
// Pixel (iy, ix+1)
if(ix+1 < param->iw) {
size_t oeIndex = (ob * param->ih * param->iw + (iy+0) * param->iw + (ix+1));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 3) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy ==0) {
size_t index[3] = {0, 1, 2};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(iy == param->ih-1) {
size_t index[3] = {6, 7, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix+1 == param->iw-1) {
size_t index[3] = {2, 5, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
// Pixel (iy+1, ix-1)
if(iy+1 < param->ih && ix-1 >=0) {
size_t oeIndex = (ob * param->ih * param->iw + (iy+1) * param->iw + (ix-1));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 2) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy+1 == param->ih-1) {
size_t index[3] = {6, 7, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix-1 ==0) {
size_t index[3] = {0, 3, 6};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
// Pixel (iy+1, ix)
if(iy+1 < param->ih) {
size_t oeIndex = (ob * param->ih * param->iw + (iy+1) * param->iw + (ix+0));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 1) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy+1 == param->ih-1) {
size_t index[3] = {6, 7, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix ==0) {
size_t index[3] = {0, 3, 6};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix == param->iw-1) {
size_t index[3] = {2, 5, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
//Pixel (iy+1, ix+1)
if(iy+1 < param->ih && ix+1 < param->iw) {
size_t oeIndex = (ob * param->ih * param->iw + (iy+1) * param->iw + (ix+1));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 0) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy+1 == param->ih-1) {
size_t index[3] = {6, 7, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix+1 == param->iw-1) {
size_t index[3] = {2, 5, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
}
}
void Im2ColMain(CUDARuntime* runtime, const MatMulParam* cpuMatlMul, const MatMulParam* gpuMatMul, const ConvolutionCommon::Im2ColParameter* cpuIm2Col, const ConvolutionCommon::Im2ColParameter* gpuIm2Col,\
const void* input_addr, __half* mIm2ColBuffer, int bytes) {
size_t eAlign = cpuMatlMul->elhPack[0] * MATMULPACK;
size_t lAlign = cpuMatlMul->elhPack[1];
DivModFast eAlignD(eAlign);
DivModFast owD(cpuIm2Col->ow);
DivModFast ohD(cpuIm2Col->oh);
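// Dispatch order below: a fully aligned 1x1 fast path, the generic 1x1 path, a half-precision
// specialization for 3x3 / stride 1 / dilation 1 / pad 1, and the generic im2col fallback.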
if (cpuIm2Col->kernelX == 1 && cpuIm2Col->kernelY == 1 && \
cpuMatlMul->elh[0] % 16 == 0 && \
cpuIm2Col->strideX == 1 && cpuIm2Col->strideY == 1 && \
cpuIm2Col->dilateX == 1 && cpuIm2Col->dilateY == 1 && \
cpuIm2Col->padX == 0 && cpuIm2Col->padY == 0) {
size_t maxCount = eAlign * lAlign * 8;//Align 2
int block_num = runtime->blocks_num(maxCount);
int block_size = runtime->threads_num();
if(bytes == 4) {
Im2Col1x1_OPT<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount,
(const float*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD);
checkKernelErrors;
} else {
Im2Col1x1_half_OPT<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount,
(const half*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD);
checkKernelErrors;
}
} else if (cpuIm2Col->kernelX == 1 && cpuIm2Col->kernelY == 1) {
size_t maxCount = eAlign * lAlign * 2;//Align 8
int block_num = runtime->blocks_num(maxCount);
int block_size = runtime->threads_num();
if(bytes == 4) {
Im2Col1x1<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, (const float*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD);
checkKernelErrors;
} else {
Im2Col1x1_half<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, (const half*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD);
checkKernelErrors;
}
} else if(cpuIm2Col->kernelX == 3 && cpuIm2Col->kernelY == 3 && \
cpuIm2Col->strideX == 1 && cpuIm2Col->strideY == 1 && \
cpuIm2Col->dilateX == 1 && cpuIm2Col->dilateY == 1 && \
cpuIm2Col->padX == 1 && cpuIm2Col->padY == 1 && \
bytes == 2) {
size_t maxCount = eAlign * (lAlign / 9) * 8;
size_t block_num = runtime->blocks_num(maxCount);
size_t block_size = runtime->threads_num();
//printf("%d-%d-%d-%d-%d, %d-%d\n", cpuIm2Col->icDiv4, cpuIm2Col->ih, cpuIm2Col->iw, cpuIm2Col->oh, cpuIm2Col->ow, eAlign, lAlign);
Im2Col_half_3x3S1D1P1_OPT2<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount, (const half*)input_addr, mIm2ColBuffer,\
eAlignD, owD, ohD);
checkKernelErrors;
} else {
size_t maxCount = eAlign * lAlign * 2;
size_t block_num = runtime->blocks_num(maxCount);
size_t block_size = runtime->threads_num();
if(bytes == 4) {
Im2Col<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, (const float*)input_addr, mIm2ColBuffer);
checkKernelErrors;
} else {
DivModFast fxyD((cpuIm2Col->kernelX*cpuIm2Col->kernelY));
DivModFast fxD(cpuIm2Col->kernelX);
maxCount = eAlign * lAlign * 4;
block_num = runtime->blocks_num(maxCount);
block_size = runtime->threads_num();
//Im2Col_half<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount, (const half*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD, fxyD, fxD);
Im2Col_half_OPT<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount, (const half*)input_addr, mIm2ColBuffer,
eAlignD, owD, ohD, fxyD, fxD);
checkKernelErrors;
}
}
}
} // namespace CUDA
} // namespace MNN

View File

@ -0,0 +1,24 @@
//
// ImageColumn.cuh
// MNN
//
// Created by MNN on 2021/01/10.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef IMAGE_COLUMN_CUH
#define IMAGE_COLUMN_CUH
#include "backend/cuda/core/runtime/CUDARuntime.hpp"
#include "TensorCoreGemm.cuh"
#include "backend/cuda/core/CUDABackend.hpp"
namespace MNN {
namespace CUDA {
void Im2ColMain(CUDARuntime* runtime, const MatMulParam* cpuMatlMul, const MatMulParam* gpuMatMul, const ConvolutionCommon::Im2ColParameter* cpuIm2Col, const ConvolutionCommon::Im2ColParameter* gpuIm2Col, const void* input_addr, __half* mIm2ColBuffer, int bytes);
} // namespace CUDA
} // namespace MNN
#endif

View File

@ -1,27 +1,51 @@
#include "InterpExecution.hpp"
#include "MNNCUDADefine.hpp"
#include "MNNCUDAFunction.cuh"
namespace MNN {
namespace CUDA {
#define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
template<typename T>
__global__ void INTERP(const int n, const int ih, const int iw, const int oh, const int ow,
__global__ void INTERP_NERAEST(const int n, const int ih, const int iw, const int oh, const int ow,
const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out) {
CUDA_KERNEL_LOOP(index, n) {
CUDA_KERNEL_LOOP(total, n) {
int index = total / PACK_NUMBER;
int remain = total % PACK_NUMBER;
int x = index % ow;
int tmp = index / ow;
int y = tmp % oh;
int z = tmp / oh;
int ix = min(max(0, (int)floor((float)x*scalew+offsetw)), iw-1);
int iy = min(max(0, (int)floor((float)y*scaleh+offseth)), ih-1);
out[z*oh*ow + y*ow + x] = in[z*ih*iw + iy*iw + ix];
out[(z*oh*ow + y*ow + x) * PACK_NUMBER + remain]
= in[(z*ih*iw + iy*iw + ix) * PACK_NUMBER + remain];
}
}
template<typename T>
__global__ void INTERP_NERAEST_ROUND(const int n, const int ih, const int iw, const int oh, const int ow,
const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out) {
CUDA_KERNEL_LOOP(total, n) {
int index = total / PACK_NUMBER;
int remain = total % PACK_NUMBER;
int x = index % ow;
int tmp = index / ow;
int y = tmp % oh;
int z = tmp / oh;
int ix = min(max(0, (int)floor((float)x*scalew+offsetw + 0.499f)), iw-1);
int iy = min(max(0, (int)floor((float)y*scaleh+offseth + 0.499f)), ih-1);
out[(z*oh*ow + y*ow + x) * PACK_NUMBER + remain]
= in[(z*ih*iw + iy*iw + ix) * PACK_NUMBER + remain];
}
}
template<typename T>
__global__ void INTERP_BILINEAR(const int n, const int ih, const int iw, const int oh, const int ow,
const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out) {
CUDA_KERNEL_LOOP(index, n) {
CUDA_KERNEL_LOOP(total, n) {
int index = total / PACK_NUMBER;
int remain = total % PACK_NUMBER;
int x = index % ow;
int tmp = index / ow;
int y = tmp % oh;
@ -37,11 +61,97 @@ __global__ void INTERP_BILINEAR(const int n, const int ih, const int iw, const i
int index_01 = z*ih*iw + iy_0*iw + ix_1;
int index_10 = z*ih*iw + iy_1*iw + ix_0;
int index_11 = z*ih*iw + iy_1*iw + ix_1;
index_00 = index_00 * PACK_NUMBER + remain;
index_01 = index_01 * PACK_NUMBER + remain;
index_10 = index_10 * PACK_NUMBER + remain;
index_11 = index_11 * PACK_NUMBER + remain;
float factor_x = fx-ix_0;
float factor_y = fy-iy_0;
out[z*oh*ow + y*ow + x] = (1.0-factor_x)*(1.0-factor_y)*in[index_00] + factor_x*(1.0-factor_y)*in[index_01] +
(1.0-factor_x)*factor_y*in[index_10] + factor_x*factor_y*in[index_11];
out[(z*oh*ow + y*ow + x) * PACK_NUMBER + remain] =
(1.0-factor_x)*(1.0-factor_y)*(float)in[index_00] + factor_x*(1.0-factor_y)*(float)in[index_01] +
(1.0-factor_x)*factor_y*(float)in[index_10] + factor_x*factor_y*(float)in[index_11];
}
}
template<typename T>
__global__ void INTERP_BILINEAR_OPT(const int n, const int ih, const int iw, const int oh, const int ow,
const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out,
DivModFast d_ow, DivModFast d_oh) {
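// Each thread produces two horizontally adjacent output pixels (x and x+1), so the four
// bilinear neighbours loaded for the first pixel can be reused for the second whenever the
// source columns coincide.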
CUDA_KERNEL_LOOP(total, n) {
int index = total >> 4;
int remain = total & 15;
int tmp, x_idx, y, z;
d_ow.divmod(index, tmp, x_idx);
d_oh.divmod(tmp, z, y);
size_t x = x_idx << 1;
float fx = x*scalew+offsetw;
int ix_0 = min(max(0, (int)floor(fx)), iw-1);
int ix_1 = min((int)ceil(fx), iw-1);
float fx_1 = fx + scalew;
int ix_2 = min(max(0, (int)floor(fx_1)), iw-1);
int ix_3 = min((int)ceil(fx_1), iw-1);
float fy = y*scaleh+offseth;
int iy_0 = min(max(0, (int)floor(fy)), ih-1);
int iy_1 = min((int)ceil(fy), ih-1);
int index_00 = (z*ih+ iy_0)*iw + ix_0;
int index_01 = index_00 - ix_0 + ix_1;
int index_10 = (z*ih+ iy_1)*iw + ix_0;
int index_11 = index_10 - ix_0 + ix_1;
index_00 = (index_00 << 4) + remain;
index_01 = (index_01 << 4) + remain;
index_10 = (index_10 << 4) + remain;
index_11 = (index_11 << 4) + remain;
float factor_x = fx-ix_0;
float factor_y = fy-iy_0;
float in_00 = (float)in[index_00];
float in_01 = (float)in[index_01];
float in_10 = (float)in[index_10];
float in_11 = (float)in[index_11];
float factor_00 = (1.0-factor_x)*(1.0-factor_y);
float factor_01 = factor_x*(1.0-factor_y);
float factor_10 = (1.0-factor_x)*factor_y;
float factor_11 = factor_x*factor_y;
size_t dstOffset = (((z*oh+ y)*ow + x) << 4) + remain;
out[dstOffset] = \
factor_00* in_00 + factor_01*in_01 + \
factor_10* in_10 + factor_11*in_11;
if(x+1 >= ow) {
continue;
}
if(ix_2 != ix_0) {
index_00 = index_00 + ((ix_2-ix_0) << 4);
index_10 = index_10 + ((ix_2-ix_0) << 4);
in_00 = (float)in[index_00];
in_10 = (float)in[index_10];
}
if(ix_3 != ix_1) {
index_01 = index_01 + ((ix_3-ix_1) << 4);
index_11 = index_11 + ((ix_3-ix_1) << 4);
in_01 = (float)in[index_01];
in_11 = (float)in[index_11];
}
if(factor_x != fx_1-ix_2) {
factor_x = fx_1-ix_2;
factor_00 = (1.0-factor_x)*(1.0-factor_y);
factor_01 = factor_x*(1.0-factor_y);
factor_10 = (1.0-factor_x)*factor_y;
factor_11 = factor_x*factor_y;
}
out[dstOffset+ PACK_NUMBER] = \
factor_00* in_00 + factor_01*in_01 + \
factor_10* in_10 + factor_11*in_11;
}
}
@ -70,7 +180,7 @@ ErrorCode InterpExecution::onResize(const std::vector<Tensor *> &inputs, const s
mOutputHeight = output->height();
mOutputWidth = output->width();
mCount = mBatch*mChannel*mOutputHeight*mOutputWidth;
mCount = mBatch*UP_DIV(mChannel, PACK_NUMBER)*mOutputHeight*mOutputWidth * PACK_NUMBER;
//printf("mBatch:%d-mChannel:%d-mInputHeight:%d- mInputWidth:%d- mOutputHeight:%d- mOutputWidth:%d, mScaleHeight:%f- mScaleWidth:%f %f %f\n", mBatch, mChannel, mInputHeight,mInputWidth,mOutputHeight, mOutputWidth, mScaleHeight, mScaleWidth, mWidthOffset, mHeightOffset);
return NO_ERROR;
}
@ -82,13 +192,39 @@ ErrorCode InterpExecution::onExecute(const std::vector<Tensor *> &inputs, const
int threads_num = runtime->threads_num();
auto input_addr = (void*)inputs[0]->deviceId();
auto output_addr = (void*)outputs[0]->deviceId();
if (static_cast<CUDABackend*>(backend())->useFp16()) {
if(mResizeType == 1){
INTERP_NERAEST<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr);
} else if(mResizeType == 2) {
//INTERP_BILINEAR<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,\
mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr);
mCount = mBatch*UP_DIV(mChannel, PACK_NUMBER)*mOutputHeight*((mOutputWidth+1)/ 2) * PACK_NUMBER;
block_num = runtime->blocks_num(mCount);
threads_num = runtime->threads_num();
DivModFast d_ow((mOutputWidth+1)/2);
DivModFast d_oh(mOutputHeight);
INTERP_BILINEAR_OPT<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,\
mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr, d_ow, d_oh);
} else if (mResizeType == 4) {
INTERP_NERAEST_ROUND<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr);
}
return NO_ERROR;
}
if(mResizeType == 1){
INTERP<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
INTERP_NERAEST<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const float *)input_addr, (float *)output_addr);
} else if(mResizeType == 2) {
INTERP_BILINEAR<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const float *)input_addr, (float *)output_addr);
} else if (mResizeType == 4) {
INTERP_NERAEST_ROUND<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const float *)input_addr, (float *)output_addr);
}
return NO_ERROR;
}
@ -98,7 +234,7 @@ public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
auto param = op->main_as_Interp();
if(param->resizeType() != 1 && param->resizeType() != 2) {
if(param->resizeType() == 3) {
MNN_PRINT("CUDA interp resize type:%d not support, back to CPU\n", param->resizeType());
return nullptr;
}

View File

@ -38,7 +38,7 @@ T blockReduceSum(T val)
template <typename T>
__global__
void input_layernorm(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon, int sumPerKnl)
void input_layernorm(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon, int sumPerKnl)
{
int tid = threadIdx.x;
@ -60,7 +60,7 @@ void input_layernorm(T* out, const T* input, const T* gamma, const T* beta, int
float var_tmp = 0.0f;
for(int idx=0; idx<sumPerKnl && idx*256 + tid < n; idx++) {
var_tmp += ((input[blockIdx.x * n + idx*256 + tid] - s_mean) * (input[blockIdx.x * n + idx*256 + tid] - s_mean));
var_tmp += (((float)input[blockIdx.x * n + idx*256 + tid] - s_mean) * ((float)input[blockIdx.x * n + idx*256 + tid] - s_mean));
}
variance += blockReduceSum<float>(var_tmp);
if(threadIdx.x == 0)
@ -69,14 +69,14 @@ void input_layernorm(T* out, const T* input, const T* gamma, const T* beta, int
for(int idx=0; idx<sumPerKnl && idx*256 + tid < n; idx++) {
out[blockIdx.x * n + idx*256+tid] =
(T)(((input[blockIdx.x * n + idx*256 + tid] - s_mean) * rsqrtf(s_variance)) * (float)(__ldg(&gamma[idx*256 + tid])) + (float)(__ldg(&beta[idx*256 + tid])));
(T)((((float)input[blockIdx.x * n + idx*256 + tid] - s_mean) * rsqrtf(s_variance)) * (float)(__ldg(&gamma[idx*256 + tid])) + (float)(__ldg(&beta[idx*256 + tid])));
}
}
template <typename T>
__global__
void input_layernorm_2048(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
void input_layernorm_2048(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon)
{
int tid = threadIdx.x;
@ -128,7 +128,7 @@ void input_layernorm_2048(T* out, const T* input, const T* gamma, const T* beta,
template <typename T>
__global__
void input_layernorm_1024(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
void input_layernorm_1024(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon)
{
int tid = threadIdx.x;
@ -176,7 +176,7 @@ void input_layernorm_1024(T* out, const T* input, const T* gamma, const T* beta,
template <typename T>
__global__
void input_layernorm_512(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
void input_layernorm_512(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon)
{
int tid = threadIdx.x;
@ -217,25 +217,25 @@ void input_layernorm_512(T* out, const T* input, const T* gamma, const T* beta,
template<typename T>
__global__ void LAYERNORM(const int count, const int outside, const int inside, const float epsilon,
const T* in, T* out, const T* gamma_data, const T* beta_data) {
const T* in, T* out, const float* gamma_data, const float* beta_data) {
CUDA_KERNEL_LOOP(i, count) {
const int o = i / inside;
const int index = i % inside;
const T* inner_input = in + o * inside;
T* inner_output = out + o * inside;
T sum = 0.f;
float sum = 0.f;
for (int j = 0; j < inside; ++j) {
sum += inner_input[j];
sum += (float)inner_input[j];
}
T mean = sum / inside;
T square_sum = 0.f;
float mean = sum / inside;
float square_sum = 0.f;
for (int j = 0; j < inside; ++j) {
square_sum += (inner_input[j] - mean) * (inner_input[j] - mean);
square_sum += ((float)inner_input[j] - mean) * ((float)inner_input[j] - mean);
}
T variable = square_sum / inside;
float variable = square_sum / inside;
variable = 1.f / sqrt(variable + epsilon);
inner_output[index] = (inner_input[index] - mean) * variable * gamma_data[index] + beta_data[index];
inner_output[index] = ((float)inner_input[index] - mean) * variable * gamma_data[index] + beta_data[index];
}
}
@ -249,7 +249,7 @@ LayerNormExecution::LayerNormExecution(const LayerNorm* layer_norm_param, Backen
mEps = layer_norm_param->epsilon();
int size = layer_norm_param->gamma()->size();
mGammaTensor.reset(Tensor::createDevice<float>({size}));
mGammaTensor.reset(Tensor::createDevice<int32_t>({size}));
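// int32_t keeps 4 bytes per element, presumably so gamma stays fp32 even when the backend
// stores float tensors as fp16; the kernels above always read gamma/beta as const float*.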
auto status = backend->onAcquireBuffer(mGammaTensor.get(), Backend::STATIC);
if (!status) {
MNN_ERROR("Out of memory when gamma is acquired in CudaLayerNorm.\n");
@ -262,7 +262,7 @@ LayerNormExecution::LayerNormExecution(const LayerNorm* layer_norm_param, Backen
if (layer_norm_param->beta()->size() != size) {
MNN_ERROR("Size of gamma and beta are not match in CudaLayerNorm.\n");
}
mBetaTensor.reset(Tensor::createDevice<float>({size}));
mBetaTensor.reset(Tensor::createDevice<int32_t>({size}));
status = backend->onAcquireBuffer(mBetaTensor.get(), Backend::STATIC);
if (!status) {
MNN_ERROR("Out of memory when beta is acquired in CudaLayerNorm.\n");
@ -274,12 +274,7 @@ LayerNormExecution::LayerNormExecution(const LayerNorm* layer_norm_param, Backen
}
LayerNormExecution::~LayerNormExecution() {
if (nullptr != mGammaTensor) {
backend()->onReleaseBuffer(mGammaTensor.get(), Backend::STATIC);
}
if (nullptr != mBetaTensor) {
backend()->onReleaseBuffer(mBetaTensor.get(), Backend::STATIC);
}
// Do nothing
}
ErrorCode LayerNormExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
@ -314,6 +309,28 @@ ErrorCode LayerNormExecution::onExecute(const std::vector<Tensor *> &inputs, con
int threads_num = runtime->threads_num();
auto input_addr = (void*)inputs[0]->deviceId();
auto output_addr = (void*)outputs[0]->deviceId();
if (static_cast<CUDABackend*>(backend())->useFp16()) {
if(mInside < 128) {
LAYERNORM<<<block_num, threads_num>>>(mOutside*mInside, mOutside, mInside, mEps, (const half *)input_addr, (half *)output_addr,
(const float *)mDeviceGamma, (const float *)mDeviceBeta);
} else {
if(mInside == 2048) {
input_layernorm_2048<<<mOutside, 256>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
(const float *)mDeviceBeta, mOutside, mInside, mEps);
} else if(mInside == 1024) {
input_layernorm_1024<<<mOutside, 256>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
(const float *)mDeviceBeta, mOutside, mInside, mEps);
} else if(mInside == 512) {
input_layernorm_512<<<mOutside, 256>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
(const float *)mDeviceBeta, mOutside, mInside, mEps);
} else {
int sumPerKnl = (mInside+255) / 256;
input_layernorm<<<mOutside, 256>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
(const float *)mDeviceBeta, mOutside, mInside, mEps, sumPerKnl);
}
}
return NO_ERROR;
}
if(mInside < 128) {
LAYERNORM<<<block_num, threads_num>>>(mOutside*mInside, mOutside, mInside, mEps, (const float *)input_addr, (float *)output_addr,

View File

@ -6,7 +6,6 @@
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <map>
#include "BatchMatMulExecution.hpp"
#include "MatMulExecution.hpp"
#include "backend/cuda/core/CUDABackend.hpp"
#include "Raster.cuh"
@ -34,18 +33,21 @@ public:
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
auto op = cmd->op();
if (OpType_MatMul == op->type() && mLoop->parallel() && mLoop->loopNumber() > 1) {
auto& unit = mExecutions[0];
unit.exe.reset(new BatchMatMulExecution(op->main_as_MatMul()->transposeA(), op->main_as_MatMul()->transposeB(), backend()));
if (nullptr == unit.exe) {
return OUT_OF_MEMORY;
}
unit.inputs = inputs;
unit.outputs = outputs;
auto code = unit.exe->onResize(unit.inputs, unit.outputs);
if (NO_ERROR != code) {
return code;
if (inputs.size() <= 3) {
auto& unit = mExecutions[0];
unit.exe.reset(new MatMulExecution(op->main_as_MatMul()->transposeA(), op->main_as_MatMul()->transposeB(), backend()));
if (nullptr == unit.exe) {
return OUT_OF_MEMORY;
}
unit.inputs = inputs;
unit.outputs = outputs;
auto code = unit.exe->onResize(unit.inputs, unit.outputs);
if (NO_ERROR != code) {
return code;
}
mSingleMatMul = true;
return NO_ERROR;
}
return NO_ERROR;
}
}
@ -134,21 +136,22 @@ public:
virtual ErrorCode onExecute(const std::vector<Tensor *> &originInputs, const std::vector<Tensor *> &originOutputs) override {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
if (mSingleMatMul) {
auto& unit = mExecutions[0];
unit.inputs = originInputs;
unit.outputs = originOutputs;
auto code = unit.exe->onExecute(unit.inputs, unit.outputs);
if (NO_ERROR != code) {
return code;
}
return NO_ERROR;
}
if (1 == mLoop->commands()->size()) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
auto op = cmd->op();
if (OpType_MatMul == op->type() && mLoop->parallel() && mLoop->loopNumber() > 1) {
auto& unit = mExecutions[0];
unit.inputs = originInputs;
unit.outputs = originOutputs;
auto code = unit.exe->onExecute(unit.inputs, unit.outputs);
if (NO_ERROR != code) {
return code;
}
return NO_ERROR;
}
if (OpType_UnaryOp == op->type() && nullptr == op->main()) {
Tensor::InsideDescribe::Region reg;
@ -160,7 +163,7 @@ public:
auto input = mStack[cmd->indexes()->data()[1]];
auto inputSize = input->elementSize();
auto output = mStack[cmd->indexes()->data()[0]];
auto bytes = input->getType().bytes();
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(input);
auto step0 = cmd->steps()->data()[0];
auto step1 = cmd->steps()->data()[1];
auto loopNumber = mLoop->loopNumber();
@ -189,7 +192,7 @@ public:
for (auto& iter : mIndiceCopy) {
backend()->onCopyBuffer(iter.first, iter.second);
}
auto bytes = sizeof(float);//TODO: Support Half
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(originOutputs[0]);
for (int iter=0; iter < mLoop->loopNumber(); ++iter) {
for (int index=0; index<mLoop->commands()->size(); ++index) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(index);
@ -205,7 +208,7 @@ public:
}
auto view = cmd->view()->GetAs<View>(v);
offset = offset * cmd->steps()->data()[v] + view->offset();
mStackPtr[tensorIndex] = tensor->deviceId() + offset * bytes;
mStackPtr[tensorIndex] = tensor->deviceId() + offset * static_cast<CUDABackend*>(backend())->getBytes(tensor);
}
if (OpType_UnaryOp == op->type()) {
auto src = (float*)mStackPtr[cmd->indexes()->data()[1]];
@ -233,6 +236,10 @@ public:
continue;
}
if (OpType_BinaryOp == op->type()) {
auto type = halide_type_of<float>();
if (static_cast<CUDABackend*>(backend())->useFp16()) {
type.bits = 16;
}
auto src0 = mStackPtr[cmd->indexes()->data()[1]];
auto src1 = mStackPtr[cmd->indexes()->data()[2]];
auto dst = mStackPtr[cmd->indexes()->data()[0]];
@ -242,7 +249,7 @@ public:
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
BinaryBlit((uint8_t*)dst, (const uint8_t*)src0, (const uint8_t*)src1,
cmd->size()->data(), srcStride0, srcStride1, dstStride, halide_type_of<float>(), runtime, opType);
cmd->size()->data(), srcStride0, srcStride1, dstStride, type, runtime, opType);
}
}
@ -256,6 +263,7 @@ private:
std::vector<Unit> mExecutions;
std::vector<uint64_t> mStackPtr;
std::map<Tensor*, Tensor*> mIndiceCopy;
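// Set in onResize when the whole loop collapses into one batched MatMul; onExecute then
// dispatches straight to mExecutions[0] instead of iterating the loop commands.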
bool mSingleMatMul = false;
};
class LoopCreator : public CUDABackend::Creator {

View File

@ -0,0 +1,18 @@
#ifndef MNNCUDADEFINE_HPP
#define MNNCUDADEFINE_HPP
#define PACK_NUMBER 16
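// Channel packing factor: tensors in this backend are laid out in packs of 16 channels, so
// kernels address an element as (packedPixelIndex * PACK_NUMBER + laneWithinPack).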
#define MNN_CUDA_HALF2_MAX(a, b) \
do { \
(a).x = __hgt((a).x, (b).x) ? (a).x : (b).x; \
(a).y = __hgt((a).y, (b).y) ? (a).y : (b).y; \
} while (0)
#define MNN_CUDA_HALF2_MIN(a, b) \
do { \
(a).x = __hlt((a).x, (b).x) ? (a).x : (b).x; \
(a).y = __hlt((a).y, (b).y) ? (a).y : (b).y; \
} while (0)
#endif
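A minimal usage sketch for the half2 clamp macros above (illustrative only; clampHalf2 is a hypothetical kernel, not part of MNN, and assumes <cuda_fp16.h> plus an sm_53+ device):
#include <cuda_fp16.h>
__global__ void clampHalf2(half2* data, int n, half2 lo, half2 hi) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        half2 v = data[i];
        MNN_CUDA_HALF2_MAX(v, lo); // elementwise max against the lower bound
        MNN_CUDA_HALF2_MIN(v, hi); // elementwise min against the upper bound
        data[i] = v;
    }
}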

View File

@ -0,0 +1,38 @@
#ifndef MNNCUDAFunction_cuh
#define MNNCUDAFunction_cuh
struct DivModFast {
DivModFast(int d = 1)
{
d_ = (d == 0) ? 1 : d;
for (l_ = 0;; ++l_) {
if ((1U << l_) >= d_)
break;
}
uint64_t one = 1;
uint64_t m = ((one << 32) * ((one << l_) - d_)) / d_ + 1;
m_ = static_cast<uint32_t>(m);
}
__device__ __inline__ int div(int idx) const
{
uint32_t tm = __umulhi(m_, idx); // get high 32-bit of the product
return (tm + idx) >> l_;
}
__device__ __inline__ int mod(int idx) const
{
return idx - d_ * div(idx);
}
__device__ __inline__ void divmod(int idx, int &quo, int &rem)
{
quo = div(idx);
rem = idx - quo * d_;
}
uint32_t d_; // divisor
uint32_t l_; // ceil(log2(d_))
uint32_t m_; // m' in the paper
};
#endif
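A minimal usage sketch for DivModFast (illustrative only; splitIndex is a hypothetical kernel, not part of MNN): the constructor precomputes a magic multiplier so device code can replace '/' and '%' by a fixed divisor with a multiply-high and a shift, e.g. when decomposing a flat index into coordinates:
__global__ void splitIndex(int n, DivModFast widthDiv, int* rows, int* cols) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        int row, col;
        widthDiv.divmod(i, row, col); // row = i / width, col = i % width
        rows[i] = row;
        cols[i] = col;
    }
}
// Host side: DivModFast widthDiv(width); splitIndex<<<blocks, threads>>>(n, widthDiv, rows, cols);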

View File

@ -15,12 +15,18 @@ MatMulExecution::~ MatMulExecution() {
}
ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto w0 = inputs[0]->length(1);
auto h0 = inputs[0]->length(0);
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto C = outputs[0];
auto dimensions = C->dimensions();
int batch = 1;
for (int i = 0; i < dimensions - 2; ++i) {
batch *= C->length(i);
}
auto e = C->length(dimensions-2);
auto h = C->length(dimensions-1);
auto w0 = inputs[0]->length(dimensions-1);
auto h0 = inputs[0]->length(dimensions-2);
auto e = C->length(0);
auto h = C->length(1);
auto l = w0;
if (mTransposeA) {
l = h0;
@ -29,6 +35,7 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
param.elh[0] = e;
param.elh[1] = l;
param.elh[2] = h;
param.batch = batch;
auto eU = UP_DIV(e, PACK_MATMUL);
auto lU = UP_DIV(l, PACK_MATMUL);
auto hU = UP_DIV(h, PACK_MATMUL);
@ -58,15 +65,17 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
param.cStride[0] = h;
param.cStride[1] = 0;
param.cStride[2] = 1;
param.split[0] = 1;
param.split[1] = 1;
param.split[2] = 1;
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
param.aPStride[0] = 256 * lU;
param.aPStride[1] = 16;
param.aPStride[2] = 16 * lU;
param.bPStride[0] = 256 * lU;
param.bPStride[1] = 16;
param.bPStride[2] = 16 * lU;
runtime->memcpy((uint8_t*)mParameters.first + mParameters.second, &param, sizeof(MatMulParam), MNNMemcpyHostToDevice);
// Alloc for temp buffer
auto aPackSize = eU * lU * PACK_MATMUL * PACK_MATMUL;
auto bPackSize = lU * hU * PACK_MATMUL * PACK_MATMUL;
auto aPackSize = eU * lU * PACK_MATMUL * PACK_MATMUL * batch;
auto bPackSize = lU * hU * PACK_MATMUL * PACK_MATMUL * batch;
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
mTempA = pool->alloc(aPackSize * sizeof(__half), false, 256);
@ -85,6 +94,11 @@ ErrorCode MatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const
auto APtr = (const float*)A->deviceId();
auto BPtr = (const float*)B->deviceId();
auto CDestPtr = (float*)C->deviceId();
int e = mParam.elh[0];
int l = mParam.elh[1];
int h = mParam.elh[2];
int batch = mParam.batch;
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);
auto aP = (__half*)((uint8_t*)mTempA.first + mTempA.second);
auto bP = (__half*)((uint8_t*)mTempB.first + mTempB.second);
@ -93,53 +107,8 @@ ErrorCode MatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const
biasPtr = (const float*)inputs[2]->deviceId();
}
auto param = (MatMulParam*)((uint8_t*)mParameters.first + mParameters.second);
GemmPrepareRerange(runtime, &mParam, param, APtr, aP, BPtr, bP);
GemmPackedMain(runtime, &mParam, param, CDestPtr, aP, bP, biasPtr);
return NO_ERROR;
auto blasHandle = runtime->cublas_handle();
auto w0 = inputs[0]->length(1);
auto h0 = inputs[0]->length(0);
auto e = C->length(0);
auto h = C->length(1);
auto l = w0;
if (mTransposeA) {
l = h0;
}
float alpha = 1.0f;
float beta = 0.0f;
auto tranB = CUBLAS_OP_N;
auto ldB = h;
if (mTransposeB) {
ldB = l;
tranB = CUBLAS_OP_T;
}
auto tranA = CUBLAS_OP_N;
auto ldA = l;
if (mTransposeA) {
ldA = e;
tranA = CUBLAS_OP_T;
}
int block_num = runtime->blocks_num(e*h);
int threads_num = runtime->threads_num();
//[e, l] x [l, h] -> [e, h]
auto status = cublasSgemm(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, APtr, ldA, &beta, CDestPtr, h);
cublas_check(status);
//cudaThreadSynchronize();
// } else {
// auto CPtr = (float*)mTempOutput->deviceId();
// auto status = cublasSgemm(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, APtr, ldA, &beta, CPtr, h);
// cublas_check(status);
// //cudaThreadSynchronize();
// //bias: [e, h] + [h] -> [e, h]
// add_bias<<<block_num, threads_num>>>((float*)CPtr, (float*)CDestPtr, (const float*)inputs[2]->deviceId(), e, h);
// }
GemmPrepareRerange(runtime, &mParam, param, APtr, aP, BPtr, bP, bytes);
GemmPackedMain(runtime, &mParam, param, CDestPtr, aP, bP, biasPtr, bytes, false, false);
return NO_ERROR;
}
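A host-side sketch (not MNN code; shapes and names are assumptions) of how the resize logic above derives batch, e, l and h: batch is the product of all but the last two output dimensions, e and h come from those last two, and l is taken from A's last (or second-to-last, when transposed) dimension.
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> cShape = {4, 128, 256}; // output C: [batch..., e, h]
    std::vector<int> aShape = {4, 128, 64};  // input  A: [batch..., h0, w0]
    bool transposeA = false;
    int dims = (int)cShape.size();
    int batch = 1;
    for (int i = 0; i < dims - 2; ++i) {
        batch *= cShape[i];
    }
    int e  = cShape[dims - 2];
    int h  = cShape[dims - 1];
    int w0 = aShape[dims - 1];
    int h0 = aShape[dims - 2];
    int l  = transposeA ? h0 : w0;
    std::printf("batch=%d e=%d l=%d h=%d\n", batch, e, l, h); // batch=4 e=128 l=64 h=256
    return 0;
}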

View File

@ -28,6 +28,7 @@ private:
std::pair<void*, int> mTempB;
std::pair<void*, int> mParameters; // In GPU
MatMulParam mParam; // In CPU
bool mUseBlas = false;
};
} // namespace CUDA
} // namespace MNN

View File

@ -1,62 +1,71 @@
#include "PReLUExecution.hpp"
#include "MNNCUDADefine.hpp"
namespace MNN {
namespace CUDA {
#define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
template<typename T>
__global__ void PRELU(const int n, const int channels, const int dim, const T* in, T* out,
const T* slopeData, int div_factor) {
CUDA_KERNEL_LOOP(index, n) {
const float* slopeData, int div_factor) {
CUDA_KERNEL_LOOP(t, n) {
int index = t / PACK_NUMBER;
int r = t % PACK_NUMBER;
int c = (index / dim) % channels / div_factor;
out[index] = in[index] > 0 ? in[index] : in[index]*slopeData[c];
float iv = (float)in[t];
float ov = iv > 0.0 ? iv : iv * slopeData[c * PACK_NUMBER + r];
out[t] = (T)ov;
}
}
PReLUExecution::PReLUExecution(const PRelu* prelu, Backend *backend) : Execution(backend) {
int slopCount = prelu->slope()->size();
auto alphaData = prelu->slope()->data();
preluTensor.reset(Tensor::createDevice<float>({slopCount}));
backend->onAcquireBuffer(preluTensor.get(), Backend::STATIC);
mDeviceSlope = (void *)preluTensor.get()->buffer().device;
auto staticPool = static_cast<CUDABackend*>(backend)->getStaticBufferPool();
auto slopeSize = UP_DIV(slopCount, PACK_NUMBER) * PACK_NUMBER * sizeof(float);
mPreluStorage = staticPool->alloc(slopeSize);
mDeviceSlope = (uint8_t*)mPreluStorage.first + mPreluStorage.second;
MNN_ASSERT(nullptr != mDeviceSlope);
cudaMemset(mDeviceSlope, 0, slopeSize);
cudaMemcpy(mDeviceSlope, alphaData, slopCount * sizeof(float), cudaMemcpyHostToDevice);
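// Note: the slope buffer is zero-padded up to a multiple of PACK_NUMBER so the packed
// PRELU kernel above can read it as slopeData[channelBlock * PACK_NUMBER + lane].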
mIsChannelShared = slopCount == 1;
}
PReLUExecution::~PReLUExecution() {
if (nullptr != preluTensor) {
backend()->onReleaseBuffer(preluTensor.get(), Backend::STATIC);
}
auto staticPool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
staticPool->free(mPreluStorage);
}
ErrorCode PReLUExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(inputs.size() == 1);
MNN_ASSERT(outputs.size() == 1);
auto input = inputs[0];
mBatch = input->length(0);
mChannel = input->length(1);
MNN_ASSERT(input->dimensions() >= 2);
mArea = 1;
mArea = input->length(0);
for (int i = 2; i < input->dimensions(); ++i) {
mArea *= input->length(i);
}
mCount = mBatch*mChannel*mArea;
mChannel = UP_DIV(input->length(1), PACK_NUMBER);
mCount = mChannel*mArea * PACK_NUMBER;
//printf("mBatch:%d- mChannel:%d- mArea:%d- mCount:%d\n", mBatch,mChannel,mArea, mCount);
return NO_ERROR;
}
ErrorCode PReLUExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);
int block_num = runtime->blocks_num(mCount);
int threads_num = runtime->threads_num();
auto input_addr = (void*)inputs[0]->deviceId();
auto output_addr = (void*)outputs[0]->deviceId();
int div_factor = mIsChannelShared ? mChannel : 1;
PRELU<<<block_num, threads_num>>>(mCount, mChannel, mArea, (const float *)input_addr, (float *)output_addr,
(const float *)mDeviceSlope, div_factor);
if (2 == bytes) {
PRELU<<<block_num, threads_num>>>(mCount, mChannel, mArea, (const half *)input_addr, (half *)output_addr,
(const float *)mDeviceSlope, div_factor);
} else {
PRELU<<<block_num, threads_num>>>(mCount, mChannel, mArea, (const float *)input_addr, (float *)output_addr,
(const float *)mDeviceSlope, div_factor);
}
return NO_ERROR;
}

View File

@ -29,11 +29,9 @@ private:
CUDARuntime *mRuntime;
void *mDeviceSlope = nullptr;
int mCount;
int mBatch;
int mChannel;
int mArea;
std::shared_ptr<Tensor> preluTensor;
std::pair<void*, int> mPreluStorage;
bool mIsChannelShared = false;
};

View File

@ -1,90 +1,209 @@
#include <cuda_fp16.h>
#include "PoolExecution.hpp"
#include <float.h>
#include "MNNCUDADefine.hpp"
namespace MNN {
namespace CUDA {
template <typename T>
__global__ void avgpool(const T* uInput, T* uOutput,
int bc,
int ih, int iw,
int oh, int ow,
int padX, int padY,
int kernelX, int kernelY,
int strideX, int strideY
) {
int total = bc * oh * ow;
#define HALF_MIN half(-65504)
#define HALF2_MIN half2(-65504, -65504)
#define MNN_CUDA_HALF2_MAX(a, b) \
do { \
(a).x = __hgt((a).x, (b).x) ? (a).x : (b).x; \
(a).y = __hgt((a).y, (b).y) ? (a).y : (b).y; \
} while (0)
__global__ void maxpool_halfC16(const half* uInput, half* uOutput,
int bc,
int ih, int iw,
int oh, int ow,
int padX, int padY,
int kernelX, int kernelY,
int strideX, int strideY
) {
int total = bc * oh * ow * 8;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
int x = i % ow;
int tmp = i / ow;
int y = tmp % oh;
int z = tmp / oh;
int zC = z / 8;
int zR = z % 8;
int ix = x * strideX - padX;
int iy = y * strideY - padY;
int sx = max(0, -ix);
int sy = max(0, -iy);
int ex = min(kernelX, iw - ix);
int ey = min(kernelY, ih - iy);
T sumValue = (T)0;
float div = (float)(ey-sy)* (float)(ex-sx);
half2 sumValue = HALF2_MIN;
for (int fy=sy; fy<ey; ++fy) {
for (int fx=sx; fx<ex; ++fx)
{
for (int fx=sx; fx<ex; ++fx) {
int currentX = ix + fx;
int currentY = iy + fy;
T inputColor = uInput[0
+ z * iw * ih
+ currentY * iw
+ currentX
];
sumValue = sumValue + inputColor;
const half2* input = (const half2*)(uInput
+ zR * 2
+ currentX * 16
+ currentY * iw * 16
+ zC * iw * ih * 16
);
half2 inputV = *input;
MNN_CUDA_HALF2_MAX(sumValue, inputV);
}
}
uOutput[0
+ z * ow * oh
+ y * ow
+ x
] = sumValue / ((T)(ey-sy)*(T)(ex-sx));
half2* dst = (half2*)(uOutput
+ zC * ow * oh * 16
+ y * ow * 16
+ x * 16
+ zR * 2
);
*dst = sumValue;
}
}
template <typename T>
__global__ void maxpool(const T* uInput, T* uOutput,
int bc,
int ih, int iw,
int oh, int ow,
int padX, int padY,
int kernelX, int kernelY,
int strideX, int strideY
) {
int total = bc * oh * ow;
__global__ void avgpool_halfC16(const half* uInput, half* uOutput,
int bc,
int ih, int iw,
int oh, int ow,
int padX, int padY,
int kernelX, int kernelY,
int strideX, int strideY
) {
int total = bc * oh * ow * 8;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
int x = i % ow;
int tmp = i / ow;
int y = tmp % oh;
int z = tmp / oh;
int zC = z / 8;
int zR = z % 8;
int ix = x * strideX - padX;
int iy = y * strideY - padY;
int sx = max(0, -ix);
int sy = max(0, -iy);
int ex = min(kernelX, iw - ix);
int ey = min(kernelY, ih - iy);
T maxValue = (T)(-1000000);
float div = (float)(ey-sy)* (float)(ex-sx);
half2 sumValue = half2(0.0f, 0.0f);
half2 mulValue = half2(1.0f / div, 1.0f/div);
for (int fy=sy; fy<ey; ++fy) {
for (int fx=sx; fx<ex; ++fx)
{
for (int fx=sx; fx<ex; ++fx) {
int currentX = ix + fx;
int currentY = iy + fy;
T inputColor = uInput[0
+ z * iw * ih
+ currentY * iw
+ currentX
];
maxValue = max(inputColor, maxValue);
const half2* input = (const half2*)(uInput
+ zR * 2
+ currentX * 16
+ currentY * iw * 16
+ zC * iw * ih * 16
);
sumValue = __hadd2(sumValue, (*input) * mulValue);
}
}
uOutput[0
+ z * ow * oh
+ y * ow
+ x
] = maxValue;
half2* dst = (half2*)(uOutput
+ zC * ow * oh * 16
+ y * ow * 16
+ x * 16
+ zR * 2
);
*dst = sumValue;
}
}
__global__ void maxpool_floatC16(const float* uInput, float* uOutput,
int bc,
int ih, int iw,
int oh, int ow,
int padX, int padY,
int kernelX, int kernelY,
int strideX, int strideY
) {
int total = bc * oh * ow * PACK_NUMBER;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
int x = i % ow;
int tmp = i / ow;
int y = tmp % oh;
int z = tmp / oh;
int zC = z / PACK_NUMBER;
int zR = z % PACK_NUMBER;
int ix = x * strideX - padX;
int iy = y * strideY - padY;
int sx = max(0, -ix);
int sy = max(0, -iy);
int ex = min(kernelX, iw - ix);
int ey = min(kernelY, ih - iy);
float maxValue = -FLT_MAX;
for (int fy=sy; fy<ey; ++fy) {
for (int fx=sx; fx<ex; ++fx) {
int currentX = ix + fx;
int currentY = iy + fy;
const float* input = (const float*)(uInput
+ zR
+ currentX * PACK_NUMBER
+ currentY * iw * PACK_NUMBER
+ zC * iw * ih * PACK_NUMBER
);
maxValue = max(maxValue, *input);
}
}
float* dst = (float*)(uOutput
+ zC * ow * oh * PACK_NUMBER
+ y * ow * PACK_NUMBER
+ x * PACK_NUMBER
+ zR
);
*dst = maxValue;
}
}
__global__ void avgpool_floatC16(const float* uInput, float* uOutput,
int bc,
int ih, int iw,
int oh, int ow,
int padX, int padY,
int kernelX, int kernelY,
int strideX, int strideY
) {
int total = bc * oh * ow * PACK_NUMBER;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
int x = i % ow;
int tmp = i / ow;
int y = tmp % oh;
int z = tmp / oh;
int zC = z / PACK_NUMBER;
int zR = z % PACK_NUMBER;
int ix = x * strideX - padX;
int iy = y * strideY - padY;
int sx = max(0, -ix);
int sy = max(0, -iy);
int ex = min(kernelX, iw - ix);
int ey = min(kernelY, ih - iy);
float div = (float)(ey-sy)* (float)(ex-sx);
float sumValue = 0.0f;
float mulValue = 1.0f/div;
for (int fy=sy; fy<ey; ++fy) {
for (int fx=sx; fx<ex; ++fx) {
int currentX = ix + fx;
int currentY = iy + fy;
const float* input = (const float*)(uInput
+ zR
+ currentX * PACK_NUMBER
+ currentY * iw * PACK_NUMBER
+ zC * iw * ih * PACK_NUMBER
);
sumValue = sumValue + (*input) * mulValue;
}
}
float* dst = (float*)(uOutput
+ zC * ow * oh * PACK_NUMBER
+ y * ow * PACK_NUMBER
+ x * PACK_NUMBER
+ zR
);
*dst = sumValue;
}
}
ErrorCode PoolExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto layer = mParameter;
int strideWidth = layer->strideX();
@ -128,34 +247,62 @@ ErrorCode PoolExecution::onResize(const std::vector<Tensor *> &inputs, const std
ErrorCode PoolExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto iw = inputs[0]->width();
auto ih = inputs[0]->height();
auto bc = inputs[0]->batch() * inputs[0]->channel();
auto bc = inputs[0]->batch() * UP_DIV(inputs[0]->channel(), PACK_NUMBER);
auto ow = outputs[0]->width();
auto oh = outputs[0]->height();
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
int block_num = runtime->blocks_num(bc * ow * oh);
int threads_num = runtime->threads_num();
auto& prop = runtime->prop();
int threads_num = prop.maxThreadsPerBlock;
int block_num = prop.multiProcessorCount;
if (static_cast<CUDABackend*>(backend())->useFp16()) {
auto inputPtr = (const half*)inputs[0]->deviceId();
auto outputPtr = (half*)outputs[0]->deviceId();
switch (mPoolType) {
case PoolType_AVEPOOL:
avgpool_halfC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
bc,
ih, iw,
oh, ow,
mPaddings[0], mPaddings[1],
mKernels[0], mKernels[1],
mStrides[0], mStrides[1]
);
return NO_ERROR;
case PoolType_MAXPOOL:
maxpool_halfC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
bc,
ih, iw,
oh, ow,
mPaddings[0], mPaddings[1],
mKernels[0], mKernels[1],
mStrides[0], mStrides[1]
);
return NO_ERROR;
}
return NO_ERROR;
}
auto inputPtr = (const float*)inputs[0]->deviceId();
auto outputPtr = (float*)outputs[0]->deviceId();
switch (mPoolType) {
case PoolType_AVEPOOL:
avgpool<<<block_num, threads_num>>>(inputPtr, outputPtr,
avgpool_floatC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
bc,
ih, iw,
oh, ow,
mPaddings[0], mPaddings[1],
mKernels[0], mKernels[1],
mStrides[0], mStrides[1]
);
);
return NO_ERROR;
case PoolType_MAXPOOL:
maxpool<<<block_num, threads_num>>>(inputPtr, outputPtr,
maxpool_floatC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
bc,
ih, iw,
oh, ow,
mPaddings[0], mPaddings[1],
mKernels[0], mKernels[1],
mStrides[0], mStrides[1]
);
);
return NO_ERROR;
}
return NOT_SUPPORT;
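The C16 kernels above address a channel-packed layout. A small host-side sketch of that addressing (helper and constant names are assumptions; block and lane correspond to zC and zR in the kernels):
#include <cstdio>

constexpr int kPack = 16; // PACK_NUMBER

// Offset of one value in the packed buffer: block = combined batch/channel-block index,
// lane = channel within the block, (y, x) = spatial position in an h x w map.
inline int packedOffset(int block, int lane, int y, int x, int h, int w) {
    return block * h * w * kPack + y * w * kPack + x * kPack + lane;
}

int main() {
    std::printf("%d\n", packedOffset(1, 3, 2, 1, 4, 4)); // 256 + 128 + 16 + 3 = 403
    return 0;
}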

View File

@ -1,89 +1,22 @@
#include "Raster.cuh"
#include "TensorflowOp_generated.h"
#include <cuda_fp16.h>
#include "MNNCUDAFunction.cuh"
namespace MNN {
namespace CUDA {
template <typename T>
__global__ void pack_c4(const T *input, T *output, int inside, int axis, int outside, int axisC4) {
int total = inside * axis * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
int x = i % inside;
int tmp = i / inside;
int y = tmp % axis;
int z = tmp / axis;
int y4 = y / 4;
int yR = y % 4;
int dstOffset = 4 * (z * axisC4 * inside + y4 * inside + x) + yR;
output[dstOffset] = input[i];
}
}
void PackC4(uint8_t* output, const uint8_t* input, int inside, int axis, int outside, int bytes, CUDARuntime* runtime) {
auto packAxis = (axis + 3) / 4;
if (axis % 4 != 0) {
runtime->memset(output, 0, inside * packAxis * 4 * outside * bytes);
}
int block_num = runtime->blocks_num(inside * axis * outside);
int threads_num = runtime->threads_num();
switch (bytes) {
case 4:
pack_c4<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside, packAxis);
break;
case 2:
pack_c4<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output, inside, axis, outside, packAxis);
break;
case 1:
pack_c4<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output, inside, axis, outside, packAxis);
break;
default:
break;
}
}
template <typename T>
__global__ void unpack_c4(const T *input, T *output, int inside, int axis, int outside, int axisC4) {
int total = inside * axis * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
int x = i % inside;
int tmp = i / inside;
int y = tmp % axis;
int z = tmp / axis;
int y4 = y / 4;
int yR = y % 4;
int srcOffset = 4 * (z * axisC4 * inside + y4 * inside + x) + yR;
output[i] = input[srcOffset];
}
}
void UnpackC4(uint8_t* output, const uint8_t* input, int inside, int axis, int outside, int bytes, CUDARuntime* runtime) {
auto packAxis = (axis + 3) / 4;
int block_num = runtime->blocks_num(inside * axis * outside);
int threads_num = runtime->threads_num();
switch (bytes) {
case 4:
unpack_c4<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside, packAxis);
break;
case 2:
unpack_c4<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output, inside, axis, outside, packAxis);
break;
case 1:
unpack_c4<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output, inside, axis, outside, packAxis);
break;
default:
break;
}
}
// Blit doesn't care about offsets
template <typename T>
__global__ void blitRegion(const T *inputO, T *outputO,
int loopCount,
const int32_t* dstIndice, const int32_t* srcIndice,
int dstUseIndice, int srcUseIndice,
int dstStep, int srcStep,int srcLimit,
int sizeZ, int sizeY, int sizeX,
int strideZ, int strideY, int strideX,
int dstStrideZ, int dstStrideY, int dstStrideX
) {
int loopCount,
const int32_t* dstIndice, const int32_t* srcIndice,
int dstUseIndice, int srcUseIndice,
int dstStep, int srcStep,int srcLimit,
int sizeZ, int sizeY, int sizeX,
int strideZ, int strideY, int strideX,
int dstStrideZ, int dstStrideY, int dstStrideX
) {
int total = loopCount;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
int srcOffsetO = i * srcStep;
@ -162,29 +95,66 @@ void BlitWithIndice(uint8_t* output, const uint8_t* input, const int32_t* dstInd
#define UNARY_FUNC(Name, Func)\
template<typename T>\
__global__ void Name(const T *input, T *output,\
int sizeZ, int sizeY, int sizeX,\
int count,\
DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,\
int strideZ, int strideY, int strideX,\
int dstStrideZ, int dstStrideY, int dstStrideX\
) { \
int count = sizeZ * sizeY * sizeX;\
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
int total = sizeZ * sizeY * sizeX;\
int ix = i % sizeX;\
int tmp = i / sizeX;\
int iy = tmp % sizeY;\
int iz = tmp / sizeY;\
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {\
int ix, tmp, iy, iz;\
sizeX.divmod(i, tmp, ix);\
sizeY.divmod(tmp, iz, iy);\
int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
T x = input[srcOffset];\
output[dstOffset] = Func;\
}\
}\
template<typename T>\
__global__ void FLOAT##Name(const T *input, T *output,\
int count,\
DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,\
int strideZ, int strideY, int strideX,\
int dstStrideZ, int dstStrideY, int dstStrideX\
) { \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {\
int ix, tmp, iy, iz;\
sizeX.divmod(i, tmp, ix);\
sizeY.divmod(tmp, iz, iy);\
int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
float x = (float)input[srcOffset];\
output[dstOffset] = (float)(Func);\
}\
}\
template<typename T>
__global__ void blit_2(const T *input, T *output,
int count,
DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
int strideZ, int strideY,
int dstStrideZ, int dstStrideY
) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
int ix, tmp, iy, iz;
sizeX.divmod(i, tmp, ix);
sizeY.divmod(tmp, iz, iy);
int srcOffset = iz * strideZ + iy * strideY + (ix << 1);
int dstOffset = iz * dstStrideZ + iy * dstStrideY + (ix << 1);
int2 * dstF = (int2 *)(output+dstOffset);
dstF[0] = ((int2 *)(input+srcOffset))[0];
}
}
struct Bytes512 {
int4 x[4];
};
UNARY_FUNC(blit, x);
UNARY_FUNC(ABS, abs(x));
UNARY_FUNC(EXP, exp(x));
UNARY_FUNC(NEG, -x);
UNARY_FUNC(RECIPROCAL, (T)(1.0)/x);
UNARY_FUNC(RECIPROCAL, (1.0)/x);
UNARY_FUNC(FLOOR, floor(x));
UNARY_FUNC(CEIL, ceil(x));
UNARY_FUNC(SQUARE, x*x);
@ -212,27 +182,68 @@ UNARY_FUNC(HARDSWISH, 1.0/6.0 * x * min(max(x+3.0, 0.0), 6.0));
UNARY_FUNC(ERF, erf(x));
UNARY_FUNC(ERFC, erfc(x));
UNARY_FUNC(ERFINV, erfinv(x));
UNARY_FUNC(GELU, (1.0f + tanh(0.79788458f * (0.044715f * x * x * x + x))) * x * 0.5f);
UNARY_FUNC(GELU_STANDARD, (erf(x*0.7071067932881648f)+1.f)*x*0.5);
void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime) {
int count = size[0] * size[1] * size[2];
DivModFast sz(size[0]);
DivModFast sy(size[1]);
DivModFast sx(size[2]);
//printf("%d-%d-%d, %d-%d-%d,-%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
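// Fast path: for 4-byte elements with a large, even-sized, unit-stride innermost
// dimension, each thread copies two floats at once through a single int2 load/store.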
if(bytes == 4 && count > 16384 && size[2] % 2 == 0 && srcStride[2] == 1 && dstStride[2] == 1) {
//printf("%d-%d-%d, %d-%d-%d,-%d-%d-%d\n\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
count /= 2;
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
DivModFast sx_2((size[2]/2));
blit_2<<<block_num, threads_num>>>((const float*)input, (float*)output,
count,
sz, sy, sx_2,
srcStride[0], srcStride[1],
dstStride[0], dstStride[1]);
return;
}
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
switch (bytes) {
case 64:
blit<<<block_num, threads_num>>>((const Bytes512*)input, (Bytes512*)output,
count,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 32:
blit<<<block_num, threads_num>>>((const double4*)input, (double4*)output,
count,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 4:
blit<<<block_num, threads_num>>>((const float*)input, (float*)output,
size[0], size[1], size[2],
count,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 2:
blit<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output,
size[0], size[1], size[2],
count,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 1:
blit<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output,
size[0], size[1], size[2],
count,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
@ -241,59 +252,131 @@ void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, cons
}
}
template<typename T>
__global__ void fuseblit(const T *input, T *output,
int fuseNum, const int32_t* sliceOffset,
int sizeZ, int sizeY, int sizeX,
int strideZ, int strideY, int strideX,
int dstStrideZ, int dstStrideY, int dstStrideX
) {
int count = fuseNum*sizeZ * sizeY * sizeX;
for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < (count); c += blockDim.x * gridDim.x) {
int j = c / (sizeZ * sizeY * sizeX);
int i = c % (sizeZ * sizeY * sizeX);
int ix = i % sizeX;
int tmp = i / sizeX;
int iy = tmp % sizeY;
int iz = tmp / sizeY;
template<typename T0, typename T1>
__global__ void fuseblit(const T0 *input, T1 *output,
int fuseNum, int count, const int32_t* sliceOffset,
DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
int strideZ, int strideY, int strideX,
int dstStrideZ, int dstStrideY, int dstStrideX
) {
size_t c = blockIdx.x * blockDim.x + threadIdx.x;
for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < count; c += blockDim.x * gridDim.x) {
int ix, tmp, iy, tmp2, iz, j;
sizeX.divmod(c, tmp, ix);
sizeY.divmod(tmp, tmp2, iy);
sizeZ.divmod(tmp2, j, iz);
int src_offset = sliceOffset[j] + iz * strideZ + iy * strideY + ix * strideX;
int dst_offset = sliceOffset[fuseNum+j] + iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;
output[dst_offset] = input[src_offset];
}
}
template<typename T0, typename T1>
__global__ void fuseblit_4(const T0 *input, T1 *output,
int fuseNum, int count, const int32_t* sliceOffset,
DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
int strideZ, int strideY,
int dstStrideZ, int dstStrideY
) {
for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < count; c += blockDim.x * gridDim.x) {
int ix, tmp, iy, tmp2, iz, j;
sizeX.divmod(c, tmp, ix);
sizeY.divmod(tmp, tmp2, iy);
sizeZ.divmod(tmp2, j, iz);
int src_offset = sliceOffset[j] + iz * strideZ + iy * strideY + (ix << 2);
int dst_offset = sliceOffset[fuseNum+j] + iz * dstStrideZ + iy * dstStrideY + (ix << 2);
int4* srcF = (int4 *)(input + src_offset);
int4* dstF = (int4 *)(output + dst_offset);
dstF[0] = srcF[0];
}
}
template<typename T0, typename T1>
__global__ void fuseblit_half_4(const T0 *input, T1 *output,
int fuseNum, int count, const int32_t* sliceOffset,
DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
int strideZ, int strideY,
int dstStrideZ, int dstStrideY
) {
for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < count; c += blockDim.x * gridDim.x) {
int ix, tmp, iy, tmp2, iz, j;
sizeX.divmod(c, tmp, ix);
sizeY.divmod(tmp, tmp2, iy);
sizeZ.divmod(tmp2, j, iz);
int src_offset = sliceOffset[j] + iz * strideZ + iy * strideY + (ix << 2);
int dst_offset = sliceOffset[fuseNum+j] + iz * dstStrideZ + iy * dstStrideY + (ix << 2);
int2* srcF = (int2 *)(input + src_offset);
int2* dstF = (int2 *)(output + dst_offset);
dstF[0] = srcF[0];
}
}
void FuseRasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int fuseNum, void* sliceOffset, int bytes, CUDARuntime* runtime) {
int count = size[0] * size[1] * size[2];
DivModFast sz(size[0]);
DivModFast sy(size[1]);
DivModFast sx(size[2]);
int count = fuseNum * size[0] * size[1] * size[2];
if(size[2] % 4 == 0 && count > 16384 && srcStride[2] == 1 && dstStride[2] == 1) {
//printf("%d-%d-%d, %d-%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], dstStride[0], dstStride[1]);
int count = fuseNum * size[0] * size[1] * size[2] / 4;
int numBlocks = runtime->blocks_num(count);
int threadsPerBlock = runtime->threads_num();
DivModFast sx_4((size[2]/4));
if(bytes == 4) {
fuseblit_4<<<numBlocks, threadsPerBlock>>>((const float*)input, (float*)output,
fuseNum, count, (const int32_t*)sliceOffset,
sz, sy, sx_4,
srcStride[0], srcStride[1],
dstStride[0], dstStride[1]);
return;
} else if(bytes == 2){
fuseblit_half_4<<<numBlocks, threadsPerBlock>>>((const half*)input, (half*)output,
fuseNum, count, (const int32_t*)sliceOffset,
sz, sy, sx_4,
srcStride[0], srcStride[1],
dstStride[0], dstStride[1]);
return;
}
}
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
int numBlocks = block_num;
int threadsPerBlock = threads_num;
// dim3 numBlocks(block_num, fuseNum);
// dim3 threadsPerBlock(threads_num, 1);
switch (bytes) {
case 64:
fuseblit<<<block_num, threads_num>>>((const Bytes512*)input, (Bytes512*)output,
fuseNum, count, (const int32_t*)sliceOffset,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 16:
fuseblit<<<block_num, threads_num>>>((const int4*)input, (int4*)output,
fuseNum, count, (const int32_t*)sliceOffset,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 4:
fuseblit<<<numBlocks, threadsPerBlock>>>((const float*)input, (float*)output,
fuseNum, (const int32_t*)sliceOffset,
size[0], size[1], size[2],
fuseblit<<<block_num, threads_num>>>((const float*)input, (float*)output,
fuseNum, count, (const int32_t*)sliceOffset,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 2:
fuseblit<<<numBlocks, threadsPerBlock>>>((const int16_t*)input, (int16_t*)output,
fuseNum, (const int32_t*)sliceOffset,
size[0], size[1], size[2],
fuseblit<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output,
fuseNum, count, (const int32_t*)sliceOffset,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 1:
fuseblit<<<numBlocks, threadsPerBlock>>>((const int8_t*)input, (int8_t*)output,
fuseNum, (const int32_t*)sliceOffset,
size[0], size[1], size[2],
fuseblit<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output,
fuseNum, count, (const int32_t*)sliceOffset,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
@ -303,18 +386,112 @@ void FuseRasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size,
//printf("%s, %d-%d-%d-%d\n", cudaGetErrorString(cudaGetLastError()), numBlocks.x, numBlocks.y, threadsPerBlock.x, threadsPerBlock.y);
}
template<typename T0, typename T1>
__global__ void fuseblitLimit(const T0 *input, T1 *output,
const FuseRegion* info, const int32_t* sliceOffset
) {
int sizeZ = info->size[0];
int sizeY = info->size[1];
int sizeX = info->size[2];
int strideZ = info->srcStride[0];
int strideY = info->srcStride[1];
int strideX = info->srcStride[2];
int dstStrideZ = info->dstStride[0];
int dstStrideY = info->dstStride[1];
int dstStrideX = info->dstStride[2];
int fuseNum = info->fuseNumber;
int count = fuseNum*sizeZ * sizeY * sizeX;
for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < (count); c += blockDim.x * gridDim.x) {
int j = c / (sizeZ * sizeY * sizeX);
int i = c % (sizeZ * sizeY * sizeX);
int ix = i % sizeX;
int tmp = i / sizeX;
int iy = tmp % sizeY;
int iz = tmp / sizeY;
const int* srcOffsetPtr = sliceOffset + 8 * j;
const int* dstOffsetPtr = sliceOffset + 8 * j + 4;
T0 srcValue = (T0)0;
int src_offset = srcOffsetPtr[3] + iz * strideZ + iy * strideY + ix * strideX;
if (srcOffsetPtr[0] > iz && srcOffsetPtr[1] > iy && srcOffsetPtr[2] > ix) {
srcValue = input[src_offset];
}
int dst_offset = dstOffsetPtr[3] + iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;
//printf("%d -> %d - %f\n", src_offset, dst_offset, srcValue);
if (dstOffsetPtr[0] > iz && dstOffsetPtr[1] > iy && dstOffsetPtr[2] > ix) {
output[dst_offset] = srcValue;
}
}
}
void FuseRasterBlitFloatToHalf(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime) {
auto& prop = runtime->prop();
int threads_num = prop.maxThreadsPerBlock;
int block_num = prop.multiProcessorCount;
fuseblitLimit<<<block_num, threads_num>>>((const float*)input, (half*)output,
info, (const int32_t*)sliceOffset);
}
void FuseRasterBlitHalfToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime) {
auto& prop = runtime->prop();
int threads_num = prop.maxThreadsPerBlock;
int block_num = prop.multiProcessorCount;
fuseblitLimit<<<block_num, threads_num>>>((const half*)input, (float*)output,
info, (const int32_t*)sliceOffset);
}
void FuseRasterBlitFloatToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime) {
auto& prop = runtime->prop();
int threads_num = prop.maxThreadsPerBlock;
int block_num = prop.multiProcessorCount;
fuseblitLimit<<<block_num, threads_num>>>((const float*)input, (float*)output,
info, (const int32_t*)sliceOffset);
}
void FuseRasterBlitCommon(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime, int bytes) {
auto& prop = runtime->prop();
int threads_num = prop.maxThreadsPerBlock;
int block_num = prop.multiProcessorCount;
switch (bytes) {
case 4:
fuseblitLimit<<<block_num, threads_num>>>((const float*)input, (float*)output,
info, (const int32_t*)sliceOffset);
break;
case 2:
fuseblitLimit<<<block_num, threads_num>>>((const half*)input, (half*)output,
info, (const int32_t*)sliceOffset);
break;
case 1:
fuseblitLimit<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output,
info, (const int32_t*)sliceOffset);
break;
default:
break;
}
}
void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
int count = size[0] * size[1] * size[2];
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
DivModFast sz(size[0]);
DivModFast sy(size[1]);
DivModFast sx(size[2]);
// TODO: Support FP16
MNN_ASSERT(bytes==4);
#define COMPUTE(TYPE)\
if (opType == MNN::UnaryOpOperation_##TYPE ) {\
TYPE<<<block_num, threads_num>>>((const float*)input, (float*)output,\
size[0], size[1], size[2],\
if(bytes==2) {\
FLOAT##TYPE<<<block_num, threads_num>>>((const half*)input, (half*)output,\
count, \
sz, sy, sx,\
srcStride[0], srcStride[1], srcStride[2],\
dstStride[0], dstStride[1], dstStride[2]);\
} else {\
TYPE<<<block_num, threads_num>>>((const float*)input, (float*)output,\
count, \
sz, sy, sx,\
srcStride[0], srcStride[1], srcStride[2],\
dstStride[0], dstStride[1], dstStride[2]);\
}\
return;\
}\
@ -330,6 +507,8 @@ void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const
COMPUTE(SIN);
COMPUTE(COS);
COMPUTE(TAN);
COMPUTE(GELU);
COMPUTE(GELU_STANDARD);
COMPUTE(ASIN);
COMPUTE(ACOS);
COMPUTE(ATAN);
@ -356,26 +535,126 @@ void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const
#define BINARY_FUNC(Name, Func)\
template<typename TIn, typename TOut>\
__global__ void Binary##Name(\
const TIn *input0, const TIn* input1, TOut *output,\
int sizeZ, int sizeY, int sizeX,\
int strideZ, int strideY, int strideX,\
int strideZ1, int strideY1, int strideX1,\
int dstStrideZ, int dstStrideY, int dstStrideX\
) { \
int count = sizeZ * sizeY * sizeX;\
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
int total = sizeZ * sizeY * sizeX;\
int ix = i % sizeX;\
int tmp = i / sizeX;\
int iy = tmp % sizeY;\
int iz = tmp / sizeY;\
int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
int srcOffset1 = iz * strideZ1 + iy * strideY1 + ix * strideX1;\
int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
TIn x = input0[srcOffset];\
TIn y = input1[srcOffset1];\
output[dstOffset] = (TOut)Func;\
}\
const TIn *input0, const TIn* input1, TOut *output,\
int sizeZ, int sizeY, int sizeX,\
int strideZ, int strideY, int strideX,\
int strideZ1, int strideY1, int strideX1,\
int dstStrideZ, int dstStrideY, int dstStrideX\
) { \
int count = sizeZ * sizeY * sizeX;\
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
int total = sizeZ * sizeY * sizeX;\
int ix = i % sizeX;\
int tmp = i / sizeX;\
int iy = tmp % sizeY;\
int iz = tmp / sizeY;\
int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
int srcOffset1 = iz * strideZ1 + iy * strideY1 + ix * strideX1;\
int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
TIn x = input0[srcOffset];\
TIn y = input1[srcOffset1];\
output[dstOffset] = (TOut)Func;\
}\
}\
#define BINARY_FUNC_FLOATMID(Name, Func)\
template<typename TIn, typename TOut>\
__global__ void BinaryMid##Name(\
const TIn *input0, const TIn* input1, TOut *output,\
int sizeZ, int sizeY, int sizeX,\
int strideZ, int strideY, int strideX,\
int strideZ1, int strideY1, int strideX1,\
int dstStrideZ, int dstStrideY, int dstStrideX\
) { \
int count = sizeZ * sizeY * sizeX;\
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
int total = sizeZ * sizeY * sizeX;\
int ix = i % sizeX;\
int tmp = i / sizeX;\
int iy = tmp % sizeY;\
int iz = tmp / sizeY;\
int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
int srcOffset1 = iz * strideZ1 + iy * strideY1 + ix * strideX1;\
int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
float x = input0[srcOffset];\
float y = input1[srcOffset1];\
output[dstOffset] = (TOut)(Func);\
}\
}\
template<typename TIn, typename TOut>\
__global__ void BinaryMidLinear##Name(\
const TIn *input0, const TIn* input1, TOut *output,\
int sizeZ,\
int strideZ,\
int strideZ1,\
int dstStrideZ\
) { \
int count = sizeZ;\
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
int iz = i;\
int srcOffset = iz * strideZ;\
int srcOffset1 = iz * strideZ1;\
int dstOffset = iz * dstStrideZ;\
float x = input0[srcOffset];\
float y = input1[srcOffset1];\
output[dstOffset] = (TOut)(Func);\
}\
}\
#define BINARY_FUNC_FLOATMID4(Name, Func)\
template<typename TIn, typename TOut>\
__global__ void BinaryMidLinear4_##Name(\
const TIn *input0, const TIn* input1, TOut *output,\
int count_4\
) { \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count_4); i += blockDim.x * gridDim.x) {\
int iz = i;\
int srcOffset = iz << 2;\
int srcOffset1 = iz << 2;\
int dstOffset = iz << 2;\
float4 xx = ((float4 *)(input0+srcOffset))[0];\
float4 yy = ((float4 *)(input1+srcOffset1))[0];\
float x = xx.x;\
float y = yy.x;\
output[dstOffset] = (TOut)(Func);\
x = xx.y;\
y = yy.y;\
output[dstOffset+1] = (TOut)(Func);\
x = xx.z;\
y = yy.z;\
output[dstOffset+2] = (TOut)(Func);\
x = xx.w;\
y = yy.w;\
output[dstOffset+3] = (TOut)(Func);\
}\
}\
template<typename TIn, typename TOut>\
__global__ void BinaryMidLinearHalf4_##Name(\
const TIn *input0, const TIn* input1, TOut *output,\
int count_4\
) { \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count_4); i += blockDim.x * gridDim.x) {\
int iz = i;\
int srcOffset = iz << 2;\
int srcOffset1 = iz << 2;\
int dstOffset = iz << 2;\
half2 xx = ((half2 *)(input0+srcOffset))[0];\
half2 yy = ((half2 *)(input1+srcOffset1))[0];\
float x = (float)xx.x;\
float y = (float)yy.x;\
output[dstOffset] = (TOut)(Func);\
x = (float)xx.y;\
y = (float)yy.y;\
output[dstOffset+1] = (TOut)(Func);\
xx = ((half2 *)(input0+srcOffset))[1];\
yy = ((half2 *)(input1+srcOffset1))[1];\
x = (float)xx.x;\
y = (float)yy.x;\
output[dstOffset+2] = (TOut)(Func);\
x = (float)xx.y;\
y = (float)yy.y;\
output[dstOffset+3] = (TOut)(Func);\
}\
}\
#define sign(y) ((y) > 0 ? 1 : ((y) < 0 ? -1 : 0))
@ -398,44 +677,107 @@ BINARY_FUNC(FLOORMOD, x - floor(x / y) * y);
BINARY_FUNC(SquaredDifference, (x-y)*(x-y));
BINARY_FUNC(POW, pow(x, y));
BINARY_FUNC(ATAN2, atan2(x, y));
BINARY_FUNC(MOD, x - x / y);
BINARY_FUNC(MOD, (x % y));
BINARY_FUNC(LOGICALOR, (x || y) ? 1 : 0);
void BinaryBlitTemplateFloat(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
BINARY_FUNC_FLOATMID(ADD, x+y);
BINARY_FUNC_FLOATMID(SUB, x-y);
BINARY_FUNC_FLOATMID(MUL, x*y);
BINARY_FUNC_FLOATMID(DIV, x/y);
BINARY_FUNC_FLOATMID(REALDIV, (float)sign(y) * x / max(abs(y), 0.0000001));
BINARY_FUNC_FLOATMID(MINIMUM, min(x, y));
BINARY_FUNC_FLOATMID(MAXIMUM, max(x, y));
BINARY_FUNC_FLOATMID(GREATER, x > y ? 1 : 0);
BINARY_FUNC_FLOATMID(LESS, x < y ? 1 : 0);
BINARY_FUNC_FLOATMID(LESS_EQUAL, x <= y ? 1 : 0);
BINARY_FUNC_FLOATMID(GREATER_EQUAL, x >= y ? 1 : 0);
BINARY_FUNC_FLOATMID(EQUAL, x == y ? 1 : 0);
BINARY_FUNC_FLOATMID(NOTEQUAL, x != y ? 1 : 0);
BINARY_FUNC_FLOATMID(FLOORDIV, floor(x / y));
BINARY_FUNC_FLOATMID(FLOORMOD, x - floor(x / y) * y);
BINARY_FUNC_FLOATMID(SquaredDifference, (x-y)*(x-y));
BINARY_FUNC_FLOATMID(POW, pow(x, y));
BINARY_FUNC_FLOATMID(ATAN2, atan2(x, y));
BINARY_FUNC_FLOATMID(MOD, fmod(x, y));
BINARY_FUNC_FLOATMID(LOGICALOR, (x || y) ? 1 : 0);
BINARY_FUNC_FLOATMID4(ADD, x+y);
BINARY_FUNC_FLOATMID4(SUB, x-y);
BINARY_FUNC_FLOATMID4(MUL, x*y);
BINARY_FUNC_FLOATMID4(DIV, x/y);
BINARY_FUNC_FLOATMID4(REALDIV, (float)sign(y) * x / max(abs(y), 0.0000001));
BINARY_FUNC_FLOATMID4(MINIMUM, min(x, y));
BINARY_FUNC_FLOATMID4(MAXIMUM, max(x, y));
BINARY_FUNC_FLOATMID4(GREATER, x > y ? 1 : 0);
BINARY_FUNC_FLOATMID4(LESS, x < y ? 1 : 0);
BINARY_FUNC_FLOATMID4(LESS_EQUAL, x <= y ? 1 : 0);
BINARY_FUNC_FLOATMID4(GREATER_EQUAL, x >= y ? 1 : 0);
BINARY_FUNC_FLOATMID4(EQUAL, x == y ? 1 : 0);
BINARY_FUNC_FLOATMID4(NOTEQUAL, x != y ? 1 : 0);
BINARY_FUNC_FLOATMID4(FLOORDIV, floor(x / y));
BINARY_FUNC_FLOATMID4(FLOORMOD, x - floor(x / y) * y);
BINARY_FUNC_FLOATMID4(SquaredDifference, (x-y)*(x-y));
BINARY_FUNC_FLOATMID4(POW, pow(x, y));
BINARY_FUNC_FLOATMID4(ATAN2, atan2(x, y));
BINARY_FUNC_FLOATMID4(MOD, fmod(x, y));
BINARY_FUNC_FLOATMID4(LOGICALOR, (x || y) ? 1 : 0);
template<typename T>
void BinaryBlitTemplateFloat(T* output, const T* input, const T* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
int count = size[0] * size[1] * size[2];
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
// TODO: Support FP16
MNN_ASSERT(bytes==4);
#define COMPUTE_FLOAT(TYPE, TOut)\
if (opType == MNN::BinaryOpOperation_##TYPE ) {\
Binary##TYPE<<<block_num, threads_num>>>((const float*)input, (const float*)(input1), (TOut*)output,\
size[0], size[1], size[2],\
srcStride[0], srcStride[1], srcStride[2],\
srcStride1[0], srcStride1[1], srcStride1[2],\
dstStride[0], dstStride[1], dstStride[2]);\
return;\
}\
if (opType == MNN::BinaryOpOperation_##TYPE ) {\
if (size[2] == count) {\
if(count % 4 == 0 && count > 16384 && srcStride[2] == 1 && srcStride1[2] == 1 && dstStride[2] == 1) {\
block_num = runtime->blocks_num(count/4);\
threads_num = runtime->threads_num();\
if(bytes == 4) {\
BinaryMidLinear4_##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
count/4);\
} else {\
BinaryMidLinearHalf4_##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
count/4);\
}\
} else {\
BinaryMidLinear##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
size[2],\
srcStride[2],\
srcStride1[2],\
dstStride[2]);\
}\
} else {\
BinaryMid##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
size[0], size[1], size[2],\
srcStride[0], srcStride[1], srcStride[2],\
srcStride1[0], srcStride1[1], srcStride1[2],\
dstStride[0], dstStride[1], dstStride[2]);\
}\
return;\
}\
COMPUTE_FLOAT(ADD, float);
COMPUTE_FLOAT(SUB, float);
COMPUTE_FLOAT(MUL, float);
COMPUTE_FLOAT(DIV, float);
COMPUTE_FLOAT(REALDIV, float);
COMPUTE_FLOAT(MINIMUM, float);
COMPUTE_FLOAT(MAXIMUM, float);
COMPUTE_FLOAT(ADD, T);
COMPUTE_FLOAT(SUB, T);
COMPUTE_FLOAT(MUL, T);
COMPUTE_FLOAT(DIV, T);
COMPUTE_FLOAT(REALDIV, T);
COMPUTE_FLOAT(MINIMUM, T);
COMPUTE_FLOAT(MAXIMUM, T);
COMPUTE_FLOAT(GREATER, int);
COMPUTE_FLOAT(LESS, int);
COMPUTE_FLOAT(LESS_EQUAL, int);
COMPUTE_FLOAT(GREATER_EQUAL, int);
COMPUTE_FLOAT(EQUAL, int);
COMPUTE_FLOAT(NOTEQUAL, int);
COMPUTE_FLOAT(FLOORDIV, float);
COMPUTE_FLOAT(FLOORMOD, float);
COMPUTE_FLOAT(POW, float);
COMPUTE_FLOAT(SquaredDifference, float);
COMPUTE_FLOAT(ATAN2, float);
COMPUTE_FLOAT(MOD, float);
COMPUTE_FLOAT(FLOORDIV, T);
COMPUTE_FLOAT(FLOORMOD, T);
COMPUTE_FLOAT(POW, T);
COMPUTE_FLOAT(SquaredDifference, T);
COMPUTE_FLOAT(ATAN2, T);
COMPUTE_FLOAT(MOD, T);
#undef COMPUTE_FLOAT
}
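// Dispatch summary: for each op, COMPUTE_FLOAT above picks the 4-wide vectorized kernel
// when the blit is purely linear (size[2] == count), contiguous, large and count % 4 == 0
// (float4 loads for 4-byte data, paired half2 loads for 2-byte data); the scalar linear
// kernel when only the innermost dimension is active; and the generic strided 3D kernel
// otherwise.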
void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
@ -472,12 +814,15 @@ void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_
void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType) {
if (type.code == halide_type_float) {
BinaryBlitTemplateFloat(output, input, input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
if (type.bits == 32) {
BinaryBlitTemplateFloat((float*)output, (float*)input, (float*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
} else if (type.bits == 16) {
BinaryBlitTemplateFloat((half*)output, (half*)input, (half*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
}
} else if (type.code == halide_type_int) {
BinaryBlitTemplateInt32(output, input, input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
}
}
}// namespace CUDA
}// namespace MNN

View File

@ -6,11 +6,22 @@ namespace MNN {
namespace CUDA {
void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime);
void FuseRasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int fuseNum, void* sliceOffset, int bytes, CUDARuntime* runtime);
void PackC4(uint8_t* dest, const uint8_t* src, int inside, int axis, int outside, int bytes, CUDARuntime* runtime);
void UnpackC4(uint8_t* dest, const uint8_t* src, int inside, int axis, int outside, int bytes, CUDARuntime* runtime);
void BlitWithIndice(uint8_t* dest, const uint8_t* src, const int32_t* dstIndices, const int32_t* srcIndices, int dstUseIndice, int srcUseIndice, int loopCount, int dstStep, int srcStep, int srcLimit, const Tensor::InsideDescribe::Region& reg, int bytes, CUDARuntime* runtime);
void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType);
void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType);
// sliceOffset layout: 8 * fuseNum int32 values in total, 8 per fused region: first 4 for src (limitX, limitY, limitZ, offset), next 4 for dst
struct FuseRegion {
int32_t size[3] = {1, 1, 1};
int32_t srcStride[3] = {0, 0, 0};
int32_t dstStride[3] = {0, 0, 0};
int fuseNumber = 0;
};
void FuseRasterBlitFloatToHalf(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime);
void FuseRasterBlitHalfToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime);
void FuseRasterBlitFloatToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime);
void FuseRasterBlitCommon(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime, int bytes);
}
}
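Following the layout comment above, sliceOffset carries 8 int32 values per fused region: {limitX, limitY, limitZ, offset} for the source, then the same four for the destination. A host-side packing sketch (function name and use of std::array are assumptions):
#include <array>
#include <cstdint>
#include <vector>

using Limits = std::array<int32_t, 4>; // {limitX, limitY, limitZ, offset}

std::vector<int32_t> packSliceOffset(const std::vector<Limits>& src, const std::vector<Limits>& dst) {
    std::vector<int32_t> out;
    out.reserve(src.size() * 8);
    for (size_t j = 0; j < src.size(); ++j) {
        out.insert(out.end(), src[j].begin(), src[j].end()); // ints 8*j .. 8*j+3 (source)
        out.insert(out.end(), dst[j].begin(), dst[j].end()); // ints 8*j+4 .. 8*j+7 (destination)
    }
    return out; // copy to the GPU before launching the FuseRasterBlit* / fuseblitLimit path
}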

View File

@ -2,35 +2,305 @@
// RasterExecution.cpp
// MNN
//
// Created by MNN on 2020/07/30.
// Created by MNN on 2020/04/02.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "RasterExecution.hpp"
#include "Raster.cuh"
#include "core/Concurrency.h"
#include "core/OpCommonUtils.hpp"
#include "core/BufferAllocator.hpp"
#include "Raster.cuh"
#include "Transpose.cuh"
#include "MNNCUDADefine.hpp"
namespace MNN {
namespace CUDA {
ErrorCode RasterExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
static void getBatchChannelArea(const Tensor* t, int& batch, int& channel, int& area) {
batch = t->batch();
if (t->dimensions() == 4) {
channel = t->channel();
area = t->width() * t->height();
} else if (t->dimensions() == 3) {
auto format = TensorUtils::getDescribe(t)->dimensionFormat;
if (format == MNN_DATA_FORMAT_NHWC) {
channel = t->length(2);
area = t->length(1);
} else {
channel = t->length(1);
area = t->length(2);
}
} else {
auto format = TensorUtils::getDescribe(t)->dimensionFormat;
if (format == MNN_DATA_FORMAT_NHWC) {
for (int i = t->dimensions() - 1; i > 0; i--) {
int len = t->length(i);
if (len > 1) {
if (channel == 1) {
channel = len;
} else {
area *= len;
}
}
}
} else {
for (int i = 1; i < t->dimensions(); i++) {
int len = t->length(i);
if (len > 1) {
if (channel == 1) {
channel = len;
} else {
area *= len;
}
}
}
}
}
}
// Detect if the region is a transpose
static bool _transpose(const Tensor::InsideDescribe::Region& region) {
int srcOne = -1, dstOne = -1;
for (int i = 0; i < 3; i++) {
if (region.src.stride[i] == 1 && region.size[i] != 1) {
if (srcOne >= 0 || region.size[i] < 4) {
return false;
}
srcOne = i;
}
if (region.dst.stride[i] == 1 && region.size[i] != 1) {
if (dstOne >= 0 || region.size[i] < 4) {
return false;
}
dstOne = i;
}
}
return srcOne >= 0 && dstOne >= 0 && srcOne != dstOne;
}
static int _singleConvert(const Tensor::InsideDescribe::Region& region, const Tensor* dest) {
auto origin = region.origin;
auto srcFormat = TensorUtils::getDescribe(origin)->dimensionFormat;
auto dstFormat = TensorUtils::getDescribe(dest)->dimensionFormat;
if (srcFormat == dstFormat) {
return 0;
}
if (0 != region.src.offset || 0 != region.dst.offset) {
return 0;
}
int dstBatch = 1, dstChannel = 1, dstArea = 1,
srcBatch = 1, srcChannel = 1, srcArea = 1;
getBatchChannelArea(origin, srcBatch, srcChannel, srcArea);
getBatchChannelArea(dest, dstBatch, dstChannel, dstArea);
if (dstBatch != srcBatch) {
return 0;
}
if (dstChannel != srcChannel) {
return 0;
}
if (dstArea != srcArea) {
return 0;
}
auto totalSize = dstBatch * dstChannel * dstArea;
int srcSize = 1;
int dstSize = 1;
int res = 1;
for (int i=0; i<3; ++i) {
if (region.size[i] == 1) {
continue;
}
if (region.src.stride[i] != region.dst.stride[i]) {
if (dstArea == 1) {
// Batch / Channel transpose
return 0;
}
res = 2;
}
srcSize += (region.size[i] - 1) * region.src.stride[i];
dstSize += (region.size[i] - 1) * region.dst.stride[i];
}
if (srcSize != totalSize || dstSize != totalSize ) {
return 0;
}
// Check if it can be described as an NHWC <-> NC4HW4 transpose
if (2 == res) {
int srcChannelStride;
int dstChannelStride;
int srcAreaStride;
int dstAreaStride;
if (MNN_DATA_FORMAT_NC4HW4 == srcFormat) {
srcChannelStride = srcArea;
srcAreaStride = 1;
dstChannelStride = 1;
dstAreaStride = srcChannel;
} else {
srcChannelStride = 1;
srcAreaStride = srcChannel;
dstAreaStride = 1;
dstChannelStride = srcArea;
}
for (int i=0; i<3; ++i) {
if (region.size[i] == 1) {
continue;
}
if (region.size[i] == dstBatch) {
if (region.src.stride[i] != region.dst.stride[i]) {
return 0;
}
continue;
}
if (region.size[i] == srcChannel) {
if (region.src.stride[i] != srcChannelStride || region.dst.stride[i] != dstChannelStride) {
return 0;
}
}
if (region.size[i] == srcArea) {
if (region.src.stride[i] != srcAreaStride || region.dst.stride[i] != dstAreaStride) {
return 0;
}
}
}
return 2;
}
return 1;
}
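// onExecute interprets the result: 0 means the region is not a plain layout convert,
// 1 means a convert whose src/dst strides already match (element order unchanged), and
// 2 means it additionally performs the NHWC <-> NC4HW4 style reorder, so PackInfo's
// axisStride / insideStride are swapped accordingly.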
ErrorCode RasterExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(inputs.size() == 1);
MNN_ASSERT(outputs.size() == 1);
auto input = inputs[0];
auto output = outputs[0];
auto des = TensorUtils::getDescribe(input);
auto input = inputs[0];
auto output = outputs[0];
auto des = TensorUtils::getDescribe(input);
auto outputDes = TensorUtils::getDescribe(output);
mNeedZero = !TensorUtils::regionIsFull(input);
mTempInputCopy.clear();
mNeedZero = !TensorUtils::regionIsFull(input);
mZeroPoint = 0;
mTempInput.clear();
mFastBlit.clear();
mFuseRaster.first = false;
if(des->regions.size() > 1) {
mFuseRaster.first = true;
mFuseRaster.second = des->regions.size();
auto& slice0 = des->regions[0];
for (int i = 1; i < des->regions.size(); ++i) {
mTempOutput = nullptr;
auto midFormat = MNN_DATA_FORMAT_NCHW;
mTempInputCopy.clear();
mOutputPtr = output;
mFast = false;
int pack = PACK_NUMBER;
// all srcFormat == dstFormat == NC4HW4 : fast path
if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
mFast = true;
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (slice0.origin->deviceId() != slice.origin->deviceId()) {
if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
mFast = false;
break;
}
if (!OpCommonUtils::canBlitFast(slice, output, pack, true)) {
mFast = false;
break;
}
}
if (mFast) {
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (slice.origin == nullptr) {
continue;
}
Tensor::InsideDescribe::Region newRegion;
OpCommonUtils::turnToPackRegion(slice, newRegion, output, pack, true);
mFastBlit.emplace_back(std::make_pair(slice.origin, std::move(newRegion)));
}
return NO_ERROR;
}
}
mSingleConvert = 0;
// srcNum == 1 && srcFormat != dstFormat : Single Convert
if (des->regions.size() == 1) {
mSingleConvert = _singleConvert(des->regions[0], output);
if (mSingleConvert > 0) {
return NO_ERROR;
}
}
// Acquire Buffer for temp output
// TODO: optimize it
if (MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat) {
mTempOutput.reset(new Tensor);
TensorUtils::setupTensorInfo(output, mTempOutput.get(), midFormat);
}
if (nullptr != mTempOutput) {
auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
mOutputPtr = mTempOutput.get();
}
// if an input is NC4HW4, add a layout convert
std::vector<Tensor*> forRelease;
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
auto origin = slice.origin;
if (slice.mask != 0) {
mTempInputCopy.emplace_back(std::make_pair(origin, &slice));
continue;
}
// if the tensor is not NC4HW4 or has already been merged, there is no need to handle it
if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
mTempInputCopy.emplace_back(std::make_pair(origin, &slice));
continue;
}
// if NC4HW4's C % PACK_NUMBER == 0, change the convert into a transpose and fuse it
if (origin->batch() == 1 && origin->channel() % pack == 0) {
int channel = origin->channel();
int area = 1;
// conv3d/pool3d will have 5 dims: area = depth * width * height; otherwise area = width * height
for (int d = 2; d < origin->dimensions(); d++) {
area *= origin->length(d);
}
Tensor::InsideDescribe::Region regionTmp;
regionTmp.src.offset = 0;
regionTmp.src.stride[0] = area * pack;
regionTmp.src.stride[1] = 1;
regionTmp.src.stride[2] = pack;
regionTmp.dst.offset = 0;
regionTmp.dst.stride[0] = area * pack;
regionTmp.dst.stride[1] = area;
regionTmp.dst.stride[2] = 1;
regionTmp.size[0] = channel / pack;
regionTmp.size[1] = pack;
regionTmp.size[2] = area;
regionTmp.origin = slice.origin;
bool merge = TensorUtils::fuseRegion(regionTmp, slice);
if (merge) {
// cache the merged tensor
slice.mask = 1;
mTempInputCopy.emplace_back(std::make_pair(origin, &slice));
continue;
}
}
auto cache = static_cast<CUDABackend*>(backend())->getCache();
auto tempTensor = cache->findCacheTensor(origin, midFormat);
if (nullptr == tempTensor) {
std::shared_ptr<Tensor> newTensor(new Tensor);
TensorUtils::copyShape(origin, newTensor.get());
TensorUtils::getDescribe(newTensor.get())->dimensionFormat = midFormat;
newTensor->buffer().type = origin->getType();
TensorUtils::setLinearLayout(newTensor.get());
mTempInput.insert(std::make_pair(origin, newTensor.get()));
auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
tempTensor = newTensor.get();
TensorUtils::getDescribe(tempTensor)->useCount = TensorUtils::getDescribe(origin)->useCount;
cache->pushCacheTensor(newTensor, origin, midFormat);
}
if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) {
forRelease.emplace_back(tempTensor);
}
mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice));
}
if(mTempInputCopy.size() > 1) {
mFuseRaster.first = true;
mFuseRaster.second = mTempInputCopy.size();
auto& slice0 = *mTempInputCopy[0].second;
for (int i = 1; i < mTempInputCopy.size(); ++i) {
auto& slice = *mTempInputCopy[i].second;
if (mTempInputCopy[i].first != mTempInputCopy[0].first) {
mFuseRaster.first = false;
break;
}
@ -52,81 +322,141 @@ ErrorCode RasterExecution::onResize(const std::vector<Tensor*>& inputs, const st
}
}
}
//mFuseRaster.first = false;
if(!mFuseRaster.first) {
for (int i = 0; i < des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (nullptr == slice.origin) {
continue;
}
mTempInputCopy.emplace_back(std::make_pair((void*)slice.origin->deviceId(), &slice));
}
} else {
auto& slice0 = des->regions[0];
if (nullptr != slice0.origin) {
mTempInputCopy.emplace_back(std::make_pair((void*)slice0.origin->deviceId(), &slice0));
}
int regionSize = des->regions.size();
if(mFuseRaster.first) {
auto& slice0 = *mTempInputCopy[0].second;
auto tensor = mTempInputCopy[0].first;
int regionSize = mTempInputCopy.size();
std::vector<int32_t> temp(2*regionSize, 0);
for (int i = 0; i < regionSize; ++i) {
auto& slice = des->regions[i];
auto& slice = *mTempInputCopy[i].second;
temp[i] = slice.src.offset;
temp[regionSize+i] = slice.dst.offset;
//printf("%d-", tmpSrc[i]);
//printf("%d-%d-%d\n", regionSize, temp[i], temp[regionSize+i]);
}
//save srcOffset/dstOffset to Device
offsetTensor.reset(Tensor::createDevice<int32_t>({2*regionSize}));
backend()->onAcquireBuffer(offsetTensor.get(), Backend::STATIC);
mOffset = (void *)offsetTensor.get()->buffer().device;
cuda_check(cudaMemcpy(mOffset, temp.data(), 2*regionSize*sizeof(int32_t), cudaMemcpyHostToDevice));
mTempInputCopy.clear();
mTempInputCopy.emplace_back(std::make_pair(tensor, &slice0));
}
for (auto t : forRelease) {
backend()->onReleaseBuffer(t, Backend::DYNAMIC);
}
if (nullptr != mTempOutput) {
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
ErrorCode RasterExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
void RasterExecution::executeFaster(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) const {
auto bn = static_cast<CUDABackend*>(backend());
auto input = inputs[0];
auto output = outputs[0];
auto bytes = bn->getBytes(output);
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto input = inputs[0];
auto output = outputs[0];
auto bytes = input->getType().bytes();
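// Zero the whole output first when the regions do not fully cover it.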
if (mNeedZero) {
runtime->memset((void*)output->deviceId(), 0, output->size());
auto size = static_cast<CUDABackend*>(backend())->realSize(output) * bytes;
cudaMemset((uint8_t*)output->deviceId(), 0, size);
}
// Use mFastBlit
for (auto& iter : mFastBlit) {
auto srcPtr = (uint8_t*)iter.first->deviceId() + iter.second.src.offset * bytes;
auto dstPtr = (uint8_t*)output->deviceId() + iter.second.dst.offset * bytes;
RasterBlit(dstPtr, srcPtr, iter.second.size, iter.second.src.stride, iter.second.dst.stride, bytes * PACK_NUMBER, runtime);
}
}
ErrorCode RasterExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
if (mFast) {
executeFaster(inputs, outputs);
return NO_ERROR;
}
auto bn = static_cast<CUDABackend*>(backend());
auto input = inputs[0];
auto output = outputs[0];
auto bytes = bn->getBytes(output);
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
if (mSingleConvert > 0) {
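// Fast path: a single region that only changes layout is handled with one pack/unpack kernel between NC4HW4 and a linear format.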
auto realInput = TensorUtils::getDescribe(input)->regions[0].origin;
int srcBatch = 1, srcChannel = 1, srcArea = 1;
getBatchChannelArea(realInput, srcBatch, srcChannel, srcArea);
auto sourceFormat = TensorUtils::getDescribe(realInput)->dimensionFormat;
auto destFormat = TensorUtils::getDescribe(output)->dimensionFormat;
int batchStride = srcChannel * srcArea * bytes;
int inputBatchStride = batchStride;
int outputBatchStride = batchStride;
PackInfo pack;
pack.inside = srcArea;
pack.axis = srcChannel;
pack.unit = PACK_NUMBER;
pack.outside = srcBatch;
if (mSingleConvert == 1) {
pack.axisStride = srcArea;
pack.insideStride = 1;
} else if (mSingleConvert == 2) {
pack.axisStride = 1;
pack.insideStride = srcChannel;
}
auto srcPtr = (void*)realInput->deviceId();
auto dstPtr = (void*)output->deviceId();
if (MNN_DATA_FORMAT_NC4HW4 == sourceFormat) {
if (realInput->dimensions() <= 1) {
cudaMemcpy(dstPtr, srcPtr, bn->realSize(realInput) * bytes, cudaMemcpyDeviceToDevice);
return NO_ERROR;
}
UnpackBuffer(dstPtr, srcPtr, &pack, bytes, runtime);
} else {
if (output->dimensions() <= 1) {
cudaMemcpy(dstPtr, srcPtr, bn->realSize(realInput) * bytes, cudaMemcpyDeviceToDevice);
return NO_ERROR;
}
PackBuffer(dstPtr, srcPtr, &pack, bytes, runtime);
}
return NO_ERROR;
}
if (mNeedZero) {
auto size = static_cast<CUDABackend*>(backend())->realSize(mOutputPtr) * bytes;
cudaMemset((uint8_t*)mOutputPtr->deviceId(), 0, size);
}
for (auto& iter : mTempInput) {
backend()->onCopyBuffer(iter.first, iter.second);
}
if(mFuseRaster.first) {
MNN_ASSERT(mTempInputCopy.size() == 1);
auto& iter = mTempInputCopy[0];
auto& slice = *(iter.second);
auto srcPtr = (uint8_t*)iter.first;
auto dstPtr = (uint8_t*)output->deviceId();
auto srcPtr = (uint8_t*)iter.first->deviceId();
auto dstPtr = (uint8_t*)mOutputPtr->deviceId();
//printf("fuseRaster:%p-%p\n", mSrcOffset, mDstOffset);
FuseRasterBlit(dstPtr, srcPtr, slice.size, slice.src.stride, slice.dst.stride, mFuseRaster.second, mOffset, bytes, runtime);
return NO_ERROR;
} else {
for (auto& iter : mTempInputCopy) {
auto srcPtr = (uint8_t*)iter.first->deviceId() + iter.second->src.offset * bytes;
auto dstPtr = (uint8_t*)mOutputPtr->deviceId() + iter.second->dst.offset * bytes;
RasterBlit(dstPtr, srcPtr, iter.second->size, iter.second->src.stride, iter.second->dst.stride, bytes, runtime);
}
}
for (int u = 0; u < mTempInputCopy.size(); ++u) {
auto& iter = mTempInputCopy[u];
auto& slice = *(iter.second);
auto srcPtr = (uint8_t*)iter.first + slice.src.offset * bytes;
auto dstPtr = (uint8_t*)output->deviceId() + slice.dst.offset * bytes;
RasterBlit(dstPtr, srcPtr, slice.size, slice.src.stride, slice.dst.stride, bytes, runtime);
if (nullptr != mTempOutput) {
backend()->onCopyBuffer(mTempOutput.get(), output);
}
return NO_ERROR;
}
RasterExecution::RasterExecution(Backend* backend) : Execution(backend) {
// Do nothing
}
RasterExecution::~RasterExecution() {
// Do nothing
}
class RasterCreator : public CUDABackend::Creator {
class RasterExecutionFactory : public CUDABackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
const MNN::Op* op, Backend* backend) const {
return new RasterExecution(backend);
}
};
static CUDACreatorRegister<RasterCreator> __init(OpType_Raster);
} // namespace CUDA
} // namespace MNN
static CUDACreatorRegister<RasterExecutionFactory> __init(OpType_Raster);
}
}

View File

@@ -2,37 +2,43 @@
// RasterExecution.hpp
// MNN
//
// Created by MNN on 2020/07/30.
// Created by MNN on 2020/04/02.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef RasterExecution_hpp
#define RasterExecution_hpp
#include <map>
#include <memory>
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
#include <map>
#include <set>
#include "core/TensorUtils.hpp"
namespace MNN {
namespace CUDA {
class RasterExecution : public Execution {
public:
RasterExecution(Backend *backend);
virtual ~RasterExecution();
RasterExecution(Backend* bn) : Execution(bn) {
// Do nothing
}
virtual ~ RasterExecution() {
// Do nothing
}
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
void executeFaster(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) const;
private:
std::vector<std::pair<void *, Tensor::InsideDescribe::Region *>> mTempInputCopy;
std::map<Tensor*, Tensor*> mTempInput;
std::vector<std::pair<const Tensor*, Tensor::InsideDescribe::Region*>> mTempInputCopy;
std::vector<std::pair<const Tensor*, Tensor::InsideDescribe::Region>> mFastBlit;
std::shared_ptr<Tensor> mTempOutput;
Tensor* mOutputPtr;
bool mNeedZero = false;
bool mFast = false;
int mSingleConvert = 0;
int32_t mZeroPoint = 0;
std::pair<bool, int> mFuseRaster;
void *mOffset;
std::shared_ptr<Tensor> offsetTensor;
};
} // namespace CUDA
} // namespace MNN
}
}
#endif

View File

@@ -1,99 +1,19 @@
#include "ReductionExecution.hpp"
namespace MNN {
namespace CUDA {
ReductionExecution::ReductionExecution(ReductionType opType, int axis, Backend *backend) : Execution(backend) {
mType = opType;
mAxis = axis;
auto staticPool = static_cast<CUDABackend*>(backend)->getStaticBufferPool();
mParam = staticPool->alloc(sizeof(ReduceParam));
}
ReductionExecution::~ ReductionExecution() {
// Do nothing
auto staticPool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
staticPool->free(mParam);
}
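// The reduce kernels view the input as [outside, axis, inside]; each thread handles one (outside, inside) pair and accumulates along axis, with a grid-stride loop over inside * outside.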
template <typename T>
__global__ void SUM(const T *input, T *output, int inside, int axis, int outside) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / inside;
int x = i % inside;
T sumValue = (T)0;
const T* basicInput = input + y * axis * inside + x;
for (int v=0; v<axis; ++v) {
sumValue += basicInput[v * inside];
}
output[y * inside + x] = sumValue;
}
return;
}
template <typename T>
__global__ void MEAN(const T *input, T *output, int inside, int axis, int outside) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / inside;
int x = i % inside;
T sumValue = (T)0;
const T* basicInput = input + y * axis * inside + x;
for (int v=0; v<axis; ++v) {
sumValue += basicInput[v * inside];
}
output[y * inside + x] = sumValue / (T)axis;
}
return;
}
template <typename T>
__global__ void MINIMUM(const T *input, T *output, int inside, int axis, int outside) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / inside;
int x = i % inside;
const T* basicInput = input + y * axis * inside + x;
T res = basicInput[0];
for (int v=1; v<axis; ++v) {
res = min(basicInput[v * inside], res);
}
output[y * inside + x] = res;
}
return;
}
template <typename T>
__global__ void MAXIMUM(const T *input, T *output, int inside, int axis, int outside) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / inside;
int x = i % inside;
const T* basicInput = input + y * axis * inside + x;
T res = basicInput[0];
for (int v=1; v<axis; ++v) {
res = max(basicInput[v * inside], res);
}
output[y * inside + x] = res;
}
return;
}
template <typename T>
__global__ void PROD(const T *input, T *output, int inside, int axis, int outside) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / inside;
int x = i % inside;
const T* basicInput = input + y * axis * inside + x;
T res = basicInput[0];
for (int v=1; v<axis; ++v) {
res *= basicInput[v * inside];
}
output[y * inside + x] = res;
}
return;
}
ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto input = (void*)inputs[0]->deviceId();
auto output = (void*)outputs[0]->deviceId();
ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
int inside = 1;
int outside = 1;
@@ -104,52 +24,88 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
for (int i=mAxis+1; i<inputs[0]->dimensions(); ++i) {
inside *= inputs[0]->length(i);
}
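// Upload the reduce geometry to device memory once at resize time; the kernels read it through a ReduceParam* at execute time.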
mCpuParam.inside = inside;
mCpuParam.outside = outside;
mCpuParam.axis = axis;
cuda_check(cudaMemcpy((uint8_t*)mParam.first + mParam.second, &mCpuParam, sizeof(ReduceParam), cudaMemcpyHostToDevice));
return NO_ERROR;
}
ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto input = (void*)inputs[0]->deviceId();
auto output = (void*)outputs[0]->deviceId();
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
int inside = mCpuParam.inside;
int outside = mCpuParam.outside;
int count = inside * outside;
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
auto param = (ReduceParam*)((uint8_t*)mParam.first + mParam.second);
if (inputs[0]->getType() == halide_type_of<float>()) {
switch (mType) {
case ReductionType_MEAN:
MEAN<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
return NO_ERROR;
case ReductionType_SUM:
SUM<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
return NO_ERROR;
case ReductionType_MINIMUM:
MINIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
return NO_ERROR;
case ReductionType_MAXIMUM:
MAXIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
return NO_ERROR;
case ReductionType_PROD:
PROD<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
return NO_ERROR;
if (static_cast<CUDABackend*>(backend())->useFp16()) {
switch (mType) {
case ReductionType_MEAN:
MEAN<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
return NO_ERROR;
case ReductionType_SUM:
SUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
return NO_ERROR;
case ReductionType_MINIMUM:
MINIMUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
return NO_ERROR;
case ReductionType_MAXIMUM:
MAXIMUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
return NO_ERROR;
case ReductionType_PROD:
PROD<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
return NO_ERROR;
}
} else {
switch (mType) {
case ReductionType_MEAN:
MEAN<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
return NO_ERROR;
case ReductionType_SUM:
SUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
return NO_ERROR;
case ReductionType_MINIMUM:
MINIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
return NO_ERROR;
case ReductionType_MAXIMUM:
MAXIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
return NO_ERROR;
case ReductionType_PROD:
PROD<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
return NO_ERROR;
}
}
MNN_ASSERT(false);
return NOT_SUPPORT;
}
MNN_ASSERT(inputs[0]->getType() == halide_type_of<int32_t>());
switch (mType) {
case ReductionType_MEAN:
MEAN<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
MEAN<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
case ReductionType_SUM:
SUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
SUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
case ReductionType_MINIMUM:
MINIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
MINIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
case ReductionType_MAXIMUM:
MAXIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
MAXIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
case ReductionType_PROD:
PROD<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
PROD<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
case ReductionType_ANY:
MAXIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
MAXIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
case ReductionType_ALL:
MINIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
MINIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
}
MNN_ASSERT(false);

Some files were not shown because too many files have changed in this diff.