From 0c718e552b2e9b3c6724cb25257c8b96f93d87ec Mon Sep 17 00:00:00 2001 From: xiaying Date: Fri, 18 Feb 2022 11:30:27 +0800 Subject: [PATCH] [Sync] Sync internal Gitlab --- CMakeLists.txt | 103 +- README.md | 12 +- README_CN.md | 14 +- express/CMakeLists.txt | 4 + express/Executor.cpp | 1 + express/Expr.cpp | 23 + express/MathOp.cpp | 8 + express/NeuralNetWorkOp.cpp | 68 ++ express/module/Module.cpp | 18 +- express/module/NMSModule.hpp | 2 +- include/MNN/ImageProcess.hpp | 13 + include/MNN/Interpreter.hpp | 8 +- include/MNN/expr/Executor.hpp | 2 +- include/MNN/expr/MathOp.hpp | 1 + include/MNN/expr/NeuralNetWorkOp.hpp | 10 +- package_scripts/linux/build_whl.sh | 4 +- package_scripts/mac/build_whl.sh | 8 +- package_scripts/win/build_bridge.ps1 | 236 +++-- package_scripts/win/build_lib.ps1 | 160 +-- package_scripts/win/build_tools.ps1 | 81 +- package_scripts/win/build_whl.ps1 | 45 +- project/ios/MNN.xcodeproj/project.pbxproj | 34 +- project/ios/Playground/AppDelegate.mm | 63 +- pymnn/CMakeLists.txt | 74 +- .../examples/MNNEngineDemo/mobilenet_demo.py | 2 +- pymnn/pip_package/MNN/cv/__init__.py | 41 + pymnn/pip_package/MNN/expr/__init__.py | 167 ++- pymnn/pip_package/MNN/numpy/__init__.py | 73 +- pymnn/pip_package/build_deps.py | 9 +- pymnn/pip_package/build_wheel.py | 6 + pymnn/pip_package/setup.py | 110 +- pymnn/src/MNN.cc | 225 ++++- pymnn/src/MNNPyBridge.h | 10 +- pymnn/src/cv.h | 177 +++- pymnn/src/expr.h | 242 ++++- pymnn/src/nn.h | 155 ++- pymnn/src/util.h | 309 +++--- pymnn/test/model_test.py | 7 +- pymnn/test/unit_test.py | 77 +- schema/current/UserDefine_generated.h | 28 +- schema/default/UserDefine.fbs | 1 + source/backend/cpu/CPUBackend.cpp | 2 +- source/backend/cpu/CPUImageProcess.cpp | 62 +- source/backend/cpu/CPUImageProcess.hpp | 12 +- source/backend/cpu/CPUNonMaxSuppressionV2.cpp | 3 + source/backend/cpu/CPUResizeCache.hpp | 9 +- source/backend/cpu/CPUScatterNd.cpp | 9 +- .../cpu/compute/ImageProcessFunction.cpp | 18 + .../cpu/compute/ImageProcessFunction.hpp | 4 + source/backend/cpu/x86_x64/CMakeLists.txt | 86 +- .../_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S | 27 +- ..._AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S | 27 +- .../_AVX_MNNPackedSparseMatMulEpx1EFMA_ASM.S | 53 +- .../_AVX_MNNPackedSparseMatMulEpx4EFMA_ASM.S | 50 +- .../x86_x64/avx512/SparseKernelFunction.hpp | 66 +- .../avx512/SparseKernelFunctionEpx1.cpp | 7 +- .../avx512/SparseKernelFunctionEpx4.cpp | 22 +- .../avx512/SparseKernelFunctionEpx8.cpp | 22 +- .../avx512/_AVX512_MNNGemmFloatUnit16x8.S | 29 +- .../avx512/_AVX512_MNNGemmFloatUnit32x8.S | 29 +- .../avx512/_AVX512_MNNGemmFloatUnit48x8.S | 30 +- .../_AVX512_MNNGemmFloatUnit48x8Fused.S | 56 +- .../_AVX512_MNNPackedSparseMatMulEpx4.S | 90 +- .../x86_x64/avx512/_AVX512_TransposeMain.S | 24 +- .../avxfma/_AVX_MNNGemmFloatUnitMainFMA.S | 25 +- .../avxfma/_AVX_MNNGemmFloatUnitMainFMA6x16.S | 25 +- .../_AVX_MNNGemmFloatUnitMainFMA_Fused.S | 49 + .../_AVX_MNNPackedSparseMatMulEpx1NFMA_ASM.S | 51 +- .../_AVX_MNNPackedSparseMatMulEpx4NFMA_ASM.S | 51 +- source/backend/cuda/CMakeLists.txt | 6 +- source/backend/cuda/core/CUDABackend.cpp | 401 +++++++- source/backend/cuda/core/CUDABackend.hpp | 10 +- .../backend/cuda/core/runtime/CUDARuntime.cpp | 60 +- .../backend/cuda/core/runtime/CUDARuntime.hpp | 56 +- .../cuda/execution/BatchMatMulExecution.cu | 119 --- .../cuda/execution/BatchMatMulExecution.hpp | 23 - .../backend/cuda/execution/BinaryExecution.cu | 7 +- .../cuda/execution/ConvDepthWiseExecution.cu | 584 ++++++----- .../cuda/execution/ConvDepthWiseExecution.hpp | 33 +- 
.../execution/ConvSingleInputExecution.cu | 203 ++-- .../execution/ConvSingleInputExecution.hpp | 6 +- .../execution/DeconvSingleInputExecution.cu | 450 +++++---- .../execution/DeconvSingleInputExecution.hpp | 99 +- source/backend/cuda/execution/ImageColumn.cu | 705 +++++++++++++ source/backend/cuda/execution/ImageColumn.cuh | 24 + .../backend/cuda/execution/InterpExecution.cu | 156 ++- .../cuda/execution/LayerNormExecution.cu | 61 +- .../{CUDALoop.cpp => LoopExecution.cpp} | 60 +- .../backend/cuda/execution/MNNCUDADefine.hpp | 18 + .../cuda/execution/MNNCUDAFunction.cuh | 38 + .../backend/cuda/execution/MatMulExecution.cu | 83 +- .../cuda/execution/MatMulExecution.hpp | 1 + .../backend/cuda/execution/PReLUExecution.cu | 43 +- .../backend/cuda/execution/PReLUExecution.hpp | 4 +- .../backend/cuda/execution/PoolExecution.cu | 257 ++++- source/backend/cuda/execution/Raster.cu | 687 +++++++++---- source/backend/cuda/execution/Raster.cuh | 15 +- .../cuda/execution/RasterExecution.cpp | 448 +++++++-- .../cuda/execution/RasterExecution.hpp | 36 +- .../cuda/execution/ReductionExecution.cu | 172 ++-- .../cuda/execution/ReductionExecution.hpp | 4 + .../cuda/execution/ReductionTemplate.cuh | 93 ++ .../backend/cuda/execution/ScaleExecution.cu | 60 +- .../backend/cuda/execution/ScaleExecution.hpp | 6 +- .../backend/cuda/execution/SelectExecution.cu | 7 +- .../cuda/execution/SoftmaxExecution.cu | 159 ++- .../cuda/execution/SoftmaxExecution.hpp | 16 +- .../backend/cuda/execution/TensorCoreGemm.cu | 219 +++- .../backend/cuda/execution/TensorCoreGemm.cuh | 15 +- .../cuda/execution/TensorCoreGemmPacked.cu | 184 ++++ .../cuda/execution/TensorCoreGemmPacked.cuh | 8 + source/backend/cuda/execution/Transpose.cu | 291 ++++++ source/backend/cuda/execution/Transpose.cuh | 44 + .../backend/cuda/execution/UnaryExecution.cu | 94 +- source/backend/metal/MetalBackend.hpp | 4 +- source/backend/metal/MetalBackend.mm | 4 +- .../opencl/core/runtime/OpenCLWrapper.cpp | 10 +- .../vulkan/component/VulkanMemoryPool.cpp | 4 +- source/common/WinogradInt8Helper.hpp | 2 +- source/core/BufferAllocator.cpp | 20 +- source/core/BufferAllocator.hpp | 22 +- source/core/Interpreter.cpp | 76 +- source/cv/ImageProcess.cpp | 25 +- source/geometry/GeometryGather.cpp | 4 +- source/geometry/GeometryOPRegister.cpp | 2 - source/geometry/GeometrySelect.cpp | 2 +- source/geometry/GeometryShape.cpp | 43 + source/geometry/GeometryStridedSlice.cpp | 26 + source/shape/ShapeRegister.cpp | 2 + source/shape/ShapeReshape.cpp | 4 +- source/shape/ShapeResize.cpp | 8 +- source/shape/ShapeScatterNd.cpp | 2 +- source/shape/ShapeShape.cpp | 28 + source/shape/ShapeStridedSlice.cpp | 9 +- source/shape/ShapeWhere.cpp | 7 +- test.bat | 7 + test.ps1 | 233 +++++ test/CMakeLists.txt | 3 + test/MNNTestSuite.cpp | 8 +- test/MNNTestSuite.h | 5 +- test/core/BackendTest.cpp | 105 +- test/core/BufferAllocatorTest.cpp | 3 +- test/expr/MatMulTest.cpp | 2 +- test/expr/ZeroShapeTest.cpp | 4 +- test/main.cpp | 6 +- test/op/RasterTest.cpp | 43 + test/op/SelectTest.cpp | 19 +- test/op/SortTest.cpp | 92 ++ test/op/StridedSliceTest.cpp | 20 + test/op/UnaryTest.cpp | 2 +- tools/converter/CMakeLists.txt | 20 +- tools/converter/source/onnx/IfOnnx.cpp | 12 +- tools/converter/source/onnx/LoopOnnx.cpp | 4 + tools/converter/source/onnx/onnxConverter.cpp | 4 +- tools/converter/source/optimizer/Program.cpp | 4 + .../source/optimizer/merge/ConvBiasAdd.cpp | 2 +- .../optimizer/merge/ConvertMatMulToConv2D.cpp | 13 +- .../source/optimizer/merge/MergeHelpers.cpp | 3 + 
.../optimizer/merge/TensorConverterMerge.cpp | 2 +- .../source/optimizer/onnxextra/OnnxClip.cpp | 30 +- .../optimizer/onnxextra/OnnxLSTMMerge.cpp | 16 +- .../onnxextra/OnnxNonMaxSuppression.cpp | 15 +- .../onnxextra/OnnxSequenceGRUMerge.cpp | 13 +- .../source/optimizer/passes/Pass.hpp | 1 + .../source/optimizer/passes/PassRegistry.cpp | 8 - .../postconvert/AddTensorFormatConverter.cpp | 3 + .../optimizer/postconvert/ReIndexTensor.cpp | 6 + tools/cpp/MNNV2Basic.cpp | 2 + tools/cpp/backendTest.cpp | 6 +- tools/cpp/testModelWithDescrisbe.cpp | 8 +- tools/cv/CMakeLists.txt | 12 +- tools/cv/include/cv/imgproc/draw.hpp | 8 +- tools/cv/include/cv/imgproc/geometric.hpp | 7 +- tools/cv/include/cv/imgproc/structural.hpp | 12 +- tools/cv/include/cv/types.hpp | 50 +- tools/cv/source/imgcodecs/imgcodecs.cpp | 15 +- tools/cv/source/imgproc/color.cpp | 5 +- tools/cv/source/imgproc/draw.cpp | 951 +++++++++++++++++- tools/cv/source/imgproc/filter.cpp | 2 +- tools/cv/source/imgproc/geometric.cpp | 78 +- tools/cv/source/imgproc/structural.cpp | 119 ++- tools/cv/test/imgcodecs/codecs_test.cpp | 1 - tools/cv/test/imgproc/color_test.cpp | 1 - tools/cv/test/imgproc/draw_test.cpp | 101 +- tools/cv/test/imgproc/filter_test.cpp | 1 - tools/cv/test/imgproc/geometric_test.cpp | 1 - tools/cv/test/imgproc/miscellaneous_test.cpp | 1 - tools/cv/test/imgproc/structral_test.cpp | 47 +- tools/cv/test/test_env.hpp | 10 + tools/quantization/calibration.cpp | 2 +- tools/script/formatLicence.py | 2 +- tools/script/modelTest.py | 2 +- tools/train/source/nn/NN.cpp | 4 + 193 files changed, 9361 insertions(+), 2733 deletions(-) delete mode 100644 source/backend/cuda/execution/BatchMatMulExecution.cu delete mode 100644 source/backend/cuda/execution/BatchMatMulExecution.hpp create mode 100644 source/backend/cuda/execution/ImageColumn.cu create mode 100644 source/backend/cuda/execution/ImageColumn.cuh rename source/backend/cuda/execution/{CUDALoop.cpp => LoopExecution.cpp} (88%) create mode 100644 source/backend/cuda/execution/MNNCUDADefine.hpp create mode 100644 source/backend/cuda/execution/MNNCUDAFunction.cuh create mode 100644 source/backend/cuda/execution/ReductionTemplate.cuh create mode 100644 source/backend/cuda/execution/TensorCoreGemmPacked.cu create mode 100644 source/backend/cuda/execution/TensorCoreGemmPacked.cuh create mode 100644 source/backend/cuda/execution/Transpose.cu create mode 100644 source/backend/cuda/execution/Transpose.cuh create mode 100644 test.bat create mode 100644 test.ps1 create mode 100644 test/op/RasterTest.cpp create mode 100644 test/op/SortTest.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2aaa9656..b7fe8136 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,9 +24,14 @@ add_definitions("-DMNN_VERSION_MAJOR=${MNN_VERSION_MAJOR}") add_definitions("-DMNN_VERSION_MINOR=${MNN_VERSION_MINOR}") add_definitions("-DMNN_VERSION_PATCH=${MNN_VERSION_PATCH}") -# CMP0048 is related to letting CMake managing the package version for us - -cmake_policy(SET CMP0048 NEW) +# Clear VERSION variables when no VERSION is given to project() +if(POLICY CMP0048) + cmake_policy(SET CMP0048 NEW) +endif() +# MSVC runtime library flags are selected by an abstraction. 
+if(POLICY CMP0091)
+  cmake_policy(SET CMP0091 NEW)
+endif()
 project(MNN VERSION ${MNN_VERSION_MAJOR}.${MNN_VERSION_MINOR}.${MNN_VERSION_PATCH}.${MNN_VERSION_BUILD} LANGUAGES C CXX ASM)
 # compiler options
 set(CMAKE_C_STANDARD 99)
@@ -35,14 +40,6 @@ set(CMAKE_MODULE_PATH
   ${CMAKE_MODULE_PATH}
   "${CMAKE_CURRENT_LIST_DIR}/cmake"
 )
-#add_custom_command(OUTPUT "${CMAKE_CURRENT_LIST_DIR}/include/MNN/VCS.h"
-#    COMMAND ${CMAKE_COMMAND} "-DNAMES=MNN"
-#    "-DMNN_SOURCE_DIR=${CMAKE_CURRENT_LIST_DIR}"
-#    "-DHEADER_FILE=${CMAKE_CURRENT_LIST_DIR}/include/MNN/VCS.h"
-#    -P "${CMAKE_CURRENT_LIST_DIR}/cmake/GenerateVersionFromVCS.cmake"
-#    COMMENT "Generating Version Control Info"
-#)
-#add_custom_target (GenVCSHDR DEPENDS "${CMAKE_CURRENT_LIST_DIR}/include/MNN/VCS.h")
 # Required for OpenCL/OpenGL/Vulkan CodeGen
 include(FindPythonInterp REQUIRED)
 # build options
@@ -107,8 +104,8 @@ IF(WIN32)
     SET(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
     SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
-    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4819")
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4819")
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4101")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4101")
   ENDIF()
 ENDIF()
@@ -118,13 +115,54 @@ IF( MNN_ENABLE_COVERAGE)
     SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fprofile-arcs -ftest-coverage")
 ENDIF()
+# do this before protobuf, to make sure the WinCRT config of protobuf and MNN is the same
+if(MSVC)
+  # same as protobuf; otherwise the config is inconsistent
+  if(CMAKE_VERSION VERSION_GREATER 3.15 OR CMAKE_VERSION VERSION_EQUAL 3.15)
+    set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded$<$<CONFIG:Debug>:Debug>)
+    if(NOT MNN_WIN_RUNTIME_MT)
+      set(CMAKE_MSVC_RUNTIME_LIBRARY ${CMAKE_MSVC_RUNTIME_LIBRARY}DLL)
+    endif()
+  else()
+    foreach(flag_var
+        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
+        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+      if (MNN_WIN_RUNTIME_MT)
+        if(${flag_var} MATCHES "/MD")
+          string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+        endif()
+      else ()
+        if(${flag_var} MATCHES "/MT")
+          string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
+        endif()
+      endif ()
+    endforeach()
+  endif()
+  set(protobuf_BUILD_SHARED_LIBS ${MNN_BUILD_SHARED_LIBS})
+endif()
+
 include(${CMAKE_CURRENT_LIST_DIR}/cmake/macros.cmake)
 IF(MNN_BUILD_PROTOBUFFER)
   IF(MNN_BUILD_CONVERTER)
+    IF(MSVC)
+      set(protobuf_BUILD_SHARED_LIBS ${MNN_BUILD_SHARED_LIBS})
+      IF((NOT MNN_BUILD_SHARED_LIBS) AND (NOT MNN_WIN_RUNTIME_MT))
+        message(FATAL_ERROR "When MNN_BUILD_CONVERTER=ON and MNN_BUILD_SHARED_LIBS=OFF, MNN_WIN_RUNTIME_MT must be ON.
Because protobuf does not support this config (static /MD)")
+      ENDIF()
+    ENDIF()
     add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/3rd_party/protobuf/cmake)
   ENDIF()
 ENDIF()
+# specify the source file encoding explicitly, to fix cross-platform garbled output;
+# we need to do this after protobuf, which sets a different execution-charset
+IF(MSVC)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /source-charset:utf-8")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /source-charset:utf-8")
+ENDIF()
+
 IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT MNN_BUILD_SHARED_LIBS AND NOT (MSVC OR WIN32))
     SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
     SET(MNN_SEP_BUILD OFF CACHE BOOL "" FORCE)
@@ -206,26 +244,7 @@ message(STATUS "\tThreadPool: ${MNN_USE_THREAD_POOL}")
 message(STATUS "\tHidden: ${MNN_HIDDEN}")
 message(STATUS "\tBuild Path: ${CMAKE_CURRENT_BINARY_DIR}")
-if(MSVC)
-    if(${CMAKE_VERSION} VERSION_LESS "3.14.0")
-        message(FATAL_ERROR "MNN requires CMake 3.14+ to build on Windows!")
-    endif()
-    foreach(flag_var
-        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-        if (MNN_WIN_RUNTIME_MT)
-            if(${flag_var} MATCHES "/MD")
-                string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-            endif()
-        else ()
-            if(${flag_var} MATCHES "/MT")
-                string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
-            endif()
-        endif ()
-    endforeach()
-elseif(CMAKE_SYSTEM_NAME MATCHES "^Android" OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
+if(CMAKE_SYSTEM_NAME MATCHES "^Android" OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
     add_definitions(-fPIC)
 endif()
 if(CMAKE_SYSTEM_NAME MATCHES "^Android")
@@ -561,6 +580,9 @@ if (MNN_INTERNAL)
     target_compile_options(MNN_Express PRIVATE -DMNN_INTERNAL_ENABLED)
     include(${CMAKE_CURRENT_LIST_DIR}/source/internal/auth/CMakeLists.txt)
     include(${CMAKE_CURRENT_LIST_DIR}/source/internal/logging/CMakeLists.txt)
+    if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
+        list(APPEND MNN_EXTRA_DEPENDS "-lcurl -lssl -lcrypto")
+    endif()
 endif()
 
 # Train
@@ -661,7 +683,18 @@ if(APPLE)
 endif()
 add_dependencies(MNN MNNCore MNNCV MNNTransform MNNMath MNNCPU)
 add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tools/converter)
+IF(WIN32 AND MNN_BUILD_CONVERTER AND MNN_BUILD_SHARED_LIBS)
+# Because of dllimport/dllexport, we merge MNN and MNNConvertDeps together, which depends on protobuf
+    target_link_libraries(MNN PUBLIC ${Protobuf_LIBRARIES})
+ENDIF()
+# Merge MNN/MNNExpress/MNNOpenCV and other backends into one .lib/.dll on Windows
 add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tools/cv)
+IF(MNN_BUILD_OPENCV AND NOT MNN_SEP_BUILD)
+    IF(MSVC)
+        target_compile_definitions(MNNOpenCV PRIVATE "-DBUILDING_MNN_DLL" INTERFACE "-DUSING_MNN_DLL")
+    ENDIF()
+    target_sources(MNN PRIVATE $<TARGET_OBJECTS:MNNOpenCV>)
+ENDIF()
 
 if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
     # Using -pthread, needed by the thread-safe implementation of glibc, is better than only using -lpthread
@@ -753,6 +786,10 @@ ELSE()
         ARCHIVE DESTINATION lib
         FRAMEWORK DESTINATION /Library/Frameworks/
     )
+    if (NOT MNN_AAPL_FMWK)
+        INSTALL(FILES ${MNN_PUB_HDRS} DESTINATION include/MNN/)
+        INSTALL(FILES ${MNN_EXPR_PUB_HDRS} DESTINATION include/MNN/expr/)
+    endif()
     FOREACH(HDR ${MNN_EXPR_PUB_HDRS})
         SET_SOURCE_FILES_PROPERTIES(${HDR} PROPERTIES MACOSX_PACKAGE_LOCATION Headers/expr/ )
     ENDFOREACH()
diff --git a/README.md b/README.md
index 7390a09f..11fccb33 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,17 @@ Interpreter consists of Engine and Backends. The former is responsible for the l
 
 Scan the following QR codes to join the DingTalk discussion groups. The group discussions are predominantly in Chinese, but we welcome and will help English speakers.
 
-See https://www.yuque.com/mnn/cn/feedback for dingtalk group barcodes.
+Group #1 (Full):
+
+
+Group #2 (Full):
+
+
+Group #3:
+
 
 ## License
 Apache 2.0
diff --git a/README_CN.md b/README_CN.md
index a857ad24..7ed22ca1 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -56,7 +56,19 @@ Converter由Frontends和Graph Optimize构成。前者负责支持不同的训练
 Interpreter由Engine和Backends构成。前者负责模型的加载、计算图的调度;后者包含各计算设备下的内存分配、Op实现。在Engine和Backends中,MNN应用了多种优化方案,包括在卷积和反卷积中应用Winograd算法、在矩阵乘法中应用Strassen算法、低精度计算、Neon优化、手写汇编、多线程优化、内存复用、异构计算等。
 
 ## 社区交流与反馈
-扫描二维码加入钉钉讨论群,见:https://www.yuque.com/mnn/cn/feedback
+扫描二维码加入钉钉讨论群。
+
+一群(已满):
+
+
+二群(已满):
+
+
+三群:
+
 
 ## License
 Apache 2.0
diff --git a/express/CMakeLists.txt b/express/CMakeLists.txt
index 0b2c3ffd..190c18ca 100644
--- a/express/CMakeLists.txt
+++ b/express/CMakeLists.txt
@@ -18,6 +18,10 @@ IF(MNN_SEP_BUILD)
         add_library(MNN_Express SHARED ${MNN_EXPR_SRCS})
     endif()
     target_link_libraries(MNN_Express MNN)
+    install(TARGETS MNN_Express
+        LIBRARY DESTINATION lib
+        ARCHIVE DESTINATION lib
+    )
 ELSE()
     add_library(MNN_Express OBJECT ${MNN_EXPR_SRCS})
 ENDIF()
diff --git a/express/Executor.cpp b/express/Executor.cpp
index 572b9d10..cef13be7 100644
--- a/express/Executor.cpp
+++ b/express/Executor.cpp
@@ -536,6 +536,7 @@ ErrorCode Executor::ComputeCache::compute() {
     if (mShapeDirty) {
         auto code = resize();
         if (NO_ERROR != code) {
+            mShapeDirty = true;
             return code;
         }
     }
diff --git a/express/Expr.cpp b/express/Expr.cpp
index 201dda10..aca1e571 100644
--- a/express/Expr.cpp
+++ b/express/Expr.cpp
@@ -116,6 +116,9 @@ Variable::Info* Expr::outputInfo(int index) const {
 void Expr::_addLinkForInputs(EXPRP expr) {
     auto inputs = expr->inputs();
     for (int i=0; i<inputs.size(); ++i) {
+        if (inputs[i].get() == nullptr) {
+            continue;
+        }
         bool findEmpty = false;
         auto inputExpr = inputs[i]->mFrom;
         for (int j=0; j<inputExpr->mTo.size(); ++j) {
@@ -290,6 +293,10 @@ bool Expr::requireInfo() {
     }
     for (int i = 0; i < mInputs.size(); ++i) {
         auto& v = mInputs[i];
+        if (v->getInfo()->size == 0) {
+            // zero shape
+            continue;
+        }
         if (mInside->mReq.shapeNeedContent[i]) {
             // For shape need content, the content must not be nullptr
             auto ptr = v->readInternal(true);
@@ -338,6 +345,9 @@ void Expr::replace(EXPRP old, EXPRP from) {
         return;
     }
     for (auto input : old->inputs()) {
+        if (input.get() == nullptr) {
+            continue;
+        }
         for (int j=0; j<input->mFrom->mTo.size(); ++j) {
             auto ref = input->mFrom->mTo[j].lock();
             if (ref.get() == old.get()) {
@@ -346,6 +356,9 @@
         }
     }
     for (auto input : from->inputs()) {
+        if (input.get() == nullptr) {
+            continue;
+        }
         bool hasSet = false;
         for (int j=0; j<input->mFrom->mTo.size(); ++j) {
             auto ref = input->mFrom->mTo[j].lock();
@@ -567,6 +580,9 @@ void Expr::visit(EXPRP expr, const std::function<bool(EXPRP)>& before, const std
         return;
     }
     for (int i = 0; i < expr->inputs().size(); ++i) {
+        if (expr->inputs()[i].get() == nullptr) {
+            continue;
+        }
         visit(expr->inputs()[i]->mFrom, before, after);
     }
     after(expr);
@@ -721,6 +737,9 @@ void Expr::visitOutputs(const std::function<bool(EXPRP, int)>& visit) {
     bool recurse = false;
     auto inputs = expr->inputs();
     for (int i=0; i<inputs.size(); ++i) {
+        if (inputs[i].get() == nullptr) {
+            continue;
+        }
         if (inputs[i]->mFrom.get() == this) {
             recurse = recurse || visit(expr, i);
         }
@@ -924,6 +943,10 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
         op->name = expr->name();
         op->inputIndexes.resize(expr->inputs().size());
         for (int i = 0; i < op->inputIndexes.size(); ++i) {
+            if (expr->inputs()[i] == nullptr) {
+                op->inputIndexes[i] = -1;
+                continue;
+            }
             auto inputExpr = expr->inputs()[i]->expr();
             op->inputIndexes[i] = varIndexInfo[inputExpr.first] + inputExpr.second;
         }
     }
diff --git a/express/MathOp.cpp b/express/MathOp.cpp
index a6f83919..db97e0e9 100644
--- a/express/MathOp.cpp
+++ b/express/MathOp.cpp
@@ -1119,6 +1119,14 @@ VARP _ScatterNd(VARP indices, VARP updates, VARP shape) {
     return (Variable::create(Expr::create(std::move(op), {indices, updates, shape})));
 }
+VARP _ScatterNd(VARP indices, VARP updates, VARP shape, VARP input) {
+    std::unique_ptr<OpT> op(new OpT);
+    op->main.type = OpParameter_NONE;
+    op->type = OpType_ScatterNd;
+    op->main.value = nullptr;
+    return (Variable::create(Expr::create(std::move(op), {indices, updates, shape, input})));
+}
+
 VARP _OneHot(VARP indices, VARP depth, VARP onValue, VARP offValue, int axis) {
     std::unique_ptr<OpT> op(new OpT);
     op->type = OpType_OneHot;
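The four-input _ScatterNd overload above scatters `updates` into a copy of `input` rather than into a zero-initialized tensor of the given shape. A minimal usage sketch — values, shapes, and the ONNX-style ScatterND semantics are illustrative assumptions, not taken from the patch:

#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

int main() {
    int   idx[]  = {1, 3};                   // scatter into rows 1 and 3
    float upd[]  = {9.0f, 10.0f};            // values to write
    int   shp[]  = {4};                      // output shape: [4]
    float base[] = {1.0f, 2.0f, 3.0f, 4.0f}; // base tensor the rest is copied from
    auto indices = _Const(idx, {2, 1}, NCHW, halide_type_of<int>());
    auto updates = _Const(upd, {2}, NCHW);
    auto shape   = _Const(shp, {1}, NCHW, halide_type_of<int>());
    auto input   = _Const(base, {4}, NCHW);
    auto out     = _ScatterNd(indices, updates, shape, input);
    auto ptr     = out->readMap<float>();    // assumed result: {1, 9, 3, 10}
    return 0;
}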
diff --git a/express/NeuralNetWorkOp.cpp b/express/NeuralNetWorkOp.cpp
index d45c969c..01f38a97 100644
--- a/express/NeuralNetWorkOp.cpp
+++ b/express/NeuralNetWorkOp.cpp
@@ -581,6 +581,22 @@ VARP _StridedSlice(VARP input, VARP begin, VARP end, VARP strided, int32_t begin
     op->main.AsStridedSliceParam()->shrinkAxisMask = shrinkAxisMask;
     return (Variable::create(Expr::create(op.get(), {input, begin, end, strided})));
 }
+
+VARP _StridedSliceWrite(VARP input, VARP begin, VARP end, VARP strided, VARP write, int32_t beginMask,
+                        int32_t endMask, int32_t ellipsisMask, int32_t newAxisMask, int32_t shrinkAxisMask) {
+    std::unique_ptr<OpT> op(new OpT);
+    op->type = OpType_StridedSlice;
+    op->main.type = OpParameter_StridedSliceParam;
+    op->main.value = new StridedSliceParamT;
+
+    op->main.AsStridedSliceParam()->T = DataType_DT_FLOAT;
+    op->main.AsStridedSliceParam()->beginMask = beginMask;
+    op->main.AsStridedSliceParam()->endMask = endMask;
+    op->main.AsStridedSliceParam()->ellipsisMask = ellipsisMask;
+    op->main.AsStridedSliceParam()->newAxisMask = newAxisMask;
+    op->main.AsStridedSliceParam()->shrinkAxisMask = shrinkAxisMask;
+    return (Variable::create(Expr::create(op.get(), {input, begin, end, strided, write})));
+}
 /*Transposes x.
 Args:
 x: A variable.
@@ -1830,5 +1846,57 @@ VARP _Where(VARP x) {
     return (Variable::create(Expr::create(std::move(op), {x})));
 }
+VARP _Sort(VARP x, int axis, bool arg, bool descend) {
+    std::unique_ptr<OpT> op(new OpT);
+    op->type = OpType_TopKV2;
+    op->main.type = OpParameter_TopKV2;
+    auto topk = new TopKV2T;
+    topk->largest = descend;
+    op->main.value = topk;
+    auto shape = x->getInfo()->dim;
+    axis = axis < 0 ? shape.size() + axis : axis;
+    int k = x->getInfo()->dim[axis];
+    std::vector<VARP> inputs {x, _Scalar(k)};
+    if (axis + 1 != shape.size()) {
+        inputs.push_back(_Scalar(axis));
+    }
+    auto expr = Expr::create(op.get(), inputs, 2);
+    return Variable::create(expr, arg);
+}
+
+VARP _Raster(const std::vector<VARP>& vars, const std::vector<int>& region, const std::vector<int>& shape) {
+    std::unique_ptr<MNN::OpT> op(new MNN::OpT);
+    op->type = OpType_Raster;
+    auto extra = new ExtraT;
+    // set shape
+    std::unique_ptr<AttributeT> shapeAttr(new AttributeT);
+    shapeAttr->key = "shape";
+    shapeAttr->list.reset(new ListValueT);
+    shapeAttr->list->i = shape;
+    extra->attr.push_back(std::move(shapeAttr));
+    // set region
+    std::unique_ptr<AttributeT> regionAttr(new AttributeT);
+    regionAttr->key = "region";
+    regionAttr->list.reset(new ListValueT);
+    regionAttr->list->i = region;
+    extra->attr.push_back(std::move(regionAttr));
+    op->main.type = OpParameter_Extra;
+    op->main.value = extra;
+    return (Variable::create(Expr::create(std::move(op), vars)));
+}
+
+VARP _Nms(VARP boxes, VARP scores, int maxDetections, float iouThreshold, float scoreThreshold) {
+    std::unique_ptr<MNN::OpT> op(new MNN::OpT);
+    op->type = OpType_NonMaxSuppressionV2;
+    std::vector<VARP> vars {boxes, scores, _Scalar(maxDetections)};
+    if (iouThreshold >= 0) {
+        vars.push_back(_Scalar(iouThreshold));
+    }
+    if (scoreThreshold >= 0) {
+        vars.push_back(_Scalar(scoreThreshold));
+    }
+    return (Variable::create(Expr::create(std::move(op), vars)));
+}
+
 } // namespace Express
 } // namespace MNN
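A sketch of how the new _Sort and _Nms entry points are meant to be called, following the default arguments declared in NeuralNetWorkOp.hpp further below; the data values are made up for illustration:

#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

int main() {
    float vals[] = {3.0f, 1.0f, 2.0f, 4.0f};
    auto x = _Const(vals, {4}, NCHW);
    auto ascending  = _Sort(x);                  // values in ascending order
    auto argsorted  = _Sort(x, -1, true);        // arg=true selects the TopKV2 index output
    auto descending = _Sort(x, -1, false, true); // descend=true sets topk->largest

    // _Nms: boxes [N, 4], scores [N]; negative thresholds mean "not set",
    // matching the iouThreshold/scoreThreshold >= 0 checks above.
    float boxes[]  = {0, 0, 10, 10,  1, 1, 11, 11,  50, 50, 60, 60};
    float scores[] = {0.9f, 0.8f, 0.7f};
    auto boxVar    = _Const(boxes, {3, 4}, NCHW);
    auto scoreVar  = _Const(scores, {3}, NCHW);
    auto kept      = _Nms(boxVar, scoreVar, 2, 0.5f); // indices of kept boxes
    auto keptPtr   = kept->readMap<int>();
    return 0;
}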
diff --git a/express/module/Module.cpp b/express/module/Module.cpp
index 17f80b35..af547017 100644
--- a/express/module/Module.cpp
+++ b/express/module/Module.cpp
@@ -166,7 +166,8 @@ public:
         return mModule->onForward(inputs);
     }
     virtual Module* clone(CloneContext* ctx) const override {
-        NetModule* module(new NetModule(mModule, mInfo));
+        std::shared_ptr<Module> submodule(mModule->clone(ctx));
+        NetModule* module(new NetModule(submodule, mInfo));
         return this->cloneBaseTo(ctx, module);
     }
     const Module::Info* info() const {
@@ -223,9 +224,9 @@ static void _loadInputs(Module::Info* info, const std::vector<std::string>& inpu
     }
 }
-Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const std::shared_ptr<Executor::RuntimeManager> rtMgr, const Module::Config* config) {
+Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const std::shared_ptr<Executor::RuntimeManager> _rtMgr, const Module::Config* config) {
     // Check if runtime is valid
-    if (nullptr != rtMgr && rtMgr->getRuntimeInfo().first.empty()) {
+    if (nullptr != _rtMgr && _rtMgr->getRuntimeInfo().first.empty()) {
         MNN_ERROR("Invalid runtime\n");
         return nullptr;
     }
@@ -269,6 +270,17 @@ Module* Module::load(const std::vector<std
     std::shared_ptr<Info> info(new Info);
+    auto rtMgr = _rtMgr;
+    Module::Config defaultConfig;
+    if (nullptr == config) {
+        config = &defaultConfig;
+    }
+    if(nullptr == rtMgr && config->backend != nullptr) {
+        ScheduleConfig sche_config;
+        sche_config.type = config->backend->type;
+        sche_config.backendConfig = config->backend->config;
+        rtMgr.reset(Executor::RuntimeManager::createRuntimeManager(sche_config));
+    }
     if ((!inputs.empty()) && (!outputs.empty())) {
         _loadInputs(info.get(), inputs, net);
         info->runTimeManager = rtMgr;
diff --git a/express/module/NMSModule.hpp b/express/module/NMSModule.hpp
index 5d5cbf4b..7b0b92af 100644
--- a/express/module/NMSModule.hpp
+++ b/express/module/NMSModule.hpp
@@ -16,7 +16,7 @@ public:
         // Do nothing
     }
     virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
-    static NMSModule* create(const Op* op);
+    MNN_PUBLIC static NMSModule* create(const Op* op);
 private:
     NMSModule(){}
diff --git a/include/MNN/ImageProcess.hpp b/include/MNN/ImageProcess.hpp
index 03f3b6b0..4c0af907 100644
--- a/include/MNN/ImageProcess.hpp
+++ b/include/MNN/ImageProcess.hpp
@@ -61,6 +61,7 @@ public:
         /** edge wrapper */
         Wrap wrap = CLAMP_TO_EDGE;
+        bool draw = false;
     };
 
 public:
@@ -148,6 +149,18 @@ public:
     void setPadding(uint8_t value) {
         mPaddingValue = value;
     }
+    /**
+     * @brief draw color into regions of the image.
+     * @param img the image to draw on.
+     * @param w the image's width.
+     * @param h the image's height.
+     * @param c the image's channel count.
+     * @param regions the regions to draw; size is [num * 3], containing num triples of { y, xl, xr }.
+     * @param num the number of regions.
+     * @param color the color to draw.
+     * @return void.
+     */
+    void draw(uint8_t* img, int w, int h, int c, const int* regions, int num, const uint8_t* color);
 private:
     ImageProcess(const Config& config);
     Matrix mTransform;
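A sketch of the new ImageProcess::draw call declared above. The region layout follows the { y, xl, xr } triples from the doc comment; treating xr as inclusive, and calling draw on a default-configured ImageProcess, are assumptions here:

#include <MNN/ImageProcess.hpp>
#include <memory>
#include <vector>
using namespace MNN::CV;

int main() {
    const int w = 8, h = 8, c = 3;
    std::vector<uint8_t> img(w * h * c, 0);  // black RGB canvas
    ImageProcess::Config config;
    config.sourceFormat = RGB;
    config.destFormat   = RGB;
    std::shared_ptr<ImageProcess> process(ImageProcess::create(config));
    int regions[] = {
        2, 1, 5,   // row y=2, columns 1..5
        3, 0, 7,   // row y=3, columns 0..7
    };
    const uint8_t red[] = {255, 0, 0};
    process->draw(img.data(), w, h, c, regions, 2, red);
    return 0;
}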
diff --git a/include/MNN/Interpreter.hpp b/include/MNN/Interpreter.hpp
index 088ab8da..a30c9d06 100644
--- a/include/MNN/Interpreter.hpp
+++ b/include/MNN/Interpreter.hpp
@@ -154,7 +154,7 @@ public:
      * @param keySize deprecated, for future use.
      */
     void setCacheFile(const char* cacheFile, size_t keySize = 128);
-    
+
     /**
      * @brief The API should be called after the last resize session.
      * If the resize session generated new cache info, try to rewrite the cache file.
@@ -357,6 +357,12 @@ public:
      */
     const char* bizCode() const;
 
+    /**
+     * @brief get model UUID
+     * @return Model UUID.
+     */
+    const char* uuid() const;
+
 private:
     static Interpreter* createFromBufferInternal(Content* net);
diff --git a/include/MNN/expr/Executor.hpp b/include/MNN/expr/Executor.hpp
index f3be7ec7..34e731db 100644
--- a/include/MNN/expr/Executor.hpp
+++ b/include/MNN/expr/Executor.hpp
@@ -70,7 +70,7 @@ public:
         return mDebug.get();
     }
     struct Cache;
-    class RuntimeManager {
+    class MNN_PUBLIC RuntimeManager {
     public:
         ~RuntimeManager();
         /**
diff --git a/include/MNN/expr/MathOp.hpp b/include/MNN/expr/MathOp.hpp
index d4b4e93a..7cf9fc0f 100644
--- a/include/MNN/expr/MathOp.hpp
+++ b/include/MNN/expr/MathOp.hpp
@@ -124,6 +124,7 @@ MNN_PUBLIC VARP _ArgMin(VARP input, int axis = 0);
 MNN_PUBLIC VARP _BatchMatMul(VARP x, VARP y, bool adj_x = false, bool adj_y = false);
 MNN_PUBLIC VARP _UnravelIndex(VARP indices, VARP dims);
 MNN_PUBLIC VARP _ScatterNd(VARP indices, VARP updates, VARP shape);
+MNN_PUBLIC VARP _ScatterNd(VARP indices, VARP updates, VARP shape, VARP input);
 MNN_PUBLIC VARP _OneHot(VARP indices, VARP depth, VARP onValue, VARP offValue, int axis = -1);
 MNN_PUBLIC VARP _BroadcastTo(VARP a, VARP shape);
 MNN_PUBLIC VARP _LinSpace(VARP start, VARP stop, VARP num);
diff --git a/include/MNN/expr/NeuralNetWorkOp.hpp b/include/MNN/expr/NeuralNetWorkOp.hpp
index 567d3892..d019b851 100644
--- a/include/MNN/expr/NeuralNetWorkOp.hpp
+++ b/include/MNN/expr/NeuralNetWorkOp.hpp
@@ -63,8 +63,11 @@ MNN_PUBLIC VARP _Softsign(VARP features);
 MNN_PUBLIC std::vector<VARP> _Split(VARP value, INTS size_splits, int axis = 0);
 MNN_PUBLIC VARP _Slice(VARP x, VARP starts, VARP sizes);
 MNN_PUBLIC VARP _StridedSlice(VARP input, VARP begin, VARP end, VARP strided,
-                              int32_t beginMask, int32_t endMask, int32_t ellipsisMask,
-                              int32_t newAxisMask, int32_t shrinkAxisMask);
+                              int32_t beginMask, int32_t endMask, int32_t ellipsisMask,
+                              int32_t newAxisMask, int32_t shrinkAxisMask);
+MNN_PUBLIC VARP _StridedSliceWrite(VARP input, VARP begin, VARP end, VARP strided, VARP write,
+                                   int32_t beginMask, int32_t endMask, int32_t ellipsisMask,
+                                   int32_t newAxisMask, int32_t shrinkAxisMask);
 MNN_PUBLIC VARP _Concat(VARPS values, int axis);
 MNN_PUBLIC VARP _Convert(VARP input, Dimensionformat format);
 MNN_PUBLIC VARP _Transpose(VARP x, INTS perm);
@@ -155,6 +158,9 @@ MNN_PUBLIC VARP _Select(VARP select, VARP input0, VARP input1);
 MNN_PUBLIC std::vector<VARP> _TopKV2(VARP input0, VARP input1);
 MNN_PUBLIC VARP _ImageProcess(VARP input, CV::ImageProcess::Config config, CV::Matrix matrix, int oh, int ow, int oc, int dtype, uint8_t padVal = 0);
 MNN_PUBLIC VARP _Where(VARP x);
+MNN_PUBLIC VARP _Sort(VARP x, int axis = -1, bool arg = false, bool descend = false);
+MNN_PUBLIC VARP _Raster(const std::vector<VARP>& vars, const std::vector<int>& regions, const std::vector<int>& shape);
+MNN_PUBLIC VARP _Nms(VARP boxes, VARP scores, int maxDetections, float iouThreshold = -1, float scoreThreshold = -1);
 } // namespace Express
 } // namespace MNN
diff --git a/package_scripts/linux/build_whl.sh b/package_scripts/linux/build_whl.sh
index 1157cb48..ae3c04b7 100755
--- a/package_scripts/linux/build_whl.sh
+++ b/package_scripts/linux/build_whl.sh
@@ -21,13 +21,13 @@ done
 rm -rf $path && mkdir -p $path
 PACKAGE_PATH=$(realpath $path)
-CMAKE_ARGS="-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON"
+CMAKE_ARGS="-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON"
 if [ ! -z $opencl ]; then
     CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
 fi
 rm -rf pymnn_build && mkdir pymnn_build
 pushd pymnn_build
-cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j24
+cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert MNNOpenCV -j24
 popd
 
 pushd pymnn/pip_package
diff --git a/package_scripts/mac/build_whl.sh b/package_scripts/mac/build_whl.sh
index 82356967..a24552a1 100755
--- a/package_scripts/mac/build_whl.sh
+++ b/package_scripts/mac/build_whl.sh
@@ -19,25 +19,27 @@ while getopts "o:p:v:b" opt; do
     esac
 done
 
+export MACOSX_DEPLOYMENT_TARGET=10.11
+
 ./schema/generate.sh
 
 rm -rf $path && mkdir -p $path
 PACKAGE_PATH=$(realpath $path)
-CMAKE_ARGS="-DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_EXPR_SHAPE_EAGER=ON -DMNN_TRAIN_DEBUG=ON"
+CMAKE_ARGS="-DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_EXPR_SHAPE_EAGER=ON -DMNN_TRAIN_DEBUG=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON"
 if [ ! -z $opencl ]; then
     CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
 fi
 rm -rf pymnn_build && mkdir pymnn_build
 pushd pymnn_build
-cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j8
+cmake $CMAKE_ARGS ..
&& make MNN MNNTrain MNNConvert MNNOpenCV -j8 popd pushd pymnn/pip_package echo -e "__version__ = '$mnn_version'" > MNN/version.py rm -rf build && mkdir build rm -rf dist && mkdir dist -if [ -z $python_versions ]; then +if [ -z "$python_versions" ]; then python build_wheel.py --version $mnn_version else for env in $python_versions; do diff --git a/package_scripts/win/build_bridge.ps1 b/package_scripts/win/build_bridge.ps1 index ef32b7f7..465efc7f 100644 --- a/package_scripts/win/build_bridge.ps1 +++ b/package_scripts/win/build_bridge.ps1 @@ -1,66 +1,63 @@ # MNNPyBridge -# |-- Debug -# | |--- MD -# | |--- MT -# | |--- Static -# | -# |-- Release -# |--- MD -# |--- MT -# |--- Static +# |-- include +# |-- wrapper +# |-- test (Release + Dynamic + MD) +# |-- x64 +# |-- x86 +# |-- lib +# |-- x64 +# | |-- (Debug/Release x Dynamic/Static x MD/MT) +# | +# |-- x86 +# |-- (Debug/Release x Dynamic/Static x MD/MT) Param( [Parameter(Mandatory=$true)][String]$version, [Parameter(Mandatory=$true)][String]$pyc_env, [Parameter(Mandatory=$true)][String]$mnn_path, + [Parameter(Mandatory=$true)][String]$python_path, + [Parameter(Mandatory=$true)][String]$numpy_path, [Parameter(Mandatory=$true)][String]$path, + [Switch]$train_api, [Switch]$x86 ) -# build process may failed because of lnk1181, but be success when run again -# Run expr, return if success, otherwise try again until try_times -function Retry([String]$expr, [Int]$try_times) { - $cnt = 0 - do { - $cnt++ - try { - Invoke-Expression $expr - return - } catch { } - } while($cnt -lt $try_times) - throw "Failed: $expr" +# build it according to cmake_cmd, exit 1 when any error occur +function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja") { + Invoke-Expression $cmake_cmd + # build process may failed because of lnk1181, but be success when run again + $try_times = 2 + if ($LastExitCode -eq 0) { + For ($cnt = 0; $cnt -lt $try_times; $cnt++) { + try { + Invoke-Expression $ninja_cmd + if ($LastExitCode -eq 0) { + return + } + } catch {} + } + } + popd + exit 1 } $erroractionpreference = "stop" +mkdir -p $path -ErrorAction Ignore $PACKAGE_PATH = $(Resolve-Path $path).Path -$PACKAGE_LIB_PATH = "$PACKAGE_PATH\lib" -if ($x86) { - $PACKAGE_LIB_PATH = "$PACKAGE_LIB_PATH\x86" -} else { - $PACKAGE_LIB_PATH = "$PACKAGE_LIB_PATH\x64" -} -$MNN_PACKAGE_PATH = $(Resolve-Path $mnn_path).Path - -pushd pymnn\3rd_party -Remove-Item MNN -Recurse -ErrorAction Ignore -mkdir -p MNN\lib -cp -r $MNN_PACKAGE_PATH\* MNN\lib -cp -r ..\..\include MNN -popd +$arch = $(If($x86) {"x86"} Else {"x64"}) +$PACKAGE_LIB_PATH = "$PACKAGE_PATH/lib/$arch" +$TEST_TOOL_PATH = "$PACKAGE_PATH/test/$arch" #clear and create package directory powershell ./schema/generate.ps1 pushd $PACKAGE_PATH -Remove-Item include -Recurse -ErrorAction Ignore -Remove-Item wrapper -Recurse -ErrorAction Ignore -mkdir -p include -mkdir -p wrapper -mkdir -p $PACKAGE_LIB_PATH\Debug\MD -ErrorAction SilentlyContinue -mkdir -p $PACKAGE_LIB_PATH\Debug\MT -ErrorAction SilentlyContinue -mkdir -p $PACKAGE_LIB_PATH\Debug\Static -ErrorAction SilentlyContinue -mkdir -p $PACKAGE_LIB_PATH\Release\MD -ErrorAction SilentlyContinue -mkdir -p $PACKAGE_LIB_PATH\Release\MT -ErrorAction SilentlyContinue -mkdir -p $PACKAGE_LIB_PATH\Release\Static -ErrorAction SilentlyContinue +Remove-Item -Path include, wrapper -Recurse -ErrorAction Ignore +mkdir -p include, wrapper +popd +Remove-Item -Path $PACKAGE_LIB_PATH, $TEST_TOOL_PATH -Recurse -ErrorAction Ignore +mkdir -p $PACKAGE_LIB_PATH, $TEST_TOOL_PATH +pushd $PACKAGE_LIB_PATH +mkdir -p 
Debug\Dynamic\MD, Debug\Dynamic\MT, Debug\Static\MD, Debug\Static\MT, Release\Dynamic\MD, Release\Dynamic\MT, Release\Static\MD, Release\Static\MT popd # assume $PACKAGE_PATH exist @@ -71,8 +68,16 @@ cp -r pymnn\pip_package\MNN pymnn_pyc_tmp pushd pymnn_pyc_tmp Remove-Item MNN -Include __pycache__ -Recurse pushd MNN -rm -r -force tools -(Get-Content __init__.py).replace('from . import tools', '') | Set-Content __init__.py +function Remove([String]$module) { + rm -r -force $module + (Get-Content __init__.py).replace("from . import $module", "") | Set-Content __init__.py +} +Remove "tools" +if (!$train_api) { + Remove "data" + Remove "optim" +} + popd popd conda activate $pyc_env @@ -83,59 +88,108 @@ Set-Content -Path pymnn_pyc_tmp\version.py -Value "__version__ = '$version'" cp -r .\pymnn_pyc_tmp\* $PACKAGE_PATH\wrapper -Force rm -r -force pymnn_pyc_tmp -$CMAKE_ARGS = "-DPYMNN_USE_ALINNPYTHON=ON -DPYMNN_RUNTIME_CHECK_VM=ON -DPYMNN_EXPR_API=ON -DPYMNN_NUMPY_USABLE=ON -DPYMNN_TRAIN_API=ON" +$mnn_path = $(Resolve-Path $mnn_path).Path +$python_path = $(Resolve-Path $python_path).Path +$numpy_path = $(Resolve-Path $numpy_path).Path + +$CMAKE_ARGS = "-DPYMNN_USE_ALINNPYTHON=ON -DPYMNN_RUNTIME_CHECK_VM=ON -DPYMNN_EXPR_API=ON -DPYMNN_NUMPY_USABLE=ON -DPYMNN_BUILD_TEST=OFF" +if ($train_api) { + $CMAKE_ARGS = "$CMAKE_ARGS -DPYMNN_TRAIN_API=ON" +} +$CMAKE_ARGS = "$CMAKE_ARGS -Dmnn_path=$mnn_path -Dpython_path=$python_path -Dnumpy_path=$numpy_path" Remove-Item pymnn_build -Recurse -ErrorAction Ignore mkdir pymnn_build pushd pymnn_build -##### Debug/MT #### -#Remove-Item CMakeCache.txt -ErrorAction Ignore -#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON ../pymnn" -#Retry "ninja" 2 -#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\MT -#cp mnnpybridge.dll $PACKAGE_LIB_PATH\Debug\MT -#cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\MT -#rm mnnpybridge.* +function exist([String]$build_type, [String]$lib_type, [String]$crt_type) { + function _exist([String]$lib) { + $lib_dir = "$lib/lib/$arch/$build_type/$lib_type/$crt_type" + return $((Test-Path -Path $lib_dir) -and ((Get-ChildItem -Path "$lib_dir/*" -Include "*.lib").Count -ne 0)) + } + return $((_exist $mnn_path) -and (_exist $python_path) -and (_exist $numpy_path)) +} -##### Debug/MD #### -#Remove-Item CMakeCache.txt -ErrorAction Ignore -#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF ../pymnn" -#Retry "ninja" 2 -#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\MD -#cp mnnpybridge.dll $PACKAGE_LIB_PATH\Debug\MD -#cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\MD -#rm mnnpybridge.* +function log([String]$msg) { + echo "================================" + echo "Build MNNPyBridge $msg" + echo "================================" +} -##### Debug/Static #### -#Remove-Item CMakeCache.txt -ErrorAction Ignore -#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn" -#Retry "ninja" 2 -#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\Static -#rm mnnpybridge.* +##### Debug/Dynamic/MT #### +if (exist Debug Dynamic MT) { + log "Debug/Dynamic/MT" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON ../pymnn" + cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\MT + rm mnnpybridge.* +} -##### Release/MT #### -#Remove-Item CMakeCache.txt -ErrorAction Ignore -#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS 
-DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON ../pymnn" -#Retry "ninja" 2 -#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\MT -#cp mnnpybridge.dll $PACKAGE_LIB_PATH\Release\MT -#cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\MT -#rm mnnpybridge.* +##### Debug/Dynamic/MD #### +if (exist Debug Dynamic MD) { + log "Debug/Dynamic/MD" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug ../pymnn" + cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\MD + rm mnnpybridge.* +} -##### Release/MD #### -Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF ../pymnn" -Retry "ninja" 2 -cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\MD -cp mnnpybridge.dll $PACKAGE_LIB_PATH\Release\MD -cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\MD -rm mnnpybridge.* +##### Debug/Static/MT #### +if (exist Debug Static MT) { + log "Debug/Static/MT" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn" + cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\Static\MT + rm mnnpybridge.* +} -##### Release/Static #### -#Remove-Item CMakeCache.txt -ErrorAction Ignore -#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn" -#Retry "ninja" 2 -#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\Static +##### Debug/Static/MD #### +if (exist Debug Static MD) { + log "Debug/Static/MD" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn" + cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\Static\MD + rm mnnpybridge.* +} + +##### Release/Dynamic/MT #### +if (exist Release Dynamic MT) { + log "Release + MT" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON ../pymnn" + cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MT + rm mnnpybridge.* +} + +##### Release/Dynamic/MD #### +if (exist Release Dynamic MD) { + log "Release + MD" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release ../pymnn" + cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MD + #cp mnnpybridge_test.exe $TEST_TOOL_PATH + #cp $mnn_path/lib/$arch/Release/MD/MNN.dll $TEST_TOOL_PATH + #cp $python_path/lib/$arch/Release/MD/python.dll $TEST_TOOL_PATH + #cp $numpy_path/lib/$arch/Release/MD/numpy_python.dll $TEST_TOOL_PATH + rm mnnpybridge.* +} + +##### Release/Static/MT #### +if (exist Release Static MT) { + log "Release/Static/MT" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn" + cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\Static\MT + rm mnnpybridge.* +} + +##### Release/Static/MD #### +if (exist Release Static MD) { + log "Release/Static/MD" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn" + cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\Static\MD + rm mnnpybridge.* +} popd \ No newline at end of file diff --git a/package_scripts/win/build_lib.ps1 b/package_scripts/win/build_lib.ps1 index a899ee1c..ae24da31 
100644 --- a/package_scripts/win/build_lib.ps1 +++ b/package_scripts/win/build_lib.ps1 @@ -1,49 +1,47 @@ # MNN -# |-- Debug -# | |--- MD -# | |--- MT -# | |--- Static -# | -# |-- Release -# |--- MD -# |--- MT -# |--- Static +# |-- include +# |-- lib +# |-- Debug +# | |--- Dynamic +# | | |--- MD +# | | |--- MT +# | | +# | |--- Static +# | |--- MD +# | |--- MT +# | +# |-- Release +# |--- Dynamic +# | |--- MD +# | |--- MT +# | +# |--- Static +# |--- MD +# |--- MT +# Param( [Parameter(Mandatory=$true)][String]$path, - [String]$backends + [String]$backends, + [Switch]$x86 ) -# build process may failed because of lnk1181, but be success when run again -# Run expr, return if success, otherwise try again until try_times -function Retry([String]$expr, [Int]$try_times) { - $cnt = 0 - do { - $cnt++ - try { - Invoke-Expression $expr - return - } catch { } - } while($cnt -lt $try_times) - throw "Failed: $expr" -} - $erroractionpreference = "stop" -Remove-Item $path -Recurse -ErrorAction Ignore -mkdir -p $path +New-Item -Path $path -ItemType Directory -ErrorAction Ignore $PACKAGE_PATH = $(Resolve-Path $path).Path +$PACKAGE_LIB_PATH = "$PACKAGE_PATH/lib/$(If ($x86) {"x86"} Else {"x64"})" +Remove-Item -Path $PACKAGE_LIB_PATH -Recurse -ErrorAction Ignore +mkdir -p $PACKAGE_LIB_PATH #clear and create package directory powershell ./schema/generate.ps1 -pushd $PACKAGE_PATH -mkdir -p Debug\MD -mkdir -p Debug\MT -mkdir -p Debug\Static -mkdir -p Release\MD -mkdir -p Release\MT -mkdir -p Release\Static +Remove-Item -Path $PACKAGE_PATH/include -Recurse -ErrorAction Ignore +cp -r include $PACKAGE_PATH +cp -r tools/cv/include/cv $PACKAGE_PATH/include +pushd $PACKAGE_LIB_PATH +mkdir -p Debug\Dynamic\MD, Debug\Dynamic\MT, Debug\Static\MD, Debug\Static\MT, Release\Dynamic\MD, Release\Dynamic\MT, Release\Static\MD, Release\Static\MT popd -$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON" +$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_AVX512=ON" if ($backends -ne $null) { Foreach ($backend in $backends.Split(",")) { if ($backend -eq "opencl") { @@ -58,53 +56,83 @@ Remove-Item build -Recurse -ErrorAction Ignore mkdir build pushd build -##### Debug/MT #### +function log([String]$msg) { + echo "================================" + echo "Build MNN (CPU $backends) $msg" + echo "================================" +} + +# build it according to cmake_cmd, exit 1 when any error occur +function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja MNN") { + Invoke-Expression $cmake_cmd + # build process may failed because of lnk1181, but be success when run again + $try_times = 2 + if ($LastExitCode -eq 0) { + For ($cnt = 0; $cnt -lt $try_times; $cnt++) { + try { + Invoke-Expression $ninja_cmd + if ($LastExitCode -eq 0) { + return + } + } catch {} + } + } + popd + exit 1 +} + +##### Debug/Dynamic/MT #### +log "Debug/Dynamic/MT" Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON .." -Retry "ninja" 2 -cp MNN.lib $PACKAGE_PATH\Debug\MT -cp MNN.dll $PACKAGE_PATH\Debug\MT -cp MNN.pdb $PACKAGE_PATH\Debug\MT +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON .." +cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Debug\Dynamic\MT rm MNN.* -##### Debug/MD #### +##### Debug/Dynamic/MD #### +log "Debug/Dynamic/MD" Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF .." 
-Retry "ninja" 2 -cp MNN.lib $PACKAGE_PATH\Debug\MD -cp MNN.dll $PACKAGE_PATH\Debug\MD -cp MNN.pdb $PACKAGE_PATH\Debug\MD +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF .." +cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Debug\Dynamic\MD rm MNN.* -##### Debug/Static #### +##### Debug/Static/MT #### +log "Debug/Static/MT" Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .." -Retry "ninja" 2 -cp MNN.lib $PACKAGE_PATH\Debug\Static +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF .." +cp MNN.lib $PACKAGE_LIB_PATH\Debug\Static\MT rm MNN.* -##### Release/MT #### +##### Debug/Static/MD #### +log "Debug/Static/MD" Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON .." -Retry "ninja" 2 -cp MNN.lib $PACKAGE_PATH\Release\MT -cp MNN.dll $PACKAGE_PATH\Release\MT -cp MNN.pdb $PACKAGE_PATH\Release\MT +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .." +cp MNN.lib $PACKAGE_LIB_PATH\Debug\Static\MD rm MNN.* -##### Release/MD #### +##### Release/Dynamic/MT #### +log "Release/Dynamic/MT" Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF .." -Retry "ninja" 2 -cp MNN.lib $PACKAGE_PATH\Release\MD -cp MNN.dll $PACKAGE_PATH\Release\MD -cp MNN.pdb $PACKAGE_PATH\Release\MD +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON .." +cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MT rm MNN.* -##### Release/Static #### +##### Release/Dynamic/MD #### +log "Release/Dynamic/MD" Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .." -Retry "ninja" 2 -cp MNN.lib $PACKAGE_PATH\Release\Static +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF .." +cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MD +rm MNN.* + +##### Release/Static/MT #### +log "Release/Static/MT" +Remove-Item CMakeCache.txt -ErrorAction Ignore +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF .." +cp MNN.lib $PACKAGE_LIB_PATH\Release\Static\MT + +##### Release/Static/MD #### +log "Release/Static/MD" +Remove-Item CMakeCache.txt -ErrorAction Ignore +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .." 
+cp MNN.lib $PACKAGE_LIB_PATH\Release\Static\MD popd \ No newline at end of file diff --git a/package_scripts/win/build_tools.ps1 b/package_scripts/win/build_tools.ps1 index d480cf17..5e3cd38c 100644 --- a/package_scripts/win/build_tools.ps1 +++ b/package_scripts/win/build_tools.ps1 @@ -1,5 +1,6 @@ Param( [Parameter(Mandatory=$true)][String]$path, + [Switch]$dynamic_link, [String]$backends, [Switch]$build_all, [Switch]$build_train, # MNN_BUILD_TRAIN @@ -23,20 +24,6 @@ if ($build_all) { $build_demo = $true } -# build process may failed because of lnk1181, but be success when run again -# Run expr, return if success, otherwise try again until try_times -function Retry([String]$expr, [Int]$try_times) { - $cnt = 0 - do { - $cnt++ - try { - Invoke-Expression $expr - return - } catch { } - } while($cnt -lt $try_times) - throw "Failed: $expr" -} - $erroractionpreference = "stop" Remove-Item $path -Recurse -ErrorAction Ignore mkdir -p $path @@ -44,7 +31,12 @@ $TOOLS_PATH = $(Resolve-Path $path).Path powershell ./schema/generate.ps1 -$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF" +$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_AVX512=ON" +if ($dynamic_link) { + $CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_SHARED_LIBS=ON" +} else { + $CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON" +} if ($build_train) { $CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_TRAIN=ON" } @@ -59,6 +51,11 @@ if ($build_evaluation) { } if ($build_converter) { $CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_CONVERTER=ON" + if ($dynamic_link) { + $CMAKE_ARGS = "$CMAKE_ARGS -Dprotobuf_BUILD_SHARED_LIBS=ON" + } else { + $CMAKE_ARGS = "$CMAKE_ARGS -Dprotobuf_BUILD_SHARED_LIBS=OFF" + } } if ($build_benchmark) { $CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_BENCHMARK=ON" @@ -83,37 +80,37 @@ Remove-Item build -Recurse -ErrorAction Ignore mkdir build pushd build +# build it according to cmake_cmd, exit 1 when any error occur +function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja") { + Invoke-Expression $cmake_cmd + # build process may failed because of lnk1181, but be success when run again + $try_times = 2 + if ($LastExitCode -eq 0) { + For ($cnt = 0; $cnt -lt $try_times; $cnt++) { + try { + Invoke-Expression $ninja_cmd + if ($LastExitCode -eq 0) { + return + } + } catch {} + } + } + popd + exit 1 +} + Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS .." -Retry "ninja" 2 +Build "cmake -G Ninja $CMAKE_ARGS .." 
-$PRODUCTS = "" -if ($build_train) { - $PRODUCTS = "$PRODUCTS transformer.out.exe train.out.exe rawDataTransform.out.exe dataTransformer.out.exe runTrainDemo.out.exe" -} -if ($build_tools) { - $PRODUCTS = "$PRODUCTS MNNV2Basic.out.exe mobilenetTest.out.exe backendTest.out.exe testModel.out.exe testModelWithDescrisbe.out.exe getPerformance.out.exe checkInvalidValue.out.exe timeProfile.out.exe" -} -if ($build_quantools) { - $PRODUCTS = "$PRODUCTS quantized.out.exe quantized_model_optimize.out.exe" -} -if ($build_evaluation) { - $PRODUCTS = "$PRODUCTS classficationTopkEval.out.exe" -} -if ($build_converter) { - $PRODUCTS = "$PRODUCTS MNNDump2Json.exe MNNConvert.exe" -} -if ($build_benchmark) { - $PRODUCTS = "$PRODUCTS benchmark.out.exe benchmarkExprModels.out.exe" -} -if ($build_test) { - $PRODUCTS = "$PRODUCTS run_test.out.exe" -} -if ($build_demo) { - $PRODUCTS = "$PRODUCTS pictureRecognition.out.exe pictureRotate.out.exe multiPose.out.exe segment.out.exe expressDemo.out.exe transformerDemo.out.exe rasterDemo.out.exe" +$PRODUCTS = $(Get-ChildItem -Path . -Include "*.exe" -Name) +if ($dynamic_link) { + $PRODUCTS = "$PRODUCTS MNN.dll" + if ($build_converter) { + $PRODUCTS = "$PRODUCTS ./3rd_party/protobuf/cmake/libprotobuf.dll" + } } -Foreach ($PRODUCT in $PRODUCTS.Split(" ")) { +Foreach ($PRODUCT in $PRODUCTS.Trim().Split()) { Invoke-Expression "cp $PRODUCT $TOOLS_PATH" } diff --git a/package_scripts/win/build_whl.ps1 b/package_scripts/win/build_whl.ps1 index ec2f5fc3..4ed10229 100644 --- a/package_scripts/win/build_whl.ps1 +++ b/package_scripts/win/build_whl.ps1 @@ -6,25 +6,28 @@ Param( [Switch]$x86 ) -# build process may failed because of lnk1181, but be success when run again -# Run expr, return if success, otherwise try again until try_times -function Retry([String]$expr, [Int]$try_times) { - $cnt = 0 - do { - $cnt++ - try { - Invoke-Expression $expr - return - } catch { } - } while($cnt -lt $try_times) - throw "Failed: $expr" +# build it according to cmake_cmd, exit 1 when any error occur +function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja") { + Invoke-Expression $cmake_cmd + # build process may failed because of lnk1181, but be success when run again + $try_times = 2 + if ($LastExitCode -eq 0) { + For ($cnt = 0; $cnt -lt $try_times; $cnt++) { + try { + Invoke-Expression $ninja_cmd + if ($LastExitCode -eq 0) { + return + } + } catch {} + } + } + exit 1 } $erroractionpreference = "stop" $python_versions = $pyenvs.Split(",") -Remove-Item $path -Recurse -ErrorAction Ignore -mkdir -p $path +New-Item -Path $path -ItemType Directory -ErrorAction Ignore $PACKAGE_PATH = $(Resolve-Path $path).Path $ARGS = "--version $version" if ($x86) { @@ -37,7 +40,7 @@ powershell ./schema/generate.ps1 Remove-Item pymnn_build -Recurse -ErrorAction Ignore mkdir pymnn_build pushd pymnn_build -$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON " +$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_AVX512=ON" if ($backends -ne $null) { Foreach($backend in $backends.Split(",")) { if ($backend -eq "opencl") { @@ -47,8 +50,7 @@ if ($backends -ne $null) { } } } -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS .." -Retry "ninja MNN MNNTrain MNNConvert" 2 +Build "cmake -G Ninja $CMAKE_ARGS .." 
"ninja MNN MNNTrain MNNConvert MNNOpenCV" popd pushd pymnn/pip_package @@ -59,12 +61,15 @@ mkdir dist mkdir build if ($pyenvs -eq $null) { - Retry "python build_wheel.py $ARGS" 2 + Invoke-Expression "python build_wheel.py $ARGS" } else { Foreach ($env in $pyenvs.Split(",")) { Invoke-Expression "conda activate $env" - Retry "python build_wheel.py $ARGS" 2 - Invoke-Expression "conda deactivate" + Invoke-Expression "python build_wheel.py $ARGS" + conda deactivate + if ($LastExitCode -ne 0) { + exit 1 + } } } diff --git a/project/ios/MNN.xcodeproj/project.pbxproj b/project/ios/MNN.xcodeproj/project.pbxproj index e7c6a477..69882388 100644 --- a/project/ios/MNN.xcodeproj/project.pbxproj +++ b/project/ios/MNN.xcodeproj/project.pbxproj @@ -748,6 +748,7 @@ EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38924643D310062C7A3 /* Arm82Backend.cpp */; }; EBECA3A124643D4E0062C7A3 /* MNNAsmGlobal.h in Headers */ = {isa = PBXBuildFile; fileRef = EBECA3A024643D4E0062C7A3 /* MNNAsmGlobal.h */; }; EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */ = {isa = PBXBuildFile; fileRef = EBECA3A324643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + F41497D7278D8A21004A363A /* RuntimeAttr.hpp in Headers */ = {isa = PBXBuildFile; fileRef = F41497D6278D8A21004A363A /* RuntimeAttr.hpp */; }; F4FB5AD7274E6CC100EAF0C1 /* MNNAESCipher.h in Headers */ = {isa = PBXBuildFile; fileRef = F4FB5AAF274E6CC100EAF0C1 /* MNNAESCipher.h */; }; F4FB5AD8274E6CC100EAF0C1 /* ModelAuth.mm in Sources */ = {isa = PBXBuildFile; fileRef = F4FB5AB0274E6CC100EAF0C1 /* ModelAuth.mm */; }; F4FB5AD9274E6CC100EAF0C1 /* MNNAESCipher.m in Sources */ = {isa = PBXBuildFile; fileRef = F4FB5AB1274E6CC100EAF0C1 /* MNNAESCipher.m */; }; @@ -1542,6 +1543,7 @@ EBECA38924643D310062C7A3 /* Arm82Backend.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Backend.cpp; path = ../arm82/Arm82Backend.cpp; sourceTree = ""; }; EBECA3A024643D4E0062C7A3 /* MNNAsmGlobal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = MNNAsmGlobal.h; path = ../arm82/asm/MNNAsmGlobal.h; sourceTree = ""; }; EBECA3A324643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNQuantizeFP16_UNIT4.S; path = ../arm82/asm/arm64/MNNQuantizeFP16_UNIT4.S; sourceTree = ""; }; + F41497D6278D8A21004A363A /* RuntimeAttr.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = RuntimeAttr.hpp; sourceTree = ""; }; F4FB5AAF274E6CC100EAF0C1 /* MNNAESCipher.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MNNAESCipher.h; sourceTree = ""; }; F4FB5AB0274E6CC100EAF0C1 /* ModelAuth.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = ModelAuth.mm; sourceTree = ""; }; F4FB5AB1274E6CC100EAF0C1 /* MNNAESCipher.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = MNNAESCipher.m; sourceTree = ""; }; @@ -1679,6 +1681,7 @@ 48593FB423A89B2F0069452A /* express */ = { isa = PBXGroup; children = ( + F41497D6278D8A21004A363A /* RuntimeAttr.hpp */, 489D7AC42550FF9F00AD896A /* ExecutorScope.cpp */, 48C84B6F250F711600EE7666 /* module */, 48FA474C23AA136300172C3B /* MergeOptimizer.cpp */, @@ -2951,6 +2954,7 @@ 92FF04B423AA0BFB00AC97F6 /* MNNMemoryUtils.h in Headers */, 
4D9A937426255BDA00F9B43C /* CoreMLReduction.hpp in Headers */, 48C84B8B250F711700EE7666 /* PipelineModule.hpp in Headers */, + F41497D7278D8A21004A363A /* RuntimeAttr.hpp in Headers */, 92FF04C123AA0BFB00AC97F6 /* Backend.hpp in Headers */, 489D7A812550FDC900AD896A /* MetalPooling.hpp in Headers */, 92FF02A623AA0B5A00AC97F6 /* CPUQuantizedMaxPool.hpp in Headers */, @@ -2985,6 +2989,7 @@ buildConfigurationList = 0F1465BF1FA18D1000F9860A /* Build configuration list for PBXNativeTarget "MNN" */; buildPhases = ( 0F1465B41FA18D1000F9860A /* Headers */, + F48DED4627742886004B8DB0 /* ShellScript */, 0F1465B21FA18D1000F9860A /* Sources */, 0F1465B31FA18D1000F9860A /* Frameworks */, 0F1465B51FA18D1000F9860A /* Resources */, @@ -3091,6 +3096,23 @@ shellPath = /bin/sh; shellScript = "\necho \"==========\"\necho ${TARGET_NAME}\necho ${PROJECT_FILE_PATH}\necho ${TARGET_BUILD_DIR}\n\ntouch ${TARGET_BUILD_DIR}/MNN.framework/mnn.metallib\ncp ${TARGET_BUILD_DIR}/MNN.framework/mnn.metallib ${TARGET_BUILD_DIR}/Playground.app/\n"; }; + F48DED4627742886004B8DB0 /* ShellScript */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + inputFileListPaths = ( + ); + inputPaths = ( + ); + outputFileListPaths = ( + ); + outputPaths = ( + ); + runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; + shellScript = "# Type a script or drag a script file from your workspace to insert its path.\nMNN_REVISION=`git rev-parse HEAD`\necho \"#define MNN_REVISION \\\"${MNN_REVISION}\\\"\" > ${SRCROOT}/../../include/MNN/VCS.h\n"; + }; /* End PBXShellScriptBuildPhase section */ /* Begin PBXSourcesBuildPhase section */ @@ -3808,7 +3830,7 @@ CODE_SIGN_STYLE = Automatic; DEAD_CODE_STRIPPING = YES; DEFINES_MODULE = YES; - DEVELOPMENT_TEAM = 6T3QR3X696; + DEVELOPMENT_TEAM = UMNWSVYR5X; DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; @@ -3854,7 +3876,7 @@ METAL_LIBRARY_FILE_BASE = mnn; ONLY_ACTIVE_ARCH = YES; OTHER_CFLAGS = ""; - PRODUCT_BUNDLE_IDENTIFIER = com.zhaodewang.MNN.yyavdsavds; + PRODUCT_BUNDLE_IDENTIFIER = com.alibaba.MNN; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; PROVISIONING_PROFILE_SPECIFIER = ""; "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = ""; @@ -3875,7 +3897,7 @@ CODE_SIGN_STYLE = Automatic; DEAD_CODE_STRIPPING = YES; DEFINES_MODULE = YES; - DEVELOPMENT_TEAM = 6G7464HHUS; + DEVELOPMENT_TEAM = UMNWSVYR5X; DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; @@ -3919,7 +3941,7 @@ MACH_O_TYPE = staticlib; METAL_LIBRARY_FILE_BASE = mnn; OTHER_CFLAGS = ""; - PRODUCT_BUNDLE_IDENTIFIER = com.zhaodewang.MNN.yyavdsavds; + PRODUCT_BUNDLE_IDENTIFIER = com.alibaba.MNN; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; PROVISIONING_PROFILE_SPECIFIER = ""; "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = ""; @@ -3938,7 +3960,7 @@ ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage; CODE_SIGN_STYLE = Automatic; - DEVELOPMENT_TEAM = 6G7464HHUS; + DEVELOPMENT_TEAM = UMNWSVYR5X; GCC_ENABLE_CPP_EXCEPTIONS = NO; GCC_ENABLE_CPP_RTTI = NO; HEADER_SEARCH_PATHS = ( @@ -3963,7 +3985,7 @@ ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage; CODE_SIGN_STYLE = Automatic; - DEVELOPMENT_TEAM = 6G7464HHUS; + DEVELOPMENT_TEAM = UMNWSVYR5X; GCC_ENABLE_CPP_EXCEPTIONS = NO; GCC_ENABLE_CPP_RTTI = NO; HEADER_SEARCH_PATHS = ( diff --git a/project/ios/Playground/AppDelegate.mm b/project/ios/Playground/AppDelegate.mm index 
b0b37695..f01ffb6e 100644 --- a/project/ios/Playground/AppDelegate.mm +++ b/project/ios/Playground/AppDelegate.mm @@ -9,37 +9,50 @@ #import "AppDelegate.h" #import "MNNTestSuite.h" #include +#include #import #import "benchmark.h" @implementation AppDelegate - (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions { -#define UNITTEST -#ifdef UNITTEST - // unittest - { - MNN::BackendConfig config; - // If want to test metal, change MNN_FORWARD_CPU to MNN_FORWARD_METAL - MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 1); - int precisionInTestUtil = - getTestPrecision(MNN_FORWARD_CPU, config.precision, MNN::Express::Executor::getGlobalExecutor()->getCurrentRuntimeStatus(MNN::STATUS_SUPPORT_FP16)); - MNNTestSuite::runAll(precisionInTestUtil); - } -#endif -#ifdef BENCHMARK - // benchmark - { - auto bundle = CFBundleGetMainBundle(); - auto url = CFBundleCopyBundleURL(bundle); - auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle); - CFRelease(url); - auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8); - auto res = std::string(cstring) + "/models"; - CFRelease(string); - iosBenchAll(res.c_str()); - } -#endif +//#define UNITTEST +//#ifdef UNITTEST +// // unittest +// { +// MNN::BackendConfig config; +// // If want to test metal, change MNN_FORWARD_CPU to MNN_FORWARD_METAL +// MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 1); +// int precisionInTestUtil = +// getTestPrecision(MNN_FORWARD_CPU, config.precision, MNN::Express::Executor::getGlobalExecutor()->getCurrentRuntimeStatus(MNN::STATUS_SUPPORT_FP16)); +// MNNTestSuite::runAll(precisionInTestUtil); +// } +//#endif +//#ifdef BENCHMARK +// // benchmark +// { +// auto bundle = CFBundleGetMainBundle(); +// auto url = CFBundleCopyBundleURL(bundle); +// auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle); +// CFRelease(url); +// auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8); +// auto res = std::string(cstring) + "/models"; +// CFRelease(string); +// iosBenchAll(res.c_str()); +// } +//#endif + auto bundle = CFBundleGetMainBundle(); + auto url = CFBundleCopyBundleURL(bundle); + auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle); + CFRelease(url); + auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8); + auto res = std::string(cstring) + "/models/mobilenet_v2_auth.mnn"; + + + MNN::Interpreter* interpreter = MNN::Interpreter::createFromFile(res.c_str()); + MNN::ScheduleConfig config; + interpreter->createSession(config); + return YES; } diff --git a/pymnn/CMakeLists.txt b/pymnn/CMakeLists.txt index daf811c6..95bfbc1b 100644 --- a/pymnn/CMakeLists.txt +++ b/pymnn/CMakeLists.txt @@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.4.1) project(mnnpybridge) +# python_path / numpy_path / mnn_path option(DEPEND_AAPL_FMWK "use dependency library .framework instead of traditional .a/.dylib" OFF) option(MNN_BUILD_SHARED_LIBS "MNN build shared or static lib" ON) option(MNN_WIN_RUNTIME_MT "MNN use /MT on Windows dll" OFF) @@ -12,8 +13,17 @@ option(PYMNN_NEW_PYTHON "AliNNPython new version (when PYMNN_RUNTIME_CHECK_VM=OF option(PYMNN_EXPR_API "MNN expr API be exposed" ON) option(PYMNN_NUMPY_USABLE "Build based on numpy" ON) option(PYMNN_TRAIN_API "MNN train API be exposed" OFF) +option(PYMNN_INTERNAL_SERVING "Internal use only." 
OFF) + +if(PYMNN_INTERNAL_SERVING) + file(GLOB_RECURSE SRC ${CMAKE_CURRENT_LIST_DIR}/src/MNN.cc + ${CMAKE_CURRENT_LIST_DIR}/src/internal/monitor_service.cc + ${CMAKE_CURRENT_LIST_DIR}/src/internal/verify_service.cc + ${CMAKE_CURRENT_LIST_DIR}/src/internal/http_util.cc) +else() + file(GLOB_RECURSE SRC ${CMAKE_CURRENT_LIST_DIR}/src/MNN.cc) +endif() -file(GLOB_RECURSE SRC ${CMAKE_CURRENT_LIST_DIR}/src/MNN.cc) if (MNN_BUILD_SHARED_LIBS) add_library(mnnpybridge SHARED ${SRC}) else() @@ -39,6 +49,11 @@ if(PYMNN_TRAIN_API) target_compile_definitions(mnnpybridge PRIVATE PYMNN_TRAIN_API) endif() +if(PYMNN_INTERNAL_SERVING) + message(STATUS "mnnpybridge define PYMNN_INTERNAL_SERVING") + target_compile_definitions(mnnpybridge PRIVATE PYMNN_INTERNAL_SERVING) +endif() + if(CMAKE_SYSTEM_NAME MATCHES "^Android") add_definitions(-DMNN_USE_LOGCAT) endif() @@ -59,8 +74,8 @@ if(MSVC) endif() endif () endforeach() - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4005 /wd4267") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4005 /wd4267") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4005 /wd4267 /experimental:preprocessor") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4005 /wd4267 /experimental:preprocessor") SET(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF") SET(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF") SET(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") @@ -73,20 +88,24 @@ endif() if(PYMNN_TRAIN_API) set(MNN_DIR ${CMAKE_CURRENT_LIST_DIR}/..) target_include_directories(mnnpybridge PRIVATE - ${MNN_DIR}/tools/train/source/grad ${MNN_DIR}/tools/train/source/optimizer ${MNN_DIR}/tools/train/source/transformer - ${MNN_DIR}/tools/train/source/data ${MNN_DIR}/schema/current ${MNN_DIR}/3rd_party/flatbuffers/include) + ${MNN_DIR}/tools/train/source/grad ${MNN_DIR}/tools/train/source/optimizer ${MNN_DIR}/tools/train/source/transformer ${MNN_DIR}/tools/train/source/nn + ${MNN_DIR}/tools/train/source/data ${MNN_DIR}/schema/current ${MNN_DIR}/3rd_party/flatbuffers/include ${MNN_DIR}/tools/cv/include + ${MNN_DIR}/express ${MNN_DIR}/express/module ${MNN_DIR}/tools) endif() if(WIN32 OR APPLE OR CMAKE_SYSTEM_NAME MATCHES "^Linux") set(DEPEND_PATH "${CMAKE_CURRENT_LIST_DIR}/3rd_party") set(LIB_SUBPATH "") if(WIN32) - if(NOT MNN_BUILD_SHARED_LIBS) - set(LIB_SUBPATH "Static") - elseif(MNN_WIN_RUNTIME_MT) - set(LIB_SUBPATH "MT") + if (MNN_BUILD_SHARED_LIBS) + set(LIB_SUBPATH "Dynamic") else() - set(LIB_SUBPATH "MD") + set(LIB_SUBPATH "Static") + endif() + if (MNN_WIN_RUNTIME_MT) + set(LIB_SUBPATH "${LIB_SUBPATH}/MT") + else() + set(LIB_SUBPATH "${LIB_SUBPATH}/MD") endif() elseif(APPLE) if(MNN_BUILD_SHARED_LIBS) @@ -108,34 +127,23 @@ if(WIN32 OR APPLE OR CMAKE_SYSTEM_NAME MATCHES "^Linux") endif() endif() - target_include_directories(mnnpybridge PRIVATE ${CMAKE_CURRENT_LIST_DIR}/src ${DEPEND_PATH}/MNN/include) - target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/MNN/lib/${LIB_SUBPATH}) - if(APPLE AND DEPEND_AAPL_FMWK) - target_link_libraries(mnnpybridge PRIVATE "-framework MNN") - set_target_properties(mnnpybridge PROPERTIES LINK_FLAGS "-Wl,-F${DEPEND_PATH}/MNN/lib/${LIB_SUBPATH}") - else() - target_link_libraries(mnnpybridge PRIVATE MNN) + find_library(MNN NAMES MNN REQUIRED PATHS ${mnn_path}/lib/${LIB_SUBPATH}) + if(NOT DEPEND_AAPL_FMWK) + target_include_directories(mnnpybridge PUBLIC ${mnn_path}/include) endif() + target_link_libraries(mnnpybridge PUBLIC ${MNN}) - if(PYMNN_USE_ALINNPYTHON) - 
target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/include) - target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/lib/${LIB_SUBPATH}) - if(APPLE AND DEPEND_AAPL_FMWK) - target_link_libraries(mnnpybridge PRIVATE "-framework python") - set_target_properties(mnnpybridge PROPERTIES LINK_FLAGS "-Wl,-F${DEPEND_PATH}/AliNNPython/lib/${LIB_SUBPATH}") - else() - target_link_libraries(mnnpybridge PRIVATE python) - endif() + find_library(python NAMES python REQUIRED PATHS ${python_path}/lib/${LIB_SUBPATH}) + if(NOT DEPEND_AAPL_FMWK) + target_include_directories(mnnpybridge PUBLIC ${python_path}/include) endif() + target_link_libraries(mnnpybridge PUBLIC ${python}) if(PYMNN_NUMPY_USABLE) - target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/numpy/include) - target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/numpy/lib/${LIB_SUBPATH}) - if(APPLE AND DEPEND_AAPL_FMWK) - target_link_libraries(mnnpybridge PRIVATE "-framework numpy_python") - set_target_properties(mnnpybridge PROPERTIES LINK_FLAGS "-Wl,-F${DEPEND_PATH}/numpy/lib/${LIB_SUBPATH}") - else() - target_link_libraries(mnnpybridge PRIVATE numpy_python) + find_library(numpy NAMES numpy_python REQUIRED PATHS ${numpy_path}/lib/${LIB_SUBPATH}) + if(NOT DEPEND_AAPL_FMWK) + target_include_directories(mnnpybridge PUBLIC ${numpy_path}/include) endif() + target_link_libraries(mnnpybridge PUBLIC ${numpy}) endif() else() target_include_directories(mnnpybridge PRIVATE ${MNN_DIR}/pymnn/src ${MNN_DIR}/pymnn/android/src/main/c/include) diff --git a/pymnn/examples/MNNEngineDemo/mobilenet_demo.py b/pymnn/examples/MNNEngineDemo/mobilenet_demo.py index bd1e5acc..067cbca9 100644 --- a/pymnn/examples/MNNEngineDemo/mobilenet_demo.py +++ b/pymnn/examples/MNNEngineDemo/mobilenet_demo.py @@ -13,7 +13,7 @@ def inference(): config['precision'] = 'low' session = interpreter.createSession() input_tensor = interpreter.getSessionInput(session) - image = cv2.imread('ILSVRC2012_val_00049999.JPEG') + image = cv2.imread('0000.jpg') #cv2 read as bgr format image = image[..., ::-1] #change to rgb format diff --git a/pymnn/pip_package/MNN/cv/__init__.py b/pymnn/pip_package/MNN/cv/__init__.py index 219abdfd..0eb2a512 100644 --- a/pymnn/pip_package/MNN/cv/__init__.py +++ b/pymnn/pip_package/MNN/cv/__init__.py @@ -1,11 +1,22 @@ from _mnncengine.cv import * +import _mnncengine.cv as _F import MNN.numpy as _np +import MNN def __to_int(x): dtype = x.dtype if dtype == _np.int32: return x return x.astype(_np.int32) +def resize(src, dsize=None, fx=None, fy=None, interpolation=INTER_LINEAR, code = None, mean=[], norm=[]): + if dsize is None and fx is None and fy is None: + raise ValueError('resize must set dsize or fx,fy.') + if dsize is None: dsize = [0, 0] + if fx is None: fx = 0 + if fy is None: fy = 0 + if code is None: code = -1 + else: code = hash(code) + return _F.resize(src, dsize, fx, fy, interpolation, code, mean, norm) def copyTo(src, mask=None, dst=None): if mask is None: return src.copy() origin_dtype = src.dtype @@ -45,3 +56,33 @@ def hconcat(src): return _np.concatenate(src, 1) def vconcat(src): return _np.concatenate(src, 0) +def mean(src, mask=None): + if mask is not None: + src = copyTo(src, mask) + res = _np.mean(src, [0, 1]) + if res.ndim == 0: size = 0 + else: size = res.shape[0] + if size < 4: + res = _np.pad(res, [0, 4 - size]) + return res +def flip(src, flipCode): + h, w, c = src.shape + m = MNN.CVMatrix() + if flipCode < 0: + m.write([-1., 0., w-1., 0., -1., h-1.]) + elif flipCode == 0: + m.write([1., 0., 0.,
0., -1., h-1.]) + else: + m.write([-1., 0., w-1., 0., 1., 0.]) + return warpAffine(src, m, [w, h]) +ROTATE_90_CLOCKWISE = 0 +ROTATE_180 = 1 +ROTATE_90_COUNTERCLOCKWISE = 2 +def rotate(src, rotateMode): + if rotateMode == ROTATE_90_CLOCKWISE: + return flip(src.transpose([1, 0, 2]), 1) + if rotateMode == ROTATE_180: + return flip(src, -1) + if rotateMode == ROTATE_90_COUNTERCLOCKWISE: + return flip(src.transpose([1, 0, 2]), 0) + return src diff --git a/pymnn/pip_package/MNN/expr/__init__.py b/pymnn/pip_package/MNN/expr/__init__.py index 03fab6d2..b2bc702c 100644 --- a/pymnn/pip_package/MNN/expr/__init__.py +++ b/pymnn/pip_package/MNN/expr/__init__.py @@ -9,23 +9,26 @@ import _mnncengine._expr as _F _numpy_supported = False try: import numpy as np - _numpy_supported = True + _numpy_supported = (type(np.arange(10)) == np.ndarray) except Exception: print ("Numpy not found. Using MNN without numpy.") + def scalar(value, dtype=None): - if dtype == _F.int: - value = _Int(value) - elif dtype == _F.float: - value = _Float(value) + if dtype is not None: + if dtype == _F.int or dtype == _F.uint8: + value = _Int(value) + elif dtype == _F.float: + value = _Float(value) + return _F.const([value], [], _F.NCHW, dtype) if type(value) == type(1): - res = _F.const([value], [], _F.NCHW, _F.int) - return res + return _F.const([value], [], _F.NCHW, _F.int) elif type(value) == type(1.): - res = _F.const([value], [], _F.NCHW, _F.float) - return res + return _F.const([value], [], _F.NCHW, _F.float) else: raise NotImplementedError("not supported data type for creating scalar variable") def _list_shape_type(object, shape=()): + if isinstance(object, _Sequence) and len(object) == 0: + return [0], _F.float if not isinstance(object, _Sequence): if type(object) in (type(1), type(1<<64)): dst_type = _F.int @@ -54,6 +57,7 @@ def _can_broadcast(src_shape, dst_shape): return True def _match_dtype(x, y, dtype=None): def type_val(x): + if x is None: return -1 if x == _F.double: return 4 if x == _F.float: return 3 if x == _F.int64: return 2 @@ -76,15 +80,18 @@ def _to_var(x, dtype=None): return scalar(x, dtype) # 2. numpy if _numpy_supported: - if isinstance(x, np.ndarray): # convert numpy ndarray to MNN var - if x.dtype.kind == 'i': - x = x.astype(np.int32) - x = _F.const(x, x.shape, dtype=_F.int) - elif x.dtype.kine == 'f': - x = x.astype(np.float32) - x = _F.const(x, x.shape, dtype=_F.float) - else: - raise ValueError('Just support i/f dtype numpy.') + try: + if isinstance(x, np.ndarray): # convert numpy ndarray to MNN var + if x.dtype.kind == 'i': + x = x.astype(np.int32) + x = _F.const(x, x.shape, dtype=_F.int) + elif x.dtype.kind == 'f': + x = x.astype(np.float32) + x = _F.const(x, x.shape, dtype=_F.float) + else: + raise ValueError('Just support i/f dtype numpy.') + except: + pass # 3. 
Sequence if isinstance(x, _Sequence) and x: dst_shape, item_type = _list_shape_type(x) @@ -202,7 +209,7 @@ def floor(x): >>> expr.floor([-5.1, 4.5]) var([-6., 4.]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.floor(x) def round(x): ''' @@ -223,7 +230,7 @@ def round(x): >>> expr.round([-5.1, 4.5]) var([-5., 5.]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.round(x) def ceil(x): ''' @@ -243,7 +250,7 @@ def ceil(x): >>> expr.ceil([-4.9, 4.5]) var([-4., 5.]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.ceil(x) def square(x): ''' @@ -283,7 +290,7 @@ def sqrt(x): >>> expr.sqrt([9., 4.5]) var([3., 2.1213202]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.sqrt(x) def rsqrt(x): ''' @@ -303,7 +310,7 @@ def rsqrt(x): >>> expr.rsqrt([9., 4.5]) var([0.33333334, 0.47140455]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.rsqrt(x) def exp(x): ''' @@ -323,7 +330,7 @@ def exp(x): >>> expr.exp([9., 4.5]) var([8102.449, 90.01698]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.exp(x) def log(x): ''' @@ -343,7 +350,7 @@ def log(x): >>> expr.log([9., 4.5]) var([2.1972246, 1.5040774]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.log(x) def sin(x): ''' @@ -363,7 +370,7 @@ def sin(x): >>> expr.sin([9., 4.5]) var([0.4121185, -0.9775301]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.sin(x) def sinh(x): ''' @@ -384,7 +391,7 @@ def sinh(x): >>> expr.sinh([9., 4.5]) var([4051.542, 45.00301]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.sinh(x) def cos(x): ''' @@ -404,7 +411,7 @@ def cos(x): >>> expr.cos([9., 4.5]) var([-0.91113025, -0.2107958]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.cos(x) def cosh(x): ''' @@ -425,7 +432,7 @@ def cosh(x): >>> expr.cosh([9., 4.5]) var([4051.542, 45.014122]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.cosh(x) def tan(x): ''' @@ -445,7 +452,7 @@ def tan(x): >>> expr.tan([9., 4.5]) var([-0.45231566, 4.637332]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.tan(x) def tanh(x): ''' @@ -466,7 +473,7 @@ def tanh(x): >>> expr.tanh([9., 4.5]) var([1., 0.9997533]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.tanh(x) def asin(x): ''' @@ -487,7 +494,7 @@ def asin(x): >>> expr.asin([9., 0.5]) var([nan, 0.5235988]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.asin(x) def asinh(x): ''' @@ -508,7 +515,7 @@ def asinh(x): >>> expr.asinh([9., 0.5]) var([2.893444, 0.4812118]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.asinh(x) def acos(x): ''' @@ -529,7 +536,7 @@ def acos(x): >>> expr.asin([9., 0.5]) var([nan, 1.0471975]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.acos(x) def acosh(x): ''' @@ -550,7 +557,7 @@ def acosh(x): >>> expr.acosh([9., 0.5]) var([2.887271, nan]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.acosh(x) def atan(x): ''' @@ -571,7 +578,7 @@ def atan(x): >>> expr.atan([9., 0.5]) var([1.4601392, 0.4636476]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.atan(x) def atanh(x): ''' @@ -592,7 +599,7 @@ def atanh(x): >>> expr.atanh([9., 0.5]) var([1.4601392, 0.4636476]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.atanh(x) def reciprocal(x): ''' @@ -612,7 +619,7 @@ def reciprocal(x): >>> expr.reciprocal([9., 0.5]) var([0.11111111, 2.]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.reciprocal(x) def log1p(x): ''' @@ -632,7 +639,7 @@ def log1p(x): >>> expr.log1p([9., 0.5]) var([2.3025851, 0.4054651]) ''' - x = _to_var(x) + x = _to_var(x, 
_F.float) return _F.log1p(x) def gelu(x): ''' @@ -652,7 +659,7 @@ def gelu(x): >>> expr.gelu([9., 0.5]) var([9., 0.345714]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.gelu(x) def sigmoid(x): ''' @@ -672,16 +679,16 @@ def sigmoid(x): >>> expr.sigmoid([9., 0.5]) var([0.9998766, 0.62246716]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.sigmoid(x) def erf(x): - x = _to_var(x) + x = _to_var(x, _F.float) return _F.erf(x) def erfc(x): - x = _to_var(x) + x = _to_var(x, _F.float) return _F.erfc(x) def erfinv(x): - x = _to_var(x) + x = _to_var(x, _F.float) return _F.erfinv(x) def expm1(x): ''' @@ -701,7 +708,7 @@ def expm1(x): >>> expr.expm1([9., 0.5]) var([8.1014492e+03, 6.4869785e-01]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.expm1(x) def add(x, y): ''' @@ -1479,8 +1486,8 @@ def matmul(a, b, transposeA=False, transposeB=False): var([[0., 1.], [0., 3.]], dtype=float32) ''' - a = _to_var(a, True) - b = _to_var(b, True) + a = _to_var(a, _F.float) + b = _to_var(b, _F.float) return _F.matmul(a, b, transposeA, transposeB) def normalize(x, acrossSpatial, channelShared, eps, scale): ''' @@ -3055,7 +3062,7 @@ def zeros_like(x): Example: ------- >>> expr.zeros_like([[1, 2], [3, 4]]) - array([[0, 0], + var([[0, 0], [0, 0]], dtype=int32) ''' x = _to_var(x) @@ -3078,14 +3085,72 @@ def range(start, limit, delta): Example: ------- >>> expr.range(1.0, 7.0, 2.0) - array([1., 3., 5.], dtype=float32) + var([1., 3., 5.], dtype=float32) ''' start = _to_var(start) limit = _to_var(limit) delta = _to_var(delta) if limit.dtype != start.dtype or delta.dtype != start.dtype: - print(start, limit, delta) raise RuntimeError("parameter start/limit/delta must use same data type, either all int or all float") return _F.range(start, limit, delta) +def sort(x, axis=-1, arg=False, descend=False): + ''' + sort(x, axis=-1, arg=False, descend=False) + Return the sorted array of ``x``. + + Parameters + ---------- + x : var_like, input value. + axis : int, sort by axis. + arg : is ArgSort or not, default is False. + descend : is descend or not, default is False. + + Returns + ------- + sorted_res : Var. + + Example: + ------- + >>> expr.sort([[5, 0], [1, 3]]) + var([[1, 0], + [5, 3]], dtype=int32) + ''' + x = _to_var(x) + # sort will change the x + x = clone(x, True) + return _F.sort(x, axis, arg, descend) +def nms(boxes, scores, max_detections, iou_threshold=-1.0, score_threshold=-1.0): + ''' + nms(boxes, scores, max_detections, iou_threshold=-1.0, score_threshold=-1.0) + Return the nms array of ``boxes``. + + Parameters + ---------- + boxes : var_like, input value, shape must be [num, 4]. + scores : var_like, input value, shape must be [num]. + max_detections : int. + iou_threshold : float, default is 0. + score_threshold : float, default is float_min. + + Returns + ------- + nms_res : Var. 
+ + Example: + ------- + >>> expr.nms([[1, 1, 4, 4], [0, 0, 3, 3], [5, 5, 7, 7]], [0.9, 0.5, 0.1], 3, 0.1) + var([0, 2], dtype=int32) + ''' + boxes = _to_var(boxes, _F.float) + scores = _to_var(scores, _F.float) + max_detections = _to_int(max_detections) + iou_threshold = _to_float(iou_threshold) + score_threshold = _to_float(score_threshold) + res = _F.nms(boxes, scores, max_detections, iou_threshold, score_threshold) + idx = res >= 0 + idx.fix_as_const() + if _F.reduce_any(idx).read_as_tuple()[0] == 0: + return _F.const([], [0], NCHW, _F.int) + return res[idx] # TODO: detection_post_process -# wrapper for builtin functions end \ No newline at end of file +# wrapper for builtin functions end diff --git a/pymnn/pip_package/MNN/numpy/__init__.py b/pymnn/pip_package/MNN/numpy/__init__.py index 2dc80afd..d7aa9979 100644 --- a/pymnn/pip_package/MNN/numpy/__init__.py +++ b/pymnn/pip_package/MNN/numpy/__init__.py @@ -19,6 +19,16 @@ inf = float('inf') # helper functions def __not_impl(*args): raise NotImplementedError('MNN.numpy not implemet this function now.') +def __get_arg(kargs, key, default=None): + if key in kargs: return kargs[key] + return default +def __get_shape(args): + if type(args) not in (tuple, list): + return [args] + elif len(args) == 1 and type(args[0]) in (tuple, list): + return args[0] + else: + return args def __order_assert(order): if order is not None and order not in 'CK': raise RuntimeError("MNN.numpy just support order=\"C|K\"") @@ -89,6 +99,7 @@ def identity(n, dtype=float32): return eye(n, dtype=dtype) def full(shape, fill_value, dtype=None, order='C'): __order_assert(order) + shape = __get_shape(shape) return _F.fill(_F._to_var(shape), _F.scalar(fill_value, dtype)) def full_like(a, fill_value, dtype=None, order='K', subok=True, shape=None): dst_dtype, dst_shape = __array_like_type(a, dtype, order, shape) @@ -165,10 +176,14 @@ def __arange_3(start, stop, step=1, dtype=None): def __arange_1(stop, dtype=None): return __arange_3(0, stop, 1, dtype) def arange(*args, **kargs): - if 'dtype' in kargs: dtype=kargs['dtype'] - else: dtype = None - if len(args) == 1: + dtype = __get_arg(kargs, 'dtype') + step = __get_arg(kargs, 'step') + stop = __get_arg(kargs, 'stop') + start = __get_arg(kargs, 'start') + if len(args) == 1 and stop is None and step is None: return __arange_1(args[0], dtype) + if len(args) == 2 and step is not None: + return __arange_3(*args, step=step, dtype=dtype) if len(args) == 4: return __arange_3(*args) return __arange_3(*args, dtype=dtype) @@ -189,7 +204,26 @@ def geomspace(start, stop, num=50, endpoint=True, dtype=None, axis=0): base = pow(stop / _F._Float(start), 1./ num) start = math.log(start, base) return logspace(start, _F._Float(num), num, endpoint, base, dtype, axis) -def meshgrid(xi, copy=True, sparse=False, indexing='xy'): __not_impl() +def meshgrid(*xi, **kwargs): + copy = __get_arg(kwargs, 'copy', True) + sparse = __get_arg(kwargs, 'sparse', False) + indexing = __get_arg(kwargs, 'indexing', 'xy') + ndim = len(xi) + if indexing not in ['xy', 'ij']: + raise ValueError("Valid values for `indexing` are 'xy' and 'ij'.") + + s0 = (1,) * ndim + output = [asanyarray(x).reshape(s0[:i] + (-1,) + s0[i + 1:]) for i, x in enumerate(xi)] + if indexing == 'xy' and ndim > 1: + # switch first and second axis + output[0] = swapaxes(output[0], 0, 1) + output[1] = swapaxes(output[1], 0, 1) + if not sparse: + # Return the full N-D matrix (not only the 1-D vector) + output = broadcast_arrays(*output) + if copy: + output = [x.copy() for x in output] + return output 
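The meshgrid wrapper added above mirrors numpy.meshgrid semantics ('xy'/'ij' indexing, sparse output, optional copy). A minimal usage sketch, not part of the patch, assuming MNN.numpy's arange/meshgrid behave as wrapped here:

    import MNN.numpy as np
    x = np.arange(3)                           # values [0, 1, 2]
    y = np.arange(2)                           # values [0, 1]
    xv, yv = np.meshgrid(x, y)                 # default 'xy' indexing: both shaped (2, 3)
    xi, yi = np.meshgrid(x, y, indexing='ij')  # matrix indexing: both shaped (3, 2)
    xs, ys = np.meshgrid(x, y, sparse=True)    # broadcastable grids: shapes (1, 3) and (2, 1)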
# 4. Building matrices def diag(v, k=0):__not_impl() def diagflat(v, k=0):__not_impl() @@ -212,11 +246,11 @@ def copyto(dst, src, casting='same_kind', where=True): def shape(a): return tuple(a.shape) # 2. Changing array shape -def reshape(a, newshape, order='C'): - __order_assert(order) +def reshape(a, *newshape): + newshape = __get_shape(newshape) return _F.reshape(a, newshape) def ravel(a, order='C'): - return reshape(a, [-1], order) + return reshape(a, [-1]) # 3. Transpose-like operations def moveaxis(a, source, destination): ndim = a.ndim @@ -431,7 +465,9 @@ right_shift = packbits = unpackbits = binary_repr = base_repr = __not_impl # String operations [Not Impl] # Indexing routines # 1. Generating index arrays -def where(condition, x, y): +def where(condition, x=None, y=None): + if x is None and y is None: + return nonzero(condition) return _F.select(condition, x, y) def indices(dimensions, dtype=int32, sparse=False):__not_impl() def ix_(*args):__not_impl() @@ -546,6 +582,7 @@ arccosh = _F.acosh arctanh = _F.atanh around = _F.round round_ = _F.round +round = _F.round rint = _F.round fix = _F.round floor = _F.floor @@ -685,9 +722,12 @@ def pad(array, pad_width, mode='constant'): return _F.pad(array, pad_width, mode) # Sorting, searching, and counting # 1. Sorting -def sort(a, axis=- 1, kind=None, order=None):__not_impl() -def lexsort(keys, axis=-1):__not_impl() -def argsort(a, axis=-1, kind=None, order=None): __not_impl() +def sort(a, axis=- 1, kind=None, order=None): + return _F.sort(a, axis) +def lexsort(keys, axis=-1): + return sort(keys, axis) +def argsort(a, axis=-1, kind=None, order=None): + return _F.sort(a, axis, True) def msort(a): return sort(a, axis=0) def sort_complex(a): __not_impl() def partition(a, kth, axis=- 1, kind='introselect', order=None): __not_impl() @@ -704,6 +744,7 @@ def argwhere(a): mask = not_equal(a, _F.scalar(0, a.dtype)) return _F.where(mask) def nonzero(a): + res = _F.where(a) res = argwhere(a) if a.ndim == 1: return (ravel(res),) @@ -762,6 +803,13 @@ corrcoef = correlate = cov = __not_impl histogram = histogram2d = histogramdd = bincount = histogram_bin_edges = digitize = __not_impl # numpy ndarray functions +def __item(self, idx): + if type(idx) == type(1): + return ravel(self)[idx] + elif type(idx) == tuple: + return self[idx] + else: + raise ValueError('item arg must be int or tuple.') __override_operator(_F.Var, "all", all) __override_operator(_F.Var, "any", any) __override_operator(_F.Var, "argmax", argmax) @@ -793,6 +841,7 @@ __override_operator(_F.Var, "sum", sum) __override_operator(_F.Var, "swapaxes", swapaxes) __override_operator(_F.Var, "transpose", transpose) __override_operator(_F.Var, "var", var) +__override_operator(_F.Var, "item", __item) from . import random -from . import linalg \ No newline at end of file +from . 
import linalg diff --git a/pymnn/pip_package/build_deps.py b/pymnn/pip_package/build_deps.py index 809dda51..ac2b9cbe 100644 --- a/pymnn/pip_package/build_deps.py +++ b/pymnn/pip_package/build_deps.py @@ -15,6 +15,10 @@ USE_TRT=False if len(sys.argv) > 1 and sys.argv[1] == '-trt': USE_TRT=True +IS_INTERNAL_BUILD = False +if os.path.isdir('../../schema/private'): + IS_INTERNAL_BUILD = True + def build_deps(): """ build depency """ root_dir = os.path.dirname(os.path.dirname(os.getcwd())) @@ -31,15 +35,16 @@ def build_deps(): elif IS_LINUX: extra_opts = '-DMNN_TENSORRT=ON \ -DCMAKE_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/ ' if USE_TRT else ' ' + extra_opts += ' -DMNN_INTERNAL=ON ' if IS_INTERNAL_BUILD else ' ' os.system('cmake ' + extra_opts + '-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release\ -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \ - -DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF .. && make MNN MNNTrain MNNConvert MNNOpenCV -j4') + -DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF .. && make MNN MNNTrain MNNConvert -j4') else: os.system('cmake -DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release\ -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_EXPR_SHAPE_EAGER=ON -DMNN_TRAIN_DEBUG=ON\ -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \ - .. && make MNN MNNTrain MNNConvert MNNOpenCV -j4') + .. && make MNN MNNTrain MNNConvert -j4') ################################################################################ # Building dependent libraries ################################################################################ diff --git a/pymnn/pip_package/build_wheel.py b/pymnn/pip_package/build_wheel.py index adc1596d..d028c811 100644 --- a/pymnn/pip_package/build_wheel.py +++ b/pymnn/pip_package/build_wheel.py @@ -8,6 +8,10 @@ parser.add_argument('--x86', dest='x86', action='store_true', default=False, help='build wheel for 32bit arch, only usable on windows') parser.add_argument('--version', dest='version', type=str, required=True, help='MNN dist version') +parser.add_argument('--serving', dest='serving', action='store_true', default=False, + help='build for internal serving, default False') +parser.add_argument('--env', dest='env', type=str, required=False, + help='build environment, e.g. :daily/pre/production') args = parser.parse_args() import os @@ -23,6 +27,8 @@ if __name__ == '__main__': comm_args = '--version ' + args.version if IS_LINUX: comm_args += ' --plat-name=manylinux1_x86_64' + comm_args += ' --env ' + args.env if args.env else '' + comm_args += ' --serving' if args.serving else '' if IS_WINDOWS: os.putenv('DISTUTILS_USE_SDK', '1') os.putenv('MSSdk', '1') diff --git a/pymnn/pip_package/setup.py b/pymnn/pip_package/setup.py index 0aadd7c5..20d3b071 100644 --- a/pymnn/pip_package/setup.py +++ b/pymnn/pip_package/setup.py @@ -10,6 +10,10 @@ parser.add_argument('--x86', dest='x86', action='store_true', default=False, help='build wheel for 32bit arch, only usable on windows') parser.add_argument('--version', dest='version', type=str, required=True, help='MNN dist version') +parser.add_argument('--serving', dest='serving', action='store_true', default=False, + help='build for internal serving, default False') +parser.add_argument('--env', dest='env', type=str, required=False, + help='build environment, e.g. 
:daily/pre/production') args, unknown = parser.parse_known_args() sys.argv = [sys.argv[0]] + unknown @@ -27,7 +31,7 @@ IS_WINDOWS = (platform.system() == 'Windows') IS_DARWIN = (platform.system() == 'Darwin') IS_LINUX = (platform.system() == 'Linux') BUILD_DIR = 'pymnn_build' -BUILD_TYPE = 'RELEASE' +BUILD_TYPE = 'REL_WITH_DEB_INFO' BUILD_ARCH = 'x64' if args.x86: BUILD_ARCH = '' @@ -42,10 +46,12 @@ def report(*args): package_name = 'MNN' USE_TRT=check_env_flag('USE_TRT') +IS_INTERNAL_BUILD = False print ("USE_TRT ", USE_TRT) if os.path.isdir('../../schema/private'): + IS_INTERNAL_BUILD = True if USE_TRT: print("Build Internal NNN with TRT") package_name = 'MNN_Internal_TRT' @@ -81,16 +87,19 @@ def configure_extension_build(): # extra_link_args = ['/NODEFAULTLIB:LIBCMT.LIB'] # /MD links against DLL runtime # and matches the flags set for protobuf and ONNX - # /Z7 turns on symbolic debugging information in .obj files + # /Zi turns on symbolic debugging information in separate .pdb (which is same as MNN.pdb) # /EHa is about native C++ catch support for asynchronous # structured exception handling (SEH) # /DNOMINMAX removes builtin min/max functions # /wdXXXX disables warning no. XXXX - extra_compile_args = ['/MT', '/Z7', + # Some macro (related with __VA_ARGS__) defined in pymnn/src/util.h can not be process correctly + # becase of MSVC bug, enable /experimental:preprocessor fix it (And Windows SDK >= 10.0.18362.1) + extra_compile_args = ['/MT', '/Zi', '/EHa', '/DNOMINMAX', '/wd4267', '/wd4251', '/wd4522', '/wd4522', '/wd4838', '/wd4305', '/wd4244', '/wd4190', '/wd4101', '/wd4996', - '/wd4275'] + '/wd4275', '/experimental:preprocessor'] + extra_link_args = [] else: extra_link_args = [] extra_compile_args = [ @@ -115,7 +124,11 @@ def configure_extension_build(): ] if check_env_flag('WERROR'): extra_compile_args.append('-Werror') - extra_compile_args += ['-DPYMNN_EXPR_API', '-DPYMNN_NUMPY_USABLE', '-DPYMNN_OPENCV_API', '-DPYMNN_IMGCODECS'] + extra_compile_args += ['-DPYMNN_EXPR_API', '-DPYMNN_NUMPY_USABLE', '-DPYMNN_OPENCV_API', '-DPYMNN_IMGCODECS'] + if IS_LINUX and IS_INTERNAL_BUILD and args.serving: + extra_compile_args += ['-DPYMNN_INTERNAL_SERVING'] + if args.env == 'daily': + extra_compile_args += ['-DPYMNN_INTERNAL_SERVING_DAILY'] root_dir = os.getenv('PROJECT_ROOT', os.path.dirname(os.path.dirname(os.getcwd()))) engine_compile_args = ['-DBUILD_OPTYPE', '-DPYMNN_TRAIN_API'] engine_libraries = [] @@ -123,13 +136,21 @@ def configure_extension_build(): engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "train")] engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "cv")] engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "tensorrt")] - print(engine_library_dirs) if USE_TRT: # Note: TensorRT-5.1.5.0/lib should be set in $LIBRARY_PATH of the build system. engine_library_dirs += ['/usr/local/cuda/lib64/'] + # Logging is enabled on Linux. Add the dependencies. 
+ if IS_LINUX and IS_INTERNAL_BUILD: + engine_library_dirs += ['/usr/include/curl/'] + + print(engine_library_dirs) engine_link_args = [] engine_sources = [os.path.join(root_dir, "pymnn", "src", "MNN.cc")] + if IS_LINUX and IS_INTERNAL_BUILD and args.serving: + engine_sources += [os.path.join(root_dir, "pymnn", "src", "internal", "monitor_service.cc")] + engine_sources += [os.path.join(root_dir, "pymnn", "src", "internal", "verify_service.cc")] + engine_sources += [os.path.join(root_dir, "pymnn", "src", "internal", "http_util.cc")] engine_include_dirs = [os.path.join(root_dir, "include")] engine_include_dirs += [os.path.join(root_dir, "express")] engine_include_dirs += [os.path.join(root_dir, "express", "module")] @@ -146,13 +167,19 @@ def configure_extension_build(): engine_include_dirs += [os.path.join(root_dir, "schema", "current")] engine_include_dirs += [os.path.join(root_dir, "3rd_party",\ "flatbuffers", "include")] + if IS_LINUX and IS_INTERNAL_BUILD and args.serving: + engine_include_dirs += [os.path.join(root_dir, "3rd_party", "rapidjson")] # cv include engine_include_dirs += [os.path.join(root_dir, "tools", "cv", "include")] engine_include_dirs += [np.get_include()] trt_depend = ['-lTRT_CUDA_PLUGIN', '-lnvinfer', '-lnvparsers', '-lnvinfer_plugin', '-lcudart'] engine_depend = ['-lMNN'] - engine_depend = ['-lMNN', '-lMNNOpenCV'] + + # enable logging & model authentication on linux. + if IS_LINUX and IS_INTERNAL_BUILD: + engine_depend += ['-lcurl', '-lssl', '-lcrypto'] + if USE_TRT: engine_depend += trt_depend @@ -167,6 +194,9 @@ def configure_extension_build(): # Note: TensorRT-5.1.5.0/lib should be set in $LIBRARY_PATH of the build system. tools_library_dirs += ['/usr/local/cuda/lib64/'] + if IS_LINUX and IS_INTERNAL_BUILD: + tools_library_dirs += ['/usr/include/curl/'] + tools_link_args = [] tools_sources = [os.path.join(root_dir, "pymnn", "src", "MNNTools.cc")] tools_sources += [os.path.join(root_dir, "tools", "quantization",\ @@ -195,61 +225,67 @@ def configure_extension_build(): tools_include_dirs += [os.path.join(root_dir, "source")] tools_include_dirs += [np.get_include()] + tools_depend = ['-lMNN', '-lMNNConvertDeps', '-lprotobuf'] + # enable logging and model authentication on linux. 
+ if IS_LINUX and IS_INTERNAL_BUILD: + tools_depend += ['-lcurl', '-lssl', '-lcrypto'] if USE_TRT: tools_depend += trt_depend - engine_extra_link_args = [] - tools_extra_link_args = [] if IS_DARWIN: - engine_extra_link_args += ['-Wl,-all_load'] - engine_extra_link_args += engine_depend - engine_extra_link_args += ['-Wl,-noall_load'] + engine_link_args += ['-Wl,-all_load'] + engine_link_args += engine_depend + engine_link_args += ['-Wl,-noall_load'] if IS_LINUX: - engine_extra_link_args += ['-Wl,--whole-archive'] - engine_extra_link_args += engine_depend - engine_extra_link_args += ['-fopenmp'] - engine_extra_link_args += ['-Wl,--no-whole-archive'] + engine_link_args += ['-Wl,--whole-archive'] + engine_link_args += engine_depend + engine_link_args += ['-fopenmp'] + engine_link_args += ['-Wl,--no-whole-archive'] if IS_WINDOWS: - engine_extra_link_args += ['/WHOLEARCHIVE:MNN.lib'] + engine_link_args += ['/WHOLEARCHIVE:MNN.lib'] if IS_DARWIN: - tools_extra_link_args += ['-Wl,-all_load'] - tools_extra_link_args += tools_depend - tools_extra_link_args += ['-Wl,-noall_load'] + tools_link_args += ['-Wl,-all_load'] + tools_link_args += tools_depend + tools_link_args += ['-Wl,-noall_load'] if IS_LINUX: - tools_extra_link_args += ['-Wl,--whole-archive'] - tools_extra_link_args += tools_depend - tools_extra_link_args += ['-fopenmp'] - tools_extra_link_args += ['-Wl,--no-whole-archive'] - tools_extra_link_args += ['-lz'] + tools_link_args += ['-Wl,--whole-archive'] + tools_link_args += tools_depend + tools_link_args += ['-fopenmp'] + tools_link_args += ['-Wl,--no-whole-archive'] + tools_link_args += ['-lz'] if IS_WINDOWS: - tools_extra_link_args += ['/WHOLEARCHIVE:MNN.lib'] - tools_extra_link_args += ['/WHOLEARCHIVE:MNNConvertDeps.lib'] + tools_link_args += ['/WHOLEARCHIVE:MNN.lib'] + tools_link_args += ['/WHOLEARCHIVE:MNNConvertDeps.lib'] + tools_link_args += ['libprotobuf.lib'] # use wholearchive will cause lnk1241 (version.rc specified) if BUILD_TYPE == 'DEBUG': + # Need pythonxx_d.lib, which seem not exist in miniconda ? 
if IS_WINDOWS: - extra_link_args.append('/DEBUG:FULL') + extra_compile_args += ['/DEBUG', '/UNDEBUG', '/DDEBUG', '/Od', '/Ob0', '/MTd'] + extra_link_args += ['/DEBUG', '/UNDEBUG', '/DDEBUG', '/Od', '/Ob0', '/MTd'] else: extra_compile_args += ['-O0', '-g'] extra_link_args += ['-O0', '-g'] if BUILD_TYPE == 'REL_WITH_DEB_INFO': if IS_WINDOWS: - extra_link_args.append('/DEBUG:FULL') + extra_compile_args += ['/DEBUG'] + extra_link_args += ['/DEBUG', '/OPT:REF', '/OPT:ICF'] else: extra_compile_args += ['-g'] extra_link_args += ['-g'] - +# compat with py39 def make_relative_rpath(path): """ make rpath """ if IS_DARWIN: - return '-Wl,-rpath,@loader_path/' + path + return ['-Wl,-rpath,@loader_path/' + path] elif IS_WINDOWS: - return '' + return [] else: - return '-Wl,-rpath,$ORIGIN/' + path + return ['-Wl,-rpath,$ORIGIN/' + path] ################################################################################ # Declare extensions and package @@ -263,8 +299,8 @@ def configure_extension_build(): extra_compile_args=engine_compile_args + extra_compile_args,\ include_dirs=engine_include_dirs,\ library_dirs=engine_library_dirs,\ - extra_link_args=engine_extra_link_args + engine_link_args\ - + [make_relative_rpath('lib')]) + extra_link_args=engine_link_args + extra_link_args\ + + make_relative_rpath('lib')) extensions.append(engine) tools = Extension("_tools",\ libraries=tools_libraries,\ @@ -273,8 +309,8 @@ def configure_extension_build(): extra_compile_args=tools_compile_args + extra_compile_args,\ include_dirs=tools_include_dirs,\ library_dirs=tools_library_dirs,\ - extra_link_args=tools_extra_link_args +tools_link_args\ - + [make_relative_rpath('lib')]) + extra_link_args=tools_link_args + extra_link_args\ + + make_relative_rpath('lib')) extensions.append(tools) # These extensions are built by cmake and copied manually in build_extensions() # inside the build_ext implementaiton diff --git a/pymnn/src/MNN.cc b/pymnn/src/MNN.cc index 06d91481..5805f52c 100644 --- a/pymnn/src/MNN.cc +++ b/pymnn/src/MNN.cc @@ -19,7 +19,9 @@ static int tls_key_2 = 0; #include #include using namespace MNN::Express; +#ifdef PYMNN_OPENCV_API #include "cv/cv.hpp" +#endif #endif // PYMNN_EXPR_API #ifdef BUILD_OPTYPE @@ -64,6 +66,12 @@ using RegularizationMethod = ParameterOptimizer::RegularizationMethod; #endif #endif +#ifdef PYMNN_INTERNAL_SERVING +#include +#include "internal/monitor_service.h" +#include "internal/verify_service.h" +#endif + struct MNN_TLSData { PyObject *PyMNNHalideTypeInt = NULL; PyObject *PyMNNHalideTypeInt64 = NULL; @@ -187,6 +195,10 @@ static PyObject* PyMNNInterpreter_new(struct _typeobject *type, PyObject *args, static int PyMNNInterpreter_init(PyMNNInterpreter *self, PyObject *args, PyObject *kwds); static void PyMNNInterpreter_dealloc(PyMNNInterpreter *); +#ifdef PYMNN_INTERNAL_SERVING +static PyObject* PyMNNInterpreter_createSessionWithToken(PyMNNInterpreter *self, PyObject *args); +#endif + static PyMethodDef PyMNNInterpreter_methods[] = { {"createRuntime", (PyCFunction)PyMNNInterpreter_createRuntime, METH_VARARGS | METH_STATIC, "create runtime"}, {"createSession", (PyCFunction)PyMNNInterpreter_createSession, METH_VARARGS, "create session"}, @@ -205,6 +217,9 @@ static PyMethodDef PyMNNInterpreter_methods[] = { {"cache", (PyCFunction)PyMNNInterpreter_cache, METH_VARARGS, "cache current net instance"}, {"removeCache", (PyCFunction)PyMNNInterpreter_removeCache, METH_VARARGS, "remove cache with given path"}, {"updateSessionToModel", (PyCFunction)PyMNNInterpreter_updateSessionToModel, METH_VARARGS, 
"updateSessionToModel"}, +#ifdef PYMNN_INTERNAL_SERVING + {"createSessionWithToken", (PyCFunction)PyMNNInterpreter_createSessionWithToken, METH_VARARGS, "create session with token"}, +#endif {NULL} /* Sentinel */ }; @@ -681,13 +696,7 @@ static PyObject* PyMNNInterpreter_createRuntime(PyObject* self, PyObject* args) return res; } -static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject *args) { - PyMNNInterpreter* instance = (PyMNNInterpreter *)self; - PyObject* dict = NULL, *rtinfo_py = NULL; - if (!PyArg_ParseTuple(args, "|OO", &dict, &rtinfo_py)) { - return NULL; - } - +static PyObject* createSession(PyMNNInterpreter *self, PyObject* dict, PyObject *rtinfo_py) { PyObject *f = importName("MNN", "Session"); if (!f || !PyCallable_Check(f)) { PyErr_SetString(PyExc_Exception, @@ -715,10 +724,10 @@ static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject } Session* s; if (rtinfo_py == NULL) { - s = instance->interpreter->createSession(config.second.first); + s = self->interpreter->createSession(config.second.first); } else { auto runtimeinfo = *(RuntimeInfo*)PyCapsule_GetPointer(rtinfo_py, NULL); - s = instance->interpreter->createSession(config.second.first, runtimeinfo); + s = self->interpreter->createSession(config.second.first, runtimeinfo); } if (!s) { PyErr_SetString(PyExc_Exception, @@ -727,11 +736,54 @@ static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject } session->session = s; - session->modelPath = instance->modelPath; + session->modelPath = self->modelPath; return (PyObject *)session; } +static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject *args) { +#ifdef PYMNN_INTERNAL_SERVING + PyErr_SetString(PyExc_Exception, + "PyMNNInterpreter_createSession: unsupported interface, should use createSessionWithToken."); + return NULL; +#endif + PyMNNInterpreter* instance = (PyMNNInterpreter *)self; + PyObject* dict = NULL, *rtinfo_py = NULL; + if (!PyArg_ParseTuple(args, "|OO", &dict, &rtinfo_py)) { + return NULL; + } + + return createSession(instance, dict, rtinfo_py); +} + +#ifdef PYMNN_INTERNAL_SERVING +static PyObject* PyMNNInterpreter_createSessionWithToken(PyMNNInterpreter *self, PyObject *args) { + PyMNNInterpreter* instance = (PyMNNInterpreter *)self; + PyObject* dict = NULL, *rtinfo_py = NULL; + char *token = NULL; + char *scene = NULL; + char *app_key = NULL; + if (!PyArg_ParseTuple(args, "sss|OO", &token, &scene, &app_key, &dict, &rtinfo_py)) { + return NULL; + } + + if (!token || !scene || !app_key) { + PyErr_SetString(PyExc_Exception, + "PyMNNInterpreter_createSessionWithToken: input invalid, token, scene or app_key is null."); + return NULL; + } + + bool ret = VerifyService::GetInstance().VerifyToken(std::string(token), std::string(scene), std::string(app_key)); + if (!ret) { + PyErr_SetString(PyExc_Exception, + "PyMNNNN_load_module_from_file_with_token: check token failed, return null session."); + return NULL; + } + + return createSession(instance, dict, rtinfo_py); +} +#endif + static PyObject* PyMNNInterpreter_resizeSession(PyMNNInterpreter *self, PyObject *args) { PyMNNSession* session = NULL; if (!PyArg_ParseTuple(args, "O", &session)) { @@ -826,12 +878,27 @@ static PyObject* PyMNNInterpreter_runSession(PyMNNInterpreter *self, PyObject *a } ErrorCode r = NO_ERROR; Py_BEGIN_ALLOW_THREADS + +#ifdef PYMNN_INTERNAL_SERVING + Timer timer; r = self->interpreter->runSession(session->session); + float cost_time = (float)timer.durationInUs() / (float)1000; + 
MNN::Interpreter::SessionInfoCode info_type = MNN::Interpreter::BACKENDS; + int backendType[MNN_FORWARD_ALL]; + self->interpreter->getSessionInfo(session->session, info_type, backendType); + std::string mBizCode = self->interpreter->bizCode() ? self->interpreter->bizCode() : ""; + std::string mUuid = self->interpreter->uuid() ? self->interpreter->uuid() : ""; + MonitorService::GetInstance().Track(cost_time, std::to_string(*backendType), "RUN_SESSION", + "PyMNNInterpreter_runSession", std::to_string(r), mBizCode, mUuid); +#else + r = self->interpreter->runSession(session->session); +#endif + Py_END_ALLOW_THREADS return PyLong_FromLong(r); } static PyMNNTensor* getTensor() { - PyMNNTensor *tensor = (PyMNNTensor *)PyObject_Call((PyObject*)&PyMNNTensorType, PyTuple_New(0), NULL); + PyMNNTensor *tensor = (PyMNNTensor *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNNTensorType), PyTuple_New(0), NULL); if (tensor) { tensor->tensor = nullptr; } @@ -1222,6 +1289,12 @@ static int PyMNNInterpreter_init(PyMNNInterpreter *self, PyObject *args, PyObjec return -1; } +#ifdef PYMNN_INTERNAL_SERVING + // initialize MonitorService + MonitorService::GetInstance().Start(); + VerifyService::GetInstance().Start(); +#endif + return 0; } @@ -1315,7 +1388,7 @@ static PyObject* PyMNNSession_removeCache(PyMNNSession *self, PyObject *args) { /// MNN Tensor implementation bool isTensor(PyObject* t) { - return PyObject_IsInstance(t, (PyObject*)&PyMNNTensorType); + return PyObject_IsInstance(t, (PyObject*)PyType_FindTLSType(&PyMNNTensorType)); } Tensor* toTensor(PyObject* t) { return ((PyMNNTensor*)t)->tensor; @@ -1337,17 +1410,32 @@ static void PyMNNTensor_dealloc(PyMNNTensor *self) { static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) { int argc = PyTuple_Size(args); - PyObject *shape, *dataType, *data = nullptr, *input_tensor = nullptr; - long dimensionType; + PyObject *shape, *dataType, *data = nullptr, *input_tensor = nullptr, *input_var = nullptr; + long dimensionType = -1; bool parse_res = false; switch (argc) { case 0: // just return, using in `PyMNNInterpreter_getSessionInputAll`; return 0; +#ifdef PYMNN_EXPR_API + case 1: + parse_res = PyArg_ParseTuple(args, "O", &input_var) + && isVar(input_var); + break; + case 2: + parse_res = PyArg_ParseTuple(args, "Ol", &input_tensor, &dimensionType) + && (isTensor(input_tensor) || isVar(input_tensor)); + if (isVar(input_tensor)) { + input_var = input_tensor; + input_tensor = nullptr; + } + break; +#else case 2: parse_res = PyArg_ParseTuple(args, "Ol", &input_tensor, &dimensionType) && isTensor(input_tensor); break; +#endif case 3: parse_res = PyArg_ParseTuple(args, "OOl", &shape, &dataType, &dimensionType) && isInts(shape); @@ -1361,11 +1449,35 @@ static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) { } if (!parse_res) { PyMNN_ERROR_LOG("Tensor init require args as belows:\n" - "\t1. (Tensor, DimensionType)\n" + "\t0. (Var)\n" + "\t1. (Tensor/Var, DimensionType)\n" "\t2. ([int], DataType, DimensionType)\n" "\t3. ([int], DataType, tuple/ndarray, DimensionType)\n"); + return -1; } - +#ifdef PYMNN_EXPR_API + // 0. 
create Tensor by Var + if (input_var) { + auto var = toVar(input_var); + auto info = var->getInfo(); + void* ptr = const_cast<void*>(var->readMap<void>()); + Tensor::DimensionType type = Tensor::TENSORFLOW; + if (dimensionType < 0) { + if (info->order == NCHW) type = Tensor::CAFFE; + else if (info->order == NC4HW4) type = Tensor::CAFFE_C4; + } else { + type = static_cast<Tensor::DimensionType>(dimensionType); + } + Tensor *tensor = Tensor::create(info->dim, info->type, ptr, type); + if (!tensor) { + PyMNN_ERROR_LOG("PyMNNTensor_create: Tensor create failed"); + return -1; + } + self->tensor = tensor; + self->owner = 2; + return 0; + } +#endif // 1. create Tensor by Tensor if (input_tensor) { Tensor *tensor = new Tensor(toTensor(input_tensor), (Tensor::DimensionType)dimensionType, true); @@ -1809,8 +1921,12 @@ static PyObject* PyMNNCVImageProcess_convert(PyMNNCVImageProcess *self, PyObject return NULL; } - if (PyLong_Check(source)) { - ErrorCode ret = self->imageProcess->convert(reinterpret_cast<const uint8_t*>(PyLong_AsLong(source)), + if (isInt(source)) { + auto ptr = PyLong_AsVoidPtr(source); + if (ptr == NULL) { + Py_RETURN_NONE; + } + ErrorCode ret = self->imageProcess->convert(reinterpret_cast<const uint8_t*>(ptr), iw, ih, stride, ((PyMNNTensor *)dest)->tensor); return PyLong_FromLong(ret); @@ -1949,46 +2065,70 @@ static PyObject* PyMNNCVImageProcess_setPadding(PyMNNCVImageProcess *self, PyObj /// MNN CVMatrix implementation bool isMatrix(PyObject* obj) { - return PyObject_IsInstance(obj, (PyObject*)&PyMNNCVMatrixType); + return PyObject_IsInstance(obj, (PyObject*)PyType_FindTLSType(&PyMNNCVMatrixType)); } CV::Matrix toMatrix(PyObject* obj) { return *(((PyMNNCVMatrix*)obj)->matrix); } PyObject* toPyObj(CV::Matrix m) { - PyMNNCVMatrix *ret = (PyMNNCVMatrix *)PyObject_Call((PyObject*)&PyMNNCVMatrixType, PyTuple_New(0), NULL); + PyMNNCVMatrix *ret = (PyMNNCVMatrix *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNNCVMatrixType), PyTuple_New(0), NULL); ret->matrix = new CV::Matrix(); *(ret->matrix) = m; return (PyObject*)ret; } -bool isSize(PyObject* obj) { - return (isInts(obj) && toInts(obj).size() == 2); -} -CV::Size toSize(PyObject* obj) { - auto vals = toInts(obj); - MNN_ASSERT(val.size() == 2); - return CV::Size(vals[0], vals[1]); -} + bool isPoint(PyObject* obj) { - return (isFloats(obj) && toFloats(obj).size() == 2); + return (isFloats(obj) && toFloats(obj).size() == 2) || + (isInts(obj) && toInts(obj).size() == 2); } CV::Point toPoint(PyObject* obj) { - auto vals = toFloats(obj); - MNN_ASSERT(val.size() == 2); CV::Point point; - point.set(vals[0], vals[1]); + if (isFloats(obj)) { + auto vals = toFloats(obj); + MNN_ASSERT(vals.size() == 2); + point.set(vals[0], vals[1]); + } else if (isInts(obj)) { + auto vals = toInts(obj); + MNN_ASSERT(vals.size() == 2); + point.set(vals[0], vals[1]); + } return point; } bool isPoints(PyObject* obj) { - return (isFloats(obj) && toFloats(obj).size() % 2 == 0); + return (isFloats(obj) && toFloats(obj).size() % 2 == 0) || + (isInts(obj) && toInts(obj).size() % 2 == 0) || isVar(obj); } std::vector<CV::Point> toPoints(PyObject* obj) { - auto vals = toFloats(obj); - MNN_ASSERT(val.size() % 2 == 0); - std::vector<CV::Point> points(vals.size() / 2); - for (int i = 0; i < points.size(); i++) { - points[i].set(vals[i*2], vals[i*2+1]); + if (isFloats(obj)) { + auto vals = toFloats(obj); + MNN_ASSERT(vals.size() % 2 == 0); + std::vector<CV::Point> points(vals.size() / 2); + for (int i = 0; i < points.size(); i++) { + points[i].set(vals[i*2], vals[i*2+1]); + } + return points; } - return points; + if (isInts(obj)) { + auto vals = toInts(obj);
MNN_ASSERT(vals.size() % 2 == 0); + std::vector points(vals.size() / 2); + for (int i = 0; i < points.size(); i++) { + points[i].set(vals[i*2], vals[i*2+1]); + } + return points; + } + if (isVar(obj)) { + auto vals = toVar(obj); + auto size = vals->getInfo()->size; + MNN_ASSERT(size % 2 == 0); + std::vector points(size / 2); + auto ptr = vals->readMap(); + for (int i = 0; i < points.size(); i++) { + points[i].set(ptr[i*2], ptr[i*2+1]); + } + return points; + } + return {}; } PyObject* toPyObj(std::vector _points) { std::vector points(_points.size() * 2); @@ -2494,7 +2634,7 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) { PyErr_SetString(PyExc_Exception, "initMNN.expr: PyType_Ready PyMNNVarType failed"); ERROR_RETURN } - PyModule_AddObject(expr_module, "Var", (PyObject *)&PyMNNVarType); + PyModule_AddObject(expr_module, "Var", (PyObject *)PyType_FindTLSType(&PyMNNVarType)); // def enum def_data_format(expr_module); def_dtype(expr_module); @@ -2547,6 +2687,7 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) { def_ThresholdTypes(cv_module); def_RetrievalModes(cv_module); def_ContourApproximationModes(cv_module); + def_LineTypes(cv_module); // add methods of cv constexpr int cv_method_num = sizeof(PyMNNCV_methods) / sizeof(PyMethodDef); for (int i = 0; i < cv_method_num; i++) { @@ -2571,6 +2712,10 @@ void loadMNN() { WeImport_AppendInittab(MOD_NAME, MOD_INIT_FUNC); }); } +void* memoryToVar(const void* ptr, int h, int w, int c, int type) { + auto var = Express::_Const(ptr, {h, w, c}, NHWC, dtype2htype(static_cast(type))); + return reinterpret_cast(toPyObj(var)); +} static auto registerMNN = []() { loadMNN(); return true; diff --git a/pymnn/src/MNNPyBridge.h b/pymnn/src/MNNPyBridge.h index 1027b2ea..9e702f61 100644 --- a/pymnn/src/MNNPyBridge.h +++ b/pymnn/src/MNNPyBridge.h @@ -17,4 +17,12 @@ #define PYMNN_PUBLIC #endif // WIN32 -extern "C" PYMNN_PUBLIC void loadMNN(); \ No newline at end of file +// memoryToVar's type define +#define TypeFloat 1 +#define TypeDouble 2 +#define TypeInt 3 +#define TypeUint8 4 +#define TypeInt8 6 +#define TypeInt64 9 +extern "C" PYMNN_PUBLIC void loadMNN(); +extern "C" PYMNN_PUBLIC void* memoryToVar(void* ptr, int h, int w, int c, int type); \ No newline at end of file diff --git a/pymnn/src/cv.h b/pymnn/src/cv.h index 14df2793..7e8fac4d 100644 --- a/pymnn/src/cv.h +++ b/pymnn/src/cv.h @@ -99,10 +99,22 @@ def_enum(ContourApproximationModes, CV::ContourApproximationModes, CV::CHAIN_APPROX_TC89_L1, "CHAIN_APPROX_TC89_L1", CV::CHAIN_APPROX_TC89_KCOS, "CHAIN_APPROX_TC89_KCOS" ) +def_enum(LineTypes, CV::LineTypes, + CV::FILLED, "FILLED", + CV::LINE_4, "LINE_4", + CV::LINE_8, "LINE_8", + CV::LINE_AA, "LINE_AA" + ) // helper functions INTS default_size = {0, 0}, default_param = {}; -bool isSize(PyObject* obj); -CV::Size toSize(PyObject* obj); +bool isSize(PyObject* obj) { + return (isInts(obj) && toInts(obj).size() == 2); +} +CV::Size toSize(PyObject* obj) { + auto vals = toInts(obj); + MNN_ASSERT(val.size() == 2); + return CV::Size(vals[0], vals[1]); +} bool isPoint(PyObject* obj); CV::Point toPoint(PyObject* obj); bool isPoints(PyObject* obj); @@ -378,24 +390,28 @@ static PyObject* PyMNNCV_invertAffineTransform(PyObject *self, PyObject *args) { } PyMNN_ERROR("invertAffineTransform require args: (Matrix)"); } +std::vector default_floats = {}; static PyObject* PyMNNCV_resize(PyObject *self, PyObject *args) { - PyObject *src, *dsize, *interpolation = toPyObj(CV::INTER_LINEAR); + PyObject *src, *dsize, *interpolation = toPyObj(CV::INTER_LINEAR), + *mean = toPyObj(default_floats), *norm = 
diff --git a/pymnn/src/cv.h b/pymnn/src/cv.h
index 14df2793..7e8fac4d 100644
--- a/pymnn/src/cv.h
+++ b/pymnn/src/cv.h
@@ -99,10 +99,22 @@ def_enum(ContourApproximationModes, CV::ContourApproximationModes,
          CV::CHAIN_APPROX_TC89_L1, "CHAIN_APPROX_TC89_L1",
          CV::CHAIN_APPROX_TC89_KCOS, "CHAIN_APPROX_TC89_KCOS"
          )
+def_enum(LineTypes, CV::LineTypes,
+         CV::FILLED, "FILLED",
+         CV::LINE_4, "LINE_4",
+         CV::LINE_8, "LINE_8",
+         CV::LINE_AA, "LINE_AA"
+         )
 // helper functions
 INTS default_size = {0, 0}, default_param = {};
-bool isSize(PyObject* obj);
-CV::Size toSize(PyObject* obj);
+bool isSize(PyObject* obj) {
+    return (isInts(obj) && toInts(obj).size() == 2);
+}
+CV::Size toSize(PyObject* obj) {
+    auto vals = toInts(obj);
+    MNN_ASSERT(vals.size() == 2);
+    return CV::Size(vals[0], vals[1]);
+}
 bool isPoint(PyObject* obj);
 CV::Point toPoint(PyObject* obj);
 bool isPoints(PyObject* obj);
@@ -378,24 +390,28 @@ static PyObject* PyMNNCV_invertAffineTransform(PyObject *self, PyObject *args) {
     }
     PyMNN_ERROR("invertAffineTransform require args: (Matrix)");
 }
+std::vector<float> default_floats = {};
 static PyObject* PyMNNCV_resize(PyObject *self, PyObject *args) {
-    PyObject *src, *dsize, *interpolation = toPyObj(CV::INTER_LINEAR);
+    PyObject *src, *dsize, *interpolation = toPyObj(CV::INTER_LINEAR),
+             *mean = toPyObj(default_floats), *norm = toPyObj(default_floats);
     float fx = 0, fy = 0;
-    if (PyArg_ParseTuple(args, "OO|ffO", &src, &dsize, &fx, &fy, &interpolation) &&
-        isVar(src) && isSize(dsize) && isInterpolationFlags(interpolation)) {
-        return toPyObj(CV::resize(toVar(src), toSize(dsize), fx, fy, toEnum<CV::InterpolationFlags>(interpolation)));
+    int code = -1;
+    if (PyArg_ParseTuple(args, "OO|ffOiOO", &src, &dsize, &fx, &fy, &interpolation, &code, &mean, &norm) &&
+        isVar(src) && isSize(dsize) && isInterpolationFlags(interpolation) && isFloats(mean) && isFloats(norm)) {
+        return toPyObj(CV::resize(toVar(src), toSize(dsize), fx, fy, toEnum<CV::InterpolationFlags>(interpolation),
+                                  code, toFloats(mean), toFloats(norm)));
     }
-    PyMNN_ERROR("resize require args: (Var, [int], |float, float, InterpolationFlags)");
+    PyMNN_ERROR("resize require args: (Var, [int], |float, float, InterpolationFlags, int, [float], [float])");
 }
 static PyObject* PyMNNCV_warpAffine(PyObject *self, PyObject *args) {
-    PyObject *src, *M, *dsize, *flag = toPyObj(CV::INTER_LINEAR), *borderMode = toPyObj(CV::BORDER_CONSTANT);
-    int borderValue = 0;
-    if (PyArg_ParseTuple(args, "OOO|OOi", &src, &M, &dsize, &flag, &borderMode, &borderValue) &&
-        isVar(src) && isMatrix(M) && isSize(dsize) && isInterpolationFlags(flag) && isBorderTypes(borderMode)) {
+    PyObject *src, *M, *dsize, *flag = toPyObj(CV::INTER_LINEAR), *borderMode = toPyObj(CV::BORDER_CONSTANT),
+             *mean = toPyObj(default_floats), *norm = toPyObj(default_floats);
+    int borderValue = 0, code = -1;
+    if (PyArg_ParseTuple(args, "OOO|OOiiOO", &src, &M, &dsize, &flag, &borderMode, &borderValue, &code, &mean, &norm) &&
+        isVar(src) && isMatrix(M) && isSize(dsize) && isInterpolationFlags(flag) && isBorderTypes(borderMode) &&
+        isFloats(mean) && isFloats(norm)) {
         return toPyObj(CV::warpAffine(toVar(src), toMatrix(M), toSize(dsize),
-                                      toEnum<CV::InterpolationFlags>(flag), toEnum<CV::BorderTypes>(borderMode), borderValue));
+                                      toEnum<CV::InterpolationFlags>(flag), toEnum<CV::BorderTypes>(borderMode),
+                                      borderValue, code, toFloats(mean), toFloats(norm)));
     }
-    PyMNN_ERROR("warpAffine require args: (Var, Matrix, [int], |InterpolationFlags, BorderTypes, int)");
+    PyMNN_ERROR("warpAffine require args: (Var, Matrix, [int], |InterpolationFlags, BorderTypes, int, int, [float], [float])");
 }
 static PyObject* PyMNNCV_warpPerspective(PyObject *self, PyObject *args) {
     PyObject *src, *M, *dsize, *flag = toPyObj(CV::INTER_LINEAR), *borderMode = toPyObj(CV::BORDER_CONSTANT);
@@ -433,7 +449,7 @@ static PyObject* PyMNNCV_findContours(PyObject *self, PyObject *args) {
         auto contours = CV::findContours(toVar(image), toEnum<CV::RetrievalModes>(mode), toEnum<CV::ContourApproximationModes>(method), toPoint(offset));
         PyObject* obj = PyTuple_New(2);
-        PyTuple_SetItem(obj, 0, toPyObj<std::vector<CV::Point>, toPyObj>(contours));
+        PyTuple_SetItem(obj, 0, toPyObj(contours));
         PyTuple_SetItem(obj, 1, toPyObj("no hierarchy"));
         return obj;
     }
@@ -442,24 +458,29 @@ static PyObject* PyMNNCV_findContours(PyObject *self, PyObject *args) {
 static PyObject* PyMNNCV_contourArea(PyObject *self, PyObject *args) {
     PyObject *points;
     int oriented = 0;
-    if (PyArg_ParseTuple(args, "O|i", &points, &oriented) && isPoints(points)) {
-        float area = CV::contourArea(toPoints(points), oriented);
-        return toPyObj(area);
+    if (PyArg_ParseTuple(args, "O|i", &points, &oriented) && isVar(points)) {
+        float res = CV::contourArea(toVar(points), oriented);
+        return toPyObj(res);
     }
-    PyMNN_ERROR("contourArea require args: ([float], |bool)");
+    PyMNN_ERROR("contourArea require args: (Var, |bool)");
 }
 static PyObject* PyMNNCV_convexHull(PyObject *self, PyObject *args) {
     PyObject *points;
     int clockwise = 0, returnPoints = 1;
-    if (PyArg_ParseTuple(args, "O|ii", &points, &clockwise, 
&returnPoints) && isPoints(points)) { - return toPyObj(CV::convexHull(toPoints(points), clockwise, returnPoints)); + if (PyArg_ParseTuple(args, "O|ii", &points, &clockwise, &returnPoints) && isVar(points)) { + auto res = CV::convexHull(toVar(points), clockwise, returnPoints); + if (returnPoints) { + int npoints = res.size() / 2; + return toPyObj(Express::_Const(res.data(), { npoints, 1, 2 }, NHWC, halide_type_of())); + } + return toPyObj(res); } - PyMNN_ERROR("convexHull require args: ([float], |bool, bool)"); + PyMNN_ERROR("convexHull require args: (Var, |bool, bool)"); } static PyObject* PyMNNCV_minAreaRect(PyObject *self, PyObject *args) { PyObject *points; - if (PyArg_ParseTuple(args, "O", &points) && isPoints(points)) { - auto rect = CV::minAreaRect(toPoints(points)); + if (PyArg_ParseTuple(args, "O", &points) && isVar(points)) { + auto rect = CV::minAreaRect(toVar(points)); PyObject* center = PyTuple_New(2); PyTuple_SetItem(center, 0, toPyObj(rect.center.x)); PyTuple_SetItem(center, 1, toPyObj(rect.center.y)); @@ -472,16 +493,16 @@ static PyObject* PyMNNCV_minAreaRect(PyObject *self, PyObject *args) { PyTuple_SetItem(obj, 2, toPyObj(rect.angle)); return obj; } - PyMNN_ERROR("minAreaRect require args: ([float])"); + PyMNN_ERROR("minAreaRect require args: (Var)"); } static PyObject* PyMNNCV_boundingRect(PyObject *self, PyObject *args) { PyObject *points; - if (PyArg_ParseTuple(args, "O", &points) && isPoints(points)) { - auto rect = CV::boundingRect(toPoints(points)); + if (PyArg_ParseTuple(args, "O", &points) && isVar(points)) { + auto rect = CV::boundingRect(toVar(points)); std::vector res { rect.x, rect.y, rect.width, rect.height }; return toPyObj(res); } - PyMNN_ERROR("boundingRect require args: ([float])"); + PyMNN_ERROR("boundingRect require args: (Var)"); } static PyObject* PyMNNCV_connectedComponentsWithStats(PyObject *self, PyObject *args) { PyObject *image; @@ -518,17 +539,106 @@ static PyObject* PyMNNCV_boxPoints(PyObject *self, PyObject *args) { error_: PyMNN_ERROR("boxPoints require args: [(float, (float, float), (float, float))])"); } +// draw +static bool isColor(PyObject* obj) { + return (isInts(obj) && (toInts(obj).size() == 3 || toInts(obj).size() == 4)); +} +CV::Scalar toColor(PyObject* obj) { + auto vals = toInts(obj); + if (vals.size() == 3) { + return CV::Scalar(vals[0], vals[1], vals[2]); + } + if (vals.size() == 4) { + return CV::Scalar(vals[0], vals[1], vals[2], vals[3]); + } + return CV::Scalar(255, 255, 255); +} +static PyObject* PyMNNCV_line(PyObject *self, PyObject *args) { + PyObject *img, *pt1, *pt2, *color, *linetype = toPyObj(CV::LINE_8); + int thickness = 1, shift = 0; + if (PyArg_ParseTuple(args, "OOOO|iOi", &img, &pt1, &pt2, &color, &thickness, &linetype, &shift) + && isVar(img) && isPoint(pt1) && isPoint(pt2) && isColor(color) && isLineTypes(linetype)) { + auto image = toVar(img); + CV::line(image, toPoint(pt1), toPoint(pt2), toColor(color), + thickness, toEnum(linetype), shift); + Py_RETURN_NONE; + } + PyMNN_ERROR("line require args: (Var, Point, Point, Color, |int, LineType, int)"); +} +static PyObject* PyMNNCV_arrowedLine(PyObject *self, PyObject *args) { + PyObject *img, *pt1, *pt2, *color, *linetype = toPyObj(CV::LINE_8); + int thickness = 1, shift = 0; + float tipLength = 0.1; + if (PyArg_ParseTuple(args, "OOOO|iOif", &img, &pt1, &pt2, &color, &thickness, &linetype, &shift, &tipLength) + && isVar(img) && isPoint(pt1) && isPoint(pt2) && isColor(color) && isLineTypes(linetype)) { + auto image = toVar(img); + CV::arrowedLine(image, toPoint(pt1), 
toPoint(pt2), toColor(color), + thickness, toEnum(linetype), shift, tipLength); + Py_RETURN_NONE; + } + PyMNN_ERROR("arrowedLine require args: (Var, Point, Point, Color, |int, LineType, int, float)"); +} +static PyObject* PyMNNCV_circle(PyObject *self, PyObject *args) { + PyObject *img, *center, *color, *linetype = toPyObj(CV::LINE_8); + int radius, thickness = 1, shift = 0; + if (PyArg_ParseTuple(args, "OOiO|iOi", &img, ¢er, &radius, &color, &thickness, &linetype, &shift) + && isVar(img) && isPoint(center) && isColor(color) && isLineTypes(linetype)) { + auto image = toVar(img); + CV::circle(image, toPoint(center), radius, toColor(color), + thickness, toEnum(linetype), shift); + Py_RETURN_NONE; + } + PyMNN_ERROR("circle require args: (Var, Point, int, Color, |int, LineType, int)"); +} +static PyObject* PyMNNCV_rectangle(PyObject *self, PyObject *args) { + PyObject *img, *pt1, *pt2, *color, *linetype = toPyObj(CV::LINE_8); + int thickness = 1, shift = 0; + if (PyArg_ParseTuple(args, "OOOO|iOi", &img, &pt1, &pt2, &color, &thickness, &linetype, &shift) + && isVar(img) && isPoint(pt1) && isPoint(pt2) && isColor(color) && isLineTypes(linetype)) { + auto image = toVar(img); + CV::rectangle(image, toPoint(pt1), toPoint(pt2), toColor(color), + thickness, toEnum(linetype), shift); + Py_RETURN_NONE; + } + PyMNN_ERROR("rectangle require args: (Var, Point, Point, Color, |int, LineType, int)"); +} +static PyObject* PyMNNCV_drawContours(PyObject *self, PyObject *args) { + PyObject *img, *contours, *color, *linetype = toPyObj(CV::LINE_8); + int contourIdx, thickness = 1; + if (PyArg_ParseTuple(args, "OOiO|iO", &img, &contours, &contourIdx, &color, &thickness, &linetype) + && isVar(img) && isVec(contours) && isColor(color) && isLineTypes(linetype)) { + auto image = toVar(img); + CV::drawContours(image, toVec, toPoints>(contours), contourIdx, toColor(color), + thickness, toEnum(linetype)); + Py_RETURN_NONE; + } + PyMNN_ERROR("drawContours require args: (Var, [Points], int, Color, |int, LineType)"); +} +static PyObject* PyMNNCV_fillPoly(PyObject *self, PyObject *args) { + PyObject *img, *contours, *color, *linetype = toPyObj(CV::LINE_8), *offset = toPyObj(std::vector{0, 0}); + int shift = 0; + if (PyArg_ParseTuple(args, "OOO|OiO", &img, &contours, &color, &linetype, &shift, &offset) + && isVar(img) && isVec(contours) && isColor(color) && isLineTypes(linetype) && isPoint(offset)) { + auto image = toVar(img); + CV::fillPoly(image, toVec, toPoints>(contours), toColor(color), + toEnum(linetype), shift, toPoint(offset)); + Py_RETURN_NONE; + } + PyMNN_ERROR("fillPoly require args: (Var, [Points], Color, |LineType, int, Point)"); +} static PyMethodDef PyMNNCV_methods[] = { - register_methods(CV, #ifdef PYMNN_IMGCODECS + register_methods(CV, // imgcodecs haveImageReader, "haveImageReader", haveImageWriter, "haveImageWriter", imdecode, "imdecode", imencode, "imencode", imread, "imread", - imwrite, "imwrite", + imwrite, "imwrite" + ) #endif + register_methods(CV, // color cvtColor, "cvtColor.", cvtColorTwoPlane, "cvtColorTwoPlane.", @@ -569,6 +679,13 @@ static PyMethodDef PyMNNCV_methods[] = { minAreaRect, "minAreaRect", boundingRect, "boundingRect", connectedComponentsWithStats, "connectedComponentsWithStats", - boxPoints, "boxPoints" + boxPoints, "boxPoints", + // draw + line, "line", + arrowedLine, "arrowedLine", + circle, "circle", + rectangle, "rectangle", + drawContours, "drawContours", + fillPoly, "fillPoly" ) }; diff --git a/pymnn/src/expr.h b/pymnn/src/expr.h index 638b12da..fe11e3c9 100644 --- 
a/pymnn/src/expr.h +++ b/pymnn/src/expr.h @@ -63,6 +63,7 @@ def_enum(PrecisionMode, PrecisionMode, typedef struct { PyObject_HEAD VARP* var; + int iter_index; } PyMNNVar; static PyObject* PyMNNVar_new(PyTypeObject *type, PyObject *args, PyObject *kwds); static void PyMNNVar_dealloc(PyMNNVar *self); @@ -137,6 +138,9 @@ static PyObject* PyMNNVar_negative(PyObject*); static PyObject* PyMNNVar_absolute(PyObject*); static Py_ssize_t PyMNNVar_length(PyObject*); static PyObject* PyMNNVar_subscript(PyObject*, PyObject*); +static int PyMNNVar_ass_subscript(PyObject*, PyObject*, PyObject*); +static PyObject* PyMNNVar_iter(PyObject*); +static PyObject* PyMNNVar_iternext(PyObject*); #if PY_MAJOR_VERSION >= 3 static PyNumberMethods PyMNNVar_as_number = { PyMNNVar_add, /*nb_add*/ @@ -220,9 +224,9 @@ static PyNumberMethods PyMNNVar_as_number = { }; #endif static PyMappingMethods PyMNNVar_as_mapping = { - PyMNNVar_length, /*mp_length*/ - PyMNNVar_subscript, /*mp_subscript*/ - 0, /*mp_ass_subscript*/ + PyMNNVar_length, /*mp_length*/ + PyMNNVar_subscript, /*mp_subscript*/ + PyMNNVar_ass_subscript, /*mp_ass_subscript*/ }; PyObject *PyMNNVar_richcompare(PyObject *self, PyObject *other, int op); static PyTypeObject PyMNNVarType = { @@ -256,8 +260,8 @@ static PyTypeObject PyMNNVarType = { 0, /*tp_clear*/ &PyMNNVar_richcompare, /*tp_richcompare*/ 0, /*tp_weaklistoffset*/ - 0, /*tp_iter*/ - 0, /*tp_iternext*/ + &PyMNNVar_iter, /*tp_iter*/ + &PyMNNVar_iternext, /*tp_iternext*/ PyMNNVar_methods, /*tp_methods*/ 0, /*tp_members*/ PyMNNVar_getsetters, /*tp_getset*/ @@ -272,7 +276,7 @@ static PyTypeObject PyMNNVarType = { }; // helper functions static PyMNNVar* getVar() { - PyMNNVar *var = (PyMNNVar *)PyObject_Call((PyObject*)&PyMNNVarType, PyTuple_New(0), NULL); + PyMNNVar *var = (PyMNNVar *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNNVarType), PyTuple_New(0), NULL); var->var = new VARP; return var; } @@ -284,7 +288,7 @@ static PyObject* toPyObj(VARP var) { static bool isVar(PyObject* var) { return isInt(var) || isInts(var) || isFloat(var) || isFloats(var) || - PyObject_IsInstance(var, (PyObject*)&PyMNNVarType); + Py_TYPE(var) == PyType_FindTLSType(&PyMNNVarType); } static bool isVars(PyObject* var) { return isVec(var); @@ -353,21 +357,30 @@ std::pair toVarPair(PyObject* l, PyObject* r, bool fp = false) { PyObject *PyMNNVar_richcompare(PyObject *l, PyObject *r, int op) { auto lr = toVarPair(l, r); auto vl = lr.first, vr = lr.second; + VARP res; switch (op) { case Py_LT: - return toPyObj(Express::_Less(vl, vr)); + res = Express::_Less(vl, vr); + break; case Py_LE: - return toPyObj(Express::_LessEqual(vl, vr)); + res = Express::_LessEqual(vl, vr); + break; case Py_EQ: - return toPyObj(Express::_Equal(vl, vr)); + res = Express::_Equal(vl, vr); + break; case Py_NE: - return toPyObj(Express::_NotEqual(vl, vr)); + res = Express::_NotEqual(vl, vr); + break; case Py_GT: - return toPyObj(Express::_Greater(vl, vr)); + res = Express::_Greater(vl, vr); + break; case Py_GE: - return toPyObj(Express::_GreaterEqual(vl, vr)); + res = Express::_GreaterEqual(vl, vr); + break; + default: + Py_RETURN_NONE; } - Py_RETURN_NONE; + return toPyObj(res); } static PyObject* PyMNNVar_add(PyObject* l, PyObject* r) { auto lr = toVarPair(l, r); @@ -413,11 +426,10 @@ static Py_ssize_t PyMNNVar_length(PyObject* x) { } return size; } -static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) { - std::vector begin, end, strides; - int new_axis_mask = 0, shrink_axis_mask = 0, - begin_mask = 0, end_mask = 0, - ellipsis_mask = 0, index = 0; 
+
+static void dealSlice(PyObject* slice, std::vector<int>& begin, std::vector<int>& end, std::vector<int>& strides,
+                      int& new_axis_mask, int& shrink_axis_mask, int& begin_mask, int& end_mask, int& ellipsis_mask) {
+    int index = 0;
     auto dealItem = [&](PyObject* item) {
         if (PySlice_Check(item)) {
             Py_ssize_t startl = 0, stopl = 0, stepl = 1;
@@ -437,7 +449,7 @@ static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) {
             if ((step == 1 && start == 0) || (step == -1 && start == -1)) {
                 begin_mask |= (1 << index);
             }
-            if ((step == 1 && stop == -1) || (step == -1 && stop == 0)) {
+            if ((step == 1 && stop == -1) || (step == -1 && stop == 0) || PY_SSIZE_T_MAX == stopl) {
                 end_mask |= (1 << index);
             }
         }
@@ -471,16 +483,136 @@ static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) {
     } else {
         dealItem(slice);
     }
+}
+static inline bool isIdx(PyObject* slice) {
+    return Py_TYPE(slice) == PyType_FindTLSType(&PyMNNVarType) || (PyList_Check(slice) && isInts(slice));
+}
+static bool isBoolIdx(VARP idx, int reqSize) {
+    auto size = idx->getInfo()->size;
+    bool isbool = (size == reqSize);
+    if (isbool) {
+        auto ptr = idx->readMap<int>();
+        for (int i = 0; i < size; i++) {
+            if (ptr[i] != 0 && ptr[i] != 1) {
+                return false;
+            }
+        }
+    }
+    return isbool;
+}
+static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) {
+    // gather: 1. 0-1 gather; 2. idx gather;
+    if (isIdx(slice)) {
+        auto val = toVar(x);
+        auto idx = toVar(slice);
+        if (val->getInfo()->size > 1 && isBoolIdx(idx, val->getInfo()->size)) {
+            // 0-1 gather -> idx gather
+            idx = Express::_Where(idx);
+            val = Express::_GatherND(val, idx);
+            val = Express::_Reshape(val, {-1});
+            return toPyObj(val);
+        }
+        auto r = Express::_Gather(val, idx);
+        r->readMap<void>();
+        return toPyObj(r);
+    }
+
+    std::vector<int> begin, end, strides;
+    int new_axis_mask = 0, shrink_axis_mask = 0, begin_mask = 0, end_mask = 0, ellipsis_mask = 0;
+    dealSlice(slice, begin, end, strides, new_axis_mask, shrink_axis_mask, begin_mask, end_mask, ellipsis_mask);
     int size_ = static_cast<int>(begin.size());
     auto begin_ = Express::_Const(begin.data(), {size_}, NHWC, halide_type_of<int>());
     auto end_ = Express::_Const(end.data(), {size_}, NHWC, halide_type_of<int>());
     auto strides_ = Express::_Const(strides.data(), {size_}, NHWC, halide_type_of<int>());
-    return toPyObj(Express::_StridedSlice(toVar(x), begin_, end_, strides_, begin_mask, end_mask,
-                                          ellipsis_mask, new_axis_mask, shrink_axis_mask));
+    auto res = Express::_StridedSlice(toVar(x), begin_, end_, strides_, begin_mask, end_mask,
+                                      ellipsis_mask, new_axis_mask, shrink_axis_mask);
+    auto info = res->getInfo();
+    if (!info) {
+        PyMNN_ERROR("subscript: unable to get variable info");
+    }
+    // to scalar
+    if (info->dim.empty()) {
+        auto dtype = info->type;
+        if (dtype == halide_type_of<int>()) {
+            return toPyObj(res->readMap<int>()[0]);
+        }
+        if (dtype == halide_type_of<float>()) {
+            return toPyObj(res->readMap<float>()[0]);
+        }
+        if (dtype == halide_type_of<uint8_t>()) {
+            return toPyObj(res->readMap<uint8_t>()[0]);
+        }
+        if (dtype == halide_type_of<double>()) {
+            return toPyObj((float)res->readMap<double>()[0]);
+        }
+    }
+    return toPyObj(res);
+}
+
+static int PyMNNVar_ass_subscript(PyObject* x, PyObject* slice, PyObject* y) {
+    if (!isVar(x) || !isVar(y)) {
+        PyMNN_ERROR_LOG("ass_subscript require args: (Var, int/Var, int/float/Var)");
+        return -1;
+    }
+    auto var = toVar(x);
+    auto val = toVar(y);
+    auto varInfo = var->getInfo();
+    if (isIdx(slice)) {
+        auto idx = toVar(slice);
+        if (isBoolIdx(idx, varInfo->size)) {
+            idx = Express::_Where(idx);
+        }
+        auto idxDim = idx->getInfo()->dim;
+        int scatterNum = idxDim[0], scatterDim = 1;
+        if (idxDim.size() < 2) {
+            idx = Express::_Unsqueeze(idx, {-1});
+        } else {
+            scatterDim = idxDim[1];
+        }
+        // val broadcast_to [scatterNum, (scatterDim < varDim.size() ? varDim[scatterDim:] : 1)]
+        auto varDim = varInfo->dim;
+        std::vector<int> valDim(1, scatterNum);
+        if (scatterDim >= varDim.size()) {
+            valDim.push_back(1);
+        } else {
+            for (int i = scatterDim; i < varDim.size(); i++) {
+                valDim.push_back(varDim[i]);
+            }
+        }
+        val = Express::_BroadcastTo(val, _Const(valDim.data(), {static_cast<int>(valDim.size())}, NCHW, halide_type_of<int>()));
+        *(((PyMNNVar*)x)->var) = Express::_ScatterNd(idx, val, Express::_Shape(var), var);
+        return 0;
+    }
+    std::vector<int> begin, end, strides;
+    int new_axis_mask = 0, shrink_axis_mask = 0, begin_mask = 0, end_mask = 0, ellipsis_mask = 0;
+    dealSlice(slice, begin, end, strides, new_axis_mask, shrink_axis_mask, begin_mask, end_mask, ellipsis_mask);
+    int size_ = static_cast<int>(begin.size());
+    auto begin_ = Express::_Const(begin.data(), {size_}, NHWC, halide_type_of<int>());
+    auto end_ = Express::_Const(end.data(), {size_}, NHWC, halide_type_of<int>());
+    auto strides_ = Express::_Const(strides.data(), {size_}, NHWC, halide_type_of<int>());
+    *(((PyMNNVar*)x)->var) = Express::_StridedSliceWrite(var, begin_, end_, strides_, val, begin_mask, end_mask,
+                                                         ellipsis_mask, new_axis_mask, shrink_axis_mask);
+    return 0;
+}
+static PyObject* PyMNNVar_iter(PyObject *self) {
+    auto var = toVar(self);
+    if (var->getInfo()->dim.empty()) {
+        PyMNN_ERROR("iteration over a 0-d array");
+    }
+    Py_INCREF(self);
+    return self;
+}
+static PyObject* PyMNNVar_iternext(PyObject *self) {
+    auto idx = ((PyMNNVar*)self)->iter_index++;
+    auto var = toVar(self);
+    auto count = var->getInfo()->dim[0];
+    if (idx >= count) return NULL;
+    return toPyObj(Express::_Gather(var, Express::_Scalar<int>(idx)));
+}
 // PyMNNVar basic functions impl
 static PyObject* PyMNNVar_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
     PyMNNVar* self = (PyMNNVar *)type->tp_alloc(type, 0);
+    self->iter_index = 0;
     self->var = nullptr;
     return (PyObject*)self;
 }
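Taken together, PyMNNVar_subscript, PyMNNVar_ass_subscript, and the iterator hooks above give Var numpy-style indexing from Python. A short sketch of the new behavior, mirroring the unit tests later in this patch:

    import MNN.numpy as mp

    x = mp.array([[1, 0, 3], [0, 6, 5]])
    v = x[0, 1]            # scalar indexing now returns a plain Python number
    m = x[x > 2]           # boolean-mask gather (the 0-1 gather path)
    g = x[mp.array([1])]   # integer index gather
    x[x > 2] = 0           # masked write lowers to Where + ScatterNd
    for row in x:          # iteration gathers along the first axis
        print(row)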
@@ -505,7 +637,7 @@ static PyObject* PyMNNVar_getshape(PyMNNVar *self, void *closure) {
     if (self->var) {
         auto info = (*(self->var))->getInfo();
         if(nullptr == info) {
-            PyMNN_ERROR("unable to get variable info");
+            PyMNN_ERROR("getshape: unable to get variable info");
         }
         shape = toPyObj(info->dim);
     }
@@ -524,7 +656,7 @@ static PyObject* PyMNNVar_getdata_format(PyMNNVar *self, void *closure) {
     if (self->var) {
         auto info = (*(self->var))->getInfo();
         if(nullptr == info) {
-            PyMNN_ERROR("unable to get variable info");
+            PyMNN_ERROR("getdata_format: unable to get variable info");
         }
         return toPyObj(info->order);
     }
@@ -534,7 +666,7 @@ static PyObject* PyMNNVar_getdtype(PyMNNVar *self, void *closure) {
     if (self->var) {
         auto info = (*(self->var))->getInfo();
         if(nullptr == info) {
-            PyMNN_ERROR("unable to get variable info");
+            PyMNN_ERROR("getdtype: unable to get variable info");
         }
         return toPyObj(htype2dtype(info->type));
     }
@@ -544,7 +676,7 @@ static PyObject* PyMNNVar_getsize(PyMNNVar *self, void *closure) {
     if (self->var) {
         auto info = (*(self->var))->getInfo();
         if(nullptr == info) {
-            PyMNN_ERROR("unable to get variable info");
+            PyMNN_ERROR("getsize: unable to get variable info");
         }
         return toPyObj(info->size);
     }
@@ -564,7 +696,7 @@ static PyObject* PyMNNVar_getndim(PyMNNVar *self, void *closure) {
     PyObject *ndim = NULL;
     if (self->var) {
         auto info = (*(self->var))->getInfo();
         if(nullptr == info) {
-            PyMNN_ERROR("unable to get variable info");
+            PyMNN_ERROR("getndim: unable to get variable info");
         }
         ndim = toPyObj((int)info->dim.size());
     }
@@ -685,13 +817,16 @@ static PyObject* PyMNNVar_resize(PyMNNVar *self, PyObject *args) {
 static PyObject* PyMNNVar_read(PyMNNVar *self, PyObject *args) {
     auto info = (*(self->var))->getInfo();
     if(nullptr == info) {
-        PyMNN_ERROR("unable to get variable info");
+        PyMNN_ERROR("read: unable to get variable info");
     }
     auto dtype = htype2dtype(info->type);
     auto shape = info->dim;
     int64_t total_length = info->size;
     auto readptr = [self](DType dtype, INTS shape, int64_t total_length) {
         void *dataPtr = (void *) (*(self->var))->readMap<void>();
+        if (nullptr == dataPtr) {
+            PyMNN_ERROR("call to readMap failed");
+        }
         std::vector<npy_intp> npy_dims;
         for(const auto dim : shape) {
             npy_dims.push_back(dim);
@@ -710,9 +845,6 @@ static PyObject* PyMNNVar_read(PyMNNVar *self, PyObject *args) {
             default:
                 PyMNN_ERROR("does not support this dtype");
         }
-        if (nullptr == dataPtr) {
-            PyMNN_ERROR("call to readMap meet a error");
-        }
     };
     auto data = readptr(dtype, shape, total_length);
     (*(self->var))->unMap();
@@ -722,13 +854,16 @@ static PyObject* PyMNNVar_read_as_tuple(PyMNNVar *self, PyObject *args) {
     auto info = (*(self->var))->getInfo();
     if(nullptr == info) {
-        PyMNN_ERROR("unable to get variable info");
+        PyMNN_ERROR("read_as_tuple: unable to get variable info");
     }
     auto dtype = htype2dtype(info->type);
     auto shape = info->dim;
     size_t total_length = info->size;
     auto readptr = [self](DType dtype, INTS shape, size_t total_length) {
         void *dataPtr = (void *) (*(self->var))->readMap<void>();
+        if (nullptr == dataPtr) {
+            PyMNN_ERROR("call to readMap failed");
+        }
         auto obj = PyTuple_New(total_length);
         if(DType_FLOAT == dtype) {
             auto data = (float*)dataPtr;
@@ -766,7 +901,7 @@ static PyObject* PyMNNVar_write(PyMNNVar *self, PyObject *args) {
     }
     auto info = (*(self->var))->getInfo();
     if(nullptr == info) {
-        PyMNN_ERROR("unable to get variable info");
+        PyMNN_ERROR("write: unable to get variable info");
     }
     auto dtype = htype2dtype(info->type);
     int64_t total_length = info->size;
@@ -1042,11 +1177,15 @@ static PyObject* PyMNNExpr_const(PyObject *self, PyObject *args, PyObject *kwarg
             total_length *= shape[i];
         }
     }
-    auto data = toPtr(value, dtype, total_length);
     auto ret = getVar();
-    if(data) {
-        *(ret->var) = _Const((const void*)data, shape, data_format, dtype2htype(dtype));
-        free(data);
+    if (total_length > 0) {
+        auto data = toPtr(value, dtype, total_length);
+        if(data) {
+            *(ret->var) = _Const((const void*)data, shape, data_format, dtype2htype(dtype));
+            free(data);
+        }
+    } else {
+        *(ret->var) = _Const(nullptr, shape, data_format, dtype2htype(dtype));
     }
     return (PyObject *)ret;
 }
@@ -1332,6 +1471,32 @@ static PyObject* PyMNNExpr_randomuniform(PyObject *self, PyObject *args) {
     }
     PyMNN_ERROR("randomuniform require args: (Var, dtype, |float, float, int, int)");
 }
+static PyObject* PyMNNExpr_sort(PyObject *self, PyObject *args) {
+    PyObject *x;
+    int axis = -1, arg = 0, descend = 0;
+    if (PyArg_ParseTuple(args, "O|iii", &x, &axis, &arg, &descend) && isVar(x)) {
+        return toPyObj(Express::_Sort(toVar(x), axis, arg, descend));
+    }
+    PyMNN_ERROR("sort require args: (Var, |int, bool, bool)");
+}
+static PyObject* PyMNNExpr_raster(PyObject *self, PyObject *args) {
+    PyObject *var, *region, *shape;
+    if (PyArg_ParseTuple(args, "OOO", &var, &region, &shape) &&
+        isVars(var) && isInts(region) && isInts(shape)) {
+        return toPyObj(Express::_Raster(toVars(var), toInts(region), toInts(shape)));
+    }
+    PyMNN_ERROR("raster require args: ([Var], [int], [int])");
+}
+static PyObject* PyMNNExpr_nms(PyObject *self, PyObject *args) {
+    PyObject *boxes, *scores;
+    int max_detections;
+    float iou_threshold = -1.0, score_threshold = -1.0;
+    if (PyArg_ParseTuple(args, "OOi|ff", &boxes, &scores, &max_detections, &iou_threshold, &score_threshold) &&
+        isVar(boxes) && isVar(scores)) {
+        return toPyObj(Express::_Nms(toVar(boxes), toVar(scores), max_detections, iou_threshold, score_threshold));
+    }
+    PyMNN_ERROR("nms require args: (Var, Var, int, |float, float)");
+}
 static PyObject* PyMNNExpr_detection_post_process(PyObject *self, PyObject *args) {
     PyObject *encode_boxes, *class_predictions, *anchors, *centersize_encoding;
     int num_classes, max_detections, max_class_per_detection, detections_per_class;
@@ -1508,6 +1673,9 @@ static PyMethodDef PyMNNExpr_methods[] = {
     zeros_like, "build zeros_like expr",
     unstack, "build unstack expr",
     range, "build range expr",
+    sort, "build sort expr",
+    raster, "build raster expr",
+    nms, "build nms expr",
     detection_post_process, "build detection_post_process expr"
     )
 };
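Before the nn.h diff, a quick sketch of the expression-level ops the hunks above register. sort defaults to an ascending sort (axis=-1, arg=False, descend=False), and nms takes boxes plus scores and, per the Express::_Nms call, yields the indices kept after suppression (box and score values here are made up for illustration):

    import MNN.expr as F
    import MNN.numpy as mp

    print(F.sort(mp.array([5, -1, 2, 0])))      # -> [-1, 0, 2, 5]

    boxes  = mp.array([[0., 0., 10., 10.],
                       [1., 1., 11., 11.],
                       [20., 20., 30., 30.]])
    scores = mp.array([0.9, 0.8, 0.7])
    keep = F.nms(boxes, scores, 2, 0.5)         # max_detections=2, iou_threshold=0.5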
[int], [int])"); +} +static PyObject* PyMNNExpr_nms(PyObject *self, PyObject *args) { + PyObject *boxes, *scores; + int max_detections; + float iou_threshold = -1.0, score_threshold = -1.0; + if (PyArg_ParseTuple(args, "OOi|ff", &boxes, &scores, &max_detections, &iou_threshold, &score_threshold) && + isVar(boxes) && isVar(scores)) { + return toPyObj(Express::_Nms(toVar(boxes), toVar(scores), max_detections, iou_threshold, score_threshold)); + } + PyMNN_ERROR("nms require args: (Var, Var, |float, float)"); +} static PyObject* PyMNNExpr_detection_post_process(PyObject *self, PyObject *args) { PyObject *encode_boxes, *class_predictions, *anchors, *centersize_encoding; int num_classes, max_detections, max_class_per_detection, detections_per_class; @@ -1508,6 +1673,9 @@ static PyMethodDef PyMNNExpr_methods[] = { zeros_like, "build zeros_like expr", unstack, "build unstack expr", range, "build range expr", + sort, "build sort expr", + raster, "build raster expr", + nms, "build nms expr", detection_post_process, "build detection_post_process expr" ) }; diff --git a/pymnn/src/nn.h b/pymnn/src/nn.h index 8efb45e4..1248c8d3 100644 --- a/pymnn/src/nn.h +++ b/pymnn/src/nn.h @@ -1,4 +1,10 @@ #include "util.h" +#ifdef PYMNN_INTERNAL_SERVING +#include +#include +#include "internal/monitor_service.h" +#include "internal/verify_service.h" +#endif // NN Module Start def_class_start(_Module, Module) @@ -19,6 +25,37 @@ def_class_methods(_Module, _add_parameter, "add parameter" ) def_class_end(_Module, Module) + +static PyObject* load_module(PyObject *inputs, PyObject *outputs, PyObject *backend, PyObject *memory_mode, + PyObject *power_mode, PyObject *precision_mode, const char* file_name, int dynamic, + int shape_mutable, int rearrange, int thread_num) { + + BackendConfig backend_config; + backend_config.memory = toEnum(memory_mode); + backend_config.power = toEnum(power_mode); + backend_config.precision = toEnum(precision_mode); + + Module::BackendInfo backend_info; + backend_info.type = toEnum(backend); + backend_info.config = &backend_config; + + Module::Config config; + config.dynamic = dynamic; + config.shapeMutable = shape_mutable; + config.rearrange = rearrange; + config.backend = &backend_info; + + auto converted_file_name = convertBytesEncodeIfNeed(file_name); + auto m_ptr = Module::load(toStrings(inputs), toStrings(outputs), converted_file_name.data(), &config); + if (m_ptr == nullptr) { + std::string mnn_errno = "load_module_from_file failed "; + mnn_errno = mnn_errno + std::string(file_name); + PyErr_SetString(PyExc_Exception, mnn_errno.c_str()); + } + + return toPyObj(m_ptr); +} + static PyObject* PyMNN_Module_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { PyMNN_Module *self = (PyMNN_Module *)type->tp_alloc(type, 0); self->ptr = Module::createEmpty({}); @@ -50,10 +87,31 @@ static PyObject* PyMNN_Module_forward(PyMNN_Module *self, PyObject *args) { Py_RETURN_NONE; } if (isVars(input)) { +#ifdef PYMNN_INTERNAL_SERVING + int status = 0; + Timer timer; + auto vars = self->ptr->onForward(toVars(input)); + if (vars.empty()) { + PyMNN_ERROR("module onForward occur error."); + status = -1; + } + + (void) MonitorService::GetInstance().EventTrack(self->ptr, timer, status, "PyMNN_Module_forward"); + return toPyObj(vars); +#else return toPyObj(self->ptr->onForward(toVars(input))); +#endif } if (isVar(input)) { +#ifdef PYMNN_INTERNAL_SERVING + int status = 0; + Timer timer; + auto var = self->ptr->forward(toVar(input)); + (void) MonitorService::GetInstance().EventTrack(self->ptr, timer, status, 
"PyMNN_Module_forward"); + return toPyObj(var); +#else return toPyObj(self->ptr->forward(toVar(input))); +#endif } PyMNN_ERROR("PyMNN_Module_forward: args must be Var/[Var]."); } @@ -62,8 +120,22 @@ static PyObject* PyMNN_Module_onForward(PyMNN_Module *self, PyObject *args) { if (!PyArg_ParseTuple(args, "O", &inputs)) { Py_RETURN_NONE; } +#ifdef PYMNN_INTERNAL_SERVING + int status = 0; + Timer timer; + auto vars = self->ptr->onForward(toVars(inputs)); + if (vars.empty()) { + PyMNN_ERROR("module onForward occur error."); + status = -1; + } + + (void) MonitorService::GetInstance().EventTrack(self->ptr, timer, status, "PyMNN_Module_onForward"); + return toPyObj(vars); +#else return toPyObj(self->ptr->onForward(toVars(inputs))); +#endif } + static PyObject* PyMNN_Module_set_name(PyMNN_Module *self, PyObject *args) { const char* name; if (!PyArg_ParseTuple(args, "s", &name)) { @@ -125,6 +197,11 @@ static PyObject* PyMNNNN_load_module(PyObject *self, PyObject *args) { return toPyObj(m); } static PyObject* PyMNNNN_load_module_from_file(PyObject *self, PyObject *args) { +#ifdef PYMNN_INTERNAL_SERVING + PyErr_SetString(PyExc_Exception, + "PyMNNNN_load_module_from_file: unsupported interface, should use load_module_from_file_with_token."); + return NULL; +#endif PyObject *inputs, *outputs, *backend, *memory_mode, *power_mode, *precision_mode; const char* file_name; int dynamic, shape_mutable, rearrange; @@ -135,30 +212,54 @@ static PyObject* PyMNNNN_load_module_from_file(PyObject *self, PyObject *args) { printf("PyArg_ParseTuple Error\n"); return NULL; } - BackendConfig backend_config; - backend_config.memory = toEnum(memory_mode); - backend_config.power = toEnum(power_mode); - backend_config.precision = toEnum(precision_mode); - Module::BackendInfo backend_info; - backend_info.type = toEnum(backend); - backend_info.config = &backend_config; - - Module::Config config; - config.dynamic = dynamic; - config.shapeMutable = shape_mutable; - config.rearrange = rearrange; - config.backend = &backend_info; - - auto converted_file_name = convertBytesEncodeIfNeed(file_name); - auto m_ptr = Module::load(toStrings(inputs), toStrings(outputs), converted_file_name.data(), &config); - if (m_ptr == nullptr) { - std::string mnn_errno = "load_module_from_file failed "; - mnn_errno = mnn_errno + std::string(file_name); - PyErr_SetString(PyExc_Exception, mnn_errno.c_str()); - } - return toPyObj(m_ptr); + return load_module(inputs, outputs, backend, memory_mode, power_mode, precision_mode, file_name, dynamic, + shape_mutable, rearrange, thread_num); } + +#ifdef PYMNN_INTERNAL_SERVING +static PyObject* PyMNNNN_load_module_from_file_with_token(PyObject *self, PyObject *args) { + PyObject *inputs, *outputs; + const char* file_name; + PyObject *backend = toPyObj(MNN_FORWARD_CPU); + PyObject *memory_mode = toPyObj(MemoryMode::Memory_Normal); + PyObject *power_mode = toPyObj(PowerMode::Power_Normal);; + PyObject *precision_mode = toPyObj(PrecisionMode::Precision_Normal);; + int dynamic = 0; + int shape_mutable = 0; + int rearrange = 0; + char *token = NULL; + char *scene = NULL; + char *app_key = NULL; + int thread_num = 1; + if (!PyArg_ParseTuple(args, "OOssss|iiiOOOOi", &inputs, &outputs, &file_name, &token, &scene, &app_key, &dynamic, + &shape_mutable, &rearrange, &backend, &memory_mode, &power_mode, &precision_mode, + &thread_num)) { + printf("PyArg_ParseTuple Error\n"); + return NULL; + } + + if (!token || !scene || !app_key) { + PyErr_SetString(PyExc_Exception, + "PyMNNNN_load_module_from_file_with_token: input invalid, 
token, scene or app_key is null."); + return NULL; + } + + MonitorService::GetInstance().Start(); + VerifyService::GetInstance().Start(); + bool ret = VerifyService::GetInstance().VerifyToken(std::string(token), std::string(scene), std::string(app_key)); + if (!ret) { + PyErr_SetString(PyExc_Exception, + "PyMNNNN_load_module_from_file_with_token: check token failed, return null module."); + return NULL; + } + + return load_module(inputs, outputs, backend, memory_mode, power_mode, precision_mode, file_name, dynamic, + shape_mutable, rearrange, thread_num); + +} +#endif + #ifdef PYMNN_TRAIN_API static PyObject* PyMNNNN_conv(PyObject *self, PyObject *args) { INTS default_1 = {1, 1}, default_0 = {0, 0}; @@ -221,10 +322,18 @@ static PyObject* PyMNNNN_dropout(PyObject *self, PyObject *args) { } #endif static PyMethodDef PyMNNNN_methods[] = { +#ifdef PYMNN_INTERNAL_SERVING + register_methods(NN, + load_module, "load_module([Var], [Var], bool)", + load_module_from_file_with_token, "load_module_from_file_with_token([string], [string], filename, bool, ...)", + load_module_from_file, "load_module_from_file([string], [string], filename, bool, ...)" + ) +#else register_methods(NN, load_module, "load_module([Var], [Var], bool)", load_module_from_file, "load_module_from_file([string], [string], filename, bool, ...)" ) +#endif #ifdef PYMNN_TRAIN_API register_methods(NN, conv, "conv Module", @@ -234,4 +343,4 @@ static PyMethodDef PyMNNNN_methods[] = { ) #endif }; -// NN Module End \ No newline at end of file +// NN Module End diff --git a/pymnn/src/util.h b/pymnn/src/util.h index c79db6db..5e3594ff 100644 --- a/pymnn/src/util.h +++ b/pymnn/src/util.h @@ -225,13 +225,16 @@ inline int getnpysize(int npy_type) { return 4; case NPY_DOUBLE: return 8; - case NPY_INT: - return 4; case NPY_INT64: return 8; case NPY_UINT8: return 1; default: + // NPY_INT(np.int) and NPY_INT32(np.int32) may be different enum on some platform + // use `if` instead of `switch case`(when NPY_INT is same as NPY_INT32, two same case value is not support) + if (npy_type == NPY_INT || npy_type == NPY_INT32) { + return 4; + } PyMNN_ERROR_LOG("does not support this npy_type"); return 0; } @@ -249,7 +252,7 @@ inline int getitemsize(int dtype, int npy_type) { } return 8; case DType_INT32: - if(npy_type != NPY_INT) { + if(npy_type != NPY_INT && npy_type != NPY_INT32) { PyMNN_ERROR_LOG("numpy type does not match"); } return 4; @@ -383,7 +386,7 @@ static bool isVec(PyObject* obj) { return Func(PyList_GetItem(obj, 0)); } else return true; } - return false; + return Func(obj); } static inline bool isInts(PyObject* obj) { return isInt(obj) || isVec(obj); @@ -438,6 +441,7 @@ static vector toVec(PyObject* obj) { } return values; } + values.push_back(Func(obj)); return values; } static inline std::vector toInts(PyObject* obj) { @@ -586,188 +590,185 @@ static void* toPtr(PyObject *obj, DType dtype, int64_t& total_length, void* data // just support COND = 0 or 1 #define arg_if(COND, THEN, ELSE) arg_concat(arg_if_, COND)(THEN, ELSE) #define expand_item_0(...) -#define expand_item_1(macro, context, key, value, ITEMS...) \ +#define expand_item_1(macro, context, key, value, ...) \ macro(context, key, value) -#define expand_item_2(macro, context, key, value, ITEMS...) \ +#define expand_item_2(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_1(macro, context, ITEMS) -#define expand_item_3(macro, context, key, value, ITEMS...) \ + expand_item_1(macro, context, __VA_ARGS__) +#define expand_item_3(macro, context, key, value, ...) 
\ macro(context, key, value) \ - expand_item_2(macro, context, ITEMS) -#define expand_item_4(macro, context, key, value, ITEMS...) \ + expand_item_2(macro, context, __VA_ARGS__) +#define expand_item_4(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_3(macro, context, ITEMS) -#define expand_item_5(macro, context, key, value, ITEMS...) \ + expand_item_3(macro, context, __VA_ARGS__) +#define expand_item_5(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_4(macro, context, ITEMS) -#define expand_item_6(macro, context, key, value, ITEMS...) \ + expand_item_4(macro, context, __VA_ARGS__) +#define expand_item_6(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_5(macro, context, ITEMS) -#define expand_item_7(macro, context, key, value, ITEMS...) \ + expand_item_5(macro, context, __VA_ARGS__) +#define expand_item_7(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_6(macro, context, ITEMS) -#define expand_item_8(macro, context, key, value, ITEMS...) \ + expand_item_6(macro, context, __VA_ARGS__) +#define expand_item_8(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_7(macro, context, ITEMS) -#define expand_item_9(macro, context, key, value, ITEMS...) \ + expand_item_7(macro, context, __VA_ARGS__) +#define expand_item_9(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_8(macro, context, ITEMS) -#define expand_item_10(macro, context, key, value, ITEMS...) \ + expand_item_8(macro, context, __VA_ARGS__) +#define expand_item_10(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_9(macro, context, ITEMS) -#define expand_item_11(macro, context, key, value, ITEMS...) \ + expand_item_9(macro, context, __VA_ARGS__) +#define expand_item_11(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_10(macro, context, ITEMS) -#define expand_item_12(macro, context, key, value, ITEMS...) \ + expand_item_10(macro, context, __VA_ARGS__) +#define expand_item_12(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_11(macro, context, ITEMS) -#define expand_item_13(macro, context, key, value, ITEMS...) \ + expand_item_11(macro, context, __VA_ARGS__) +#define expand_item_13(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_12(macro, context, ITEMS) -#define expand_item_14(macro, context, key, value, ITEMS...) \ + expand_item_12(macro, context, __VA_ARGS__) +#define expand_item_14(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_13(macro, context, ITEMS) -#define expand_item_15(macro, context, key, value, ITEMS...) \ + expand_item_13(macro, context, __VA_ARGS__) +#define expand_item_15(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_14(macro, context, ITEMS) -#define expand_item_16(macro, context, key, value, ITEMS...) \ + expand_item_14(macro, context, __VA_ARGS__) +#define expand_item_16(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_15(macro, context, ITEMS) -#define expand_item_17(macro, context, key, value, ITEMS...) \ + expand_item_15(macro, context, __VA_ARGS__) +#define expand_item_17(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_16(macro, context, ITEMS) -#define expand_item_18(macro, context, key, value, ITEMS...) \ + expand_item_16(macro, context, __VA_ARGS__) +#define expand_item_18(macro, context, key, value, ...) 
\ macro(context, key, value) \ - expand_item_17(macro, context, ITEMS) -#define expand_item_19(macro, context, key, value, ITEMS...) \ + expand_item_17(macro, context, __VA_ARGS__) +#define expand_item_19(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_18(macro, context, ITEMS) -#define expand_item_20(macro, context, key, value, ITEMS...) \ + expand_item_18(macro, context, __VA_ARGS__) +#define expand_item_20(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_19(macro, context, ITEMS) -#define expand_item_21(macro, context, key, value, ITEMS...) \ + expand_item_19(macro, context, __VA_ARGS__) +#define expand_item_21(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_20(macro, context, ITEMS) -#define expand_item_22(macro, context, key, value, ITEMS...) \ + expand_item_20(macro, context, __VA_ARGS__) +#define expand_item_22(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_21(macro, context, ITEMS) -#define expand_item_23(macro, context, key, value, ITEMS...) \ + expand_item_21(macro, context, __VA_ARGS__) +#define expand_item_23(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_22(macro, context, ITEMS) -#define expand_item_24(macro, context, key, value, ITEMS...) \ + expand_item_22(macro, context, __VA_ARGS__) +#define expand_item_24(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_23(macro, context, ITEMS) -#define expand_item_24(macro, context, key, value, ITEMS...) \ + expand_item_23(macro, context, __VA_ARGS__) +#define expand_item_25(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_23(macro, context, ITEMS) -#define expand_item_25(macro, context, key, value, ITEMS...) \ + expand_item_24(macro, context, __VA_ARGS__) +#define expand_item_26(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_24(macro, context, ITEMS) -#define expand_item_26(macro, context, key, value, ITEMS...) \ + expand_item_25(macro, context, __VA_ARGS__) +#define expand_item_27(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_25(macro, context, ITEMS) -#define expand_item_27(macro, context, key, value, ITEMS...) \ + expand_item_26(macro, context, __VA_ARGS__) +#define expand_item_28(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_26(macro, context, ITEMS) -#define expand_item_28(macro, context, key, value, ITEMS...) \ + expand_item_27(macro, context, __VA_ARGS__) +#define expand_item_29(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_27(macro, context, ITEMS) -#define expand_item_29(macro, context, key, value, ITEMS...) \ + expand_item_28(macro, context, __VA_ARGS__) +#define expand_item_30(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_28(macro, context, ITEMS) -#define expand_item_30(macro, context, key, value, ITEMS...) \ + expand_item_29(macro, context, __VA_ARGS__) +#define expand_item_31(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_29(macro, context, ITEMS) -#define expand_item_31(macro, context, key, value, ITEMS...) \ + expand_item_30(macro, context, __VA_ARGS__) +#define expand_item_32(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_30(macro, context, ITEMS) -#define expand_item_32(macro, context, key, value, ITEMS...) 
\ + expand_item_31(macro, context, __VA_ARGS__) +#define expand_item_33(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_31(macro, context, ITEMS) -#define expand_item_33(macro, context, key, value, ITEMS...) \ + expand_item_32(macro, context, __VA_ARGS__) +#define expand_item_34(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_32(macro, context, ITEMS) -#define expand_item_34(macro, context, key, value, ITEMS...) \ + expand_item_33(macro, context, __VA_ARGS__) +#define expand_item_35(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_33(macro, context, ITEMS) -#define expand_item_35(macro, context, key, value, ITEMS...) \ + expand_item_34(macro, context, __VA_ARGS__) +#define expand_item_36(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_34(macro, context, ITEMS) -#define expand_item_36(macro, context, key, value, ITEMS...) \ + expand_item_35(macro, context, __VA_ARGS__) +#define expand_item_37(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_35(macro, context, ITEMS) -#define expand_item_37(macro, context, key, value, ITEMS...) \ + expand_item_36(macro, context, __VA_ARGS__) +#define expand_item_38(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_36(macro, context, ITEMS) -#define expand_item_38(macro, context, key, value, ITEMS...) \ + expand_item_37(macro, context, __VA_ARGS__) +#define expand_item_39(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_37(macro, context, ITEMS) -#define expand_item_39(macro, context, key, value, ITEMS...) \ + expand_item_38(macro, context, __VA_ARGS__) +#define expand_item_40(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_38(macro, context, ITEMS) -#define expand_item_40(macro, context, key, value, ITEMS...) \ + expand_item_39(macro, context, __VA_ARGS__) +#define expand_item_41(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_39(macro, context, ITEMS) -#define expand_item_41(macro, context, key, value, ITEMS...) \ + expand_item_40(macro, context, __VA_ARGS__) +#define expand_item_42(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_40(macro, context, ITEMS) -#define expand_item_42(macro, context, key, value, ITEMS...) \ + expand_item_41(macro, context, __VA_ARGS__) +#define expand_item_43(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_41(macro, context, ITEMS) -#define expand_item_43(macro, context, key, value, ITEMS...) \ + expand_item_42(macro, context, __VA_ARGS__) +#define expand_item_44(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_42(macro, context, ITEMS) -#define expand_item_44(macro, context, key, value, ITEMS...) \ + expand_item_43(macro, context, __VA_ARGS__) +#define expand_item_45(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_43(macro, context, ITEMS) -#define expand_item_45(macro, context, key, value, ITEMS...) \ + expand_item_44(macro, context, __VA_ARGS__) +#define expand_item_46(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_44(macro, context, ITEMS) -#define expand_item_46(macro, context, key, value, ITEMS...) \ + expand_item_45(macro, context, __VA_ARGS__) +#define expand_item_47(macro, context, key, value, ...) 
\ macro(context, key, value) \ - expand_item_45(macro, context, ITEMS) -#define expand_item_47(macro, context, key, value, ITEMS...) \ + expand_item_46(macro, context, __VA_ARGS__) +#define expand_item_48(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_46(macro, context, ITEMS) -#define expand_item_48(macro, context, key, value, ITEMS...) \ + expand_item_47(macro, context, __VA_ARGS__) +#define expand_item_49(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_47(macro, context, ITEMS) -#define expand_item_49(macro, context, key, value, ITEMS...) \ + expand_item_48(macro, context, __VA_ARGS__) +#define expand_item_50(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_48(macro, context, ITEMS) -#define expand_item_50(macro, context, key, value, ITEMS...) \ + expand_item_49(macro, context, __VA_ARGS__) +#define expand_item_51(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_49(macro, context, ITEMS) -#define expand_item_51(macro, context, key, value, ITEMS...) \ + expand_item_50(macro, context, __VA_ARGS__) +#define expand_item_52(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_50(macro, context, ITEMS) -#define expand_item_52(macro, context, key, value, ITEMS...) \ + expand_item_51(macro, context, __VA_ARGS__) +#define expand_item_53(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_51(macro, context, ITEMS) -#define expand_item_53(macro, context, key, value, ITEMS...) \ + expand_item_52(macro, context, __VA_ARGS__) +#define expand_item_54(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_52(macro, context, ITEMS) -#define expand_item_54(macro, context, key, value, ITEMS...) \ + expand_item_53(macro, context, __VA_ARGS__) +#define expand_item_55(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_53(macro, context, ITEMS) -#define expand_item_55(macro, context, key, value, ITEMS...) \ + expand_item_54(macro, context, __VA_ARGS__) +#define expand_item_56(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_54(macro, context, ITEMS) -#define expand_item_56(macro, context, key, value, ITEMS...) \ + expand_item_55(macro, context, __VA_ARGS__) +#define expand_item_57(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_55(macro, context, ITEMS) -#define expand_item_57(macro, context, key, value, ITEMS...) \ + expand_item_56(macro, context, __VA_ARGS__) +#define expand_item_58(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_56(macro, context, ITEMS) -#define expand_item_58(macro, context, key, value, ITEMS...) \ + expand_item_57(macro, context, __VA_ARGS__) +#define expand_item_59(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_57(macro, context, ITEMS) -#define expand_item_59(macro, context, key, value, ITEMS...) \ + expand_item_58(macro, context, __VA_ARGS__) +#define expand_item_60(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_58(macro, context, ITEMS) -#define expand_item_60(macro, context, key, value, ITEMS...) \ - macro(context, key, value) \ - expand_item_59(macro, context, ITEMS) + expand_item_59(macro, context, __VA_ARGS__) #define expand_items(macro, context, ...) 
\ arg_concat(expand_item_, arg_half_size(__VA_ARGS__))(macro, context, __VA_ARGS__) //------------------------ macro_utils end ------------------------- @@ -790,18 +791,6 @@ static PyObject* PyEnum_new(struct _typeobject *type, PyObject *args, PyObject * Py_hash_t PyEnum_hash(PyObject* x) { return static_cast(((PyMNNEnum*)x)->value); } -PyObject *PyEnum_richcompare(PyObject *self, PyObject *other, int op) { - int l = ((PyMNNEnum*)self)->value, r = ((PyMNNEnum*)other)->value; - switch (op) { - case Py_LT: return toPyObj(l < r); - case Py_LE: return toPyObj(l <= r); - case Py_EQ: return toPyObj(l == r); - case Py_NE: return toPyObj(l != r); - case Py_GT: return toPyObj(l > r); - case Py_GE: return toPyObj(l >= r); - } - Py_RETURN_NONE; -} static PyObject* toPyEnum(PyObject* type, int val) { auto args = PyTuple_New(1); PyTuple_SetItem((PyObject*)args, 0, PyLong_FromLong((long)val)); @@ -825,11 +814,11 @@ static T toEnum(PyObject* e) { PyObject_SetAttrString(scope, value, toPyObj(key)); \ PyDict_SetItemString(dict, value, toPyObj(key)); -#define def_enum_repr(NAME, ITEMS...) \ +#define def_enum_repr(NAME, ...) \ static PyObject* PyEnum_##NAME##_repr(PyObject *self) { \ std::string str = #NAME "."; \ std::map items = { \ - expand_items(declare_map_item, _, ITEMS) \ + expand_items(declare_map_item, _, __VA_ARGS__) \ }; \ int key = ((PyMNNEnum*)self)->value; \ auto iter = items.find(key); \ @@ -839,22 +828,23 @@ static PyObject* PyEnum_##NAME##_repr(PyObject *self) { \ #define def_enum_to(NAME, TYPE) \ static PyObject* toPyObj(TYPE value) { \ - return toPyEnum((PyObject*)&PyEnum_##NAME, static_cast(value)); \ + return toPyEnum((PyObject*)PyType_FindTLSType(&PyEnum_##NAME), static_cast(value)); \ } -#define def_enum_register(NAME, ITEMS...) \ +#define def_enum_register(NAME, ...) \ static void def_##NAME(PyObject *scope) { \ - if (PyType_Ready(&PyEnum_##NAME) < 0) { \ + if (PyType_Ready(PyType_FindTLSType(&PyEnum_##NAME)) < 0) { \ PyErr_SetString(PyExc_Exception, "init " #NAME ": PyType_Ready failed"); \ } \ - PyObject* self = (PyObject *)&PyEnum_##NAME; \ + PyObject* self = (PyObject *)PyType_FindTLSType(&PyEnum_##NAME); \ PyObject* dict = PyEnum_##NAME.tp_dict; \ PyModule_AddObject(scope, #NAME, self); \ - expand_items(register_item, NAME, ITEMS) \ + expand_items(register_item, NAME, __VA_ARGS__) \ } -#define def_enum(NAME, TYPE, ITEMS...) \ -def_enum_repr(NAME, ITEMS) \ +#define def_enum(NAME, TYPE, ...) 
\ +def_enum_repr(NAME, __VA_ARGS__) \ +PyObject *PyEnum_##NAME##richcompare(PyObject *self, PyObject *other, int op); \ static PyTypeObject PyEnum_##NAME = { \ PyVarObject_HEAD_INIT(NULL, 0) \ #NAME, /*tp_name*/\ @@ -879,7 +869,7 @@ static PyTypeObject PyEnum_##NAME = { \ "PyMNNEnum", /*tp_doc*/\ 0, /*tp_traverse*/\ 0, /*tp_clear*/\ - &PyEnum_richcompare, /*tp_richcompare*/\ + &PyEnum_##NAME##richcompare, /*tp_richcompare*/\ 0, /*tp_weaklistoffset*/\ 0, /*tp_iter*/\ 0, /*tp_iternext*/\ @@ -895,9 +885,22 @@ static PyTypeObject PyEnum_##NAME = { \ 0, /*tp_alloc*/\ PyEnum_new /*tp_new*/\ };\ -static inline bool is##NAME(PyObject* obj) { return PyObject_IsInstance(obj, (PyObject*)&PyEnum_##NAME); } \ +static inline bool is##NAME(PyObject* obj) { return Py_TYPE(obj) == PyType_FindTLSType(&PyEnum_##NAME); } \ +PyObject *PyEnum_##NAME##richcompare(PyObject *self, PyObject *other, int op) { \ + if (!is##NAME(other)) Py_RETURN_FALSE; \ + int l = ((PyMNNEnum*)self)->value, r = ((PyMNNEnum*)other)->value; \ + switch (op) { \ + case Py_LT: return toPyObj(l < r); \ + case Py_LE: return toPyObj(l <= r); \ + case Py_EQ: return toPyObj(l == r); \ + case Py_NE: return toPyObj(l != r); \ + case Py_GT: return toPyObj(l > r); \ + case Py_GE: return toPyObj(l >= r); \ + } \ + Py_RETURN_FALSE; \ +} \ def_enum_to(NAME, TYPE) \ -def_enum_register(NAME, ITEMS) +def_enum_register(NAME, __VA_ARGS__) // ------------------------ enum end -------------------------- // ------------------------ func start ------------------------ #define def_methods(MODULE, NAME) \ @@ -996,10 +999,10 @@ static PyObject* PyMNN##SCOPE##_##NAME(PyObject *self, PyObject *args) { \ #define def_class_register(NAME) \ static void def_##NAME(PyObject *scope) { \ - if (PyType_Ready(&PyMNN##NAME##Type) < 0) { \ + if (PyType_Ready(PyType_FindTLSType(&PyMNN##NAME##Type)) < 0) { \ PyErr_SetString(PyExc_Exception, "init" #NAME ": PyType_Ready PyMNN" #NAME "Type failed"); \ } \ - PyObject* self = (PyObject *)&PyMNN##NAME##Type; \ + PyObject* self = (PyObject *)PyType_FindTLSType(&PyMNN##NAME##Type); \ PyModule_AddObject(scope, #NAME, self); \ } @@ -1071,7 +1074,7 @@ static PyTypeObject PyMNN##NAME##Type = { \ };\ def_class_register(NAME) \ static PyMNN##NAME* get##NAME() { \ - return (PyMNN##NAME *)PyObject_Call((PyObject*)&PyMNN##NAME##Type, PyTuple_New(0), NULL); \ + return (PyMNN##NAME *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNN##NAME##Type), PyTuple_New(0), NULL); \ } \ static PyObject* toPyObj(TYPE* x) { \ auto ret = get##NAME(); \ diff --git a/pymnn/test/model_test.py b/pymnn/test/model_test.py index 78bd5e51..df939b8c 100644 --- a/pymnn/test/model_test.py +++ b/pymnn/test/model_test.py @@ -1,3 +1,4 @@ +# -*- coding: UTF-8 -*- import os import sys import MNN @@ -10,7 +11,11 @@ def parseConfig(root_dir): configName = os.path.join(root_dir, 'config.txt') if not os.path.exists(configName): return False - config = open(configName, 'rt') + try: + config = open(configName, 'rt', encoding='utf-8') + except: + import io + config = io.open(configName, 'rt', encoding='utf-8') res = {} res['model_name'] = os.path.join(root_dir, 'temp.bin') for line in config.readlines(): diff --git a/pymnn/test/unit_test.py b/pymnn/test/unit_test.py index 4d88657f..ac94db91 100644 --- a/pymnn/test/unit_test.py +++ b/pymnn/test/unit_test.py @@ -465,6 +465,14 @@ class UnitTest(unittest.TestCase): self.assertEqualVar(expr.range(start, limit, delta), np.arange(0.0, 2.0, 0.3)) def test_depth_to_space(self): self.assertEqualVar(expr.depth_to_space(self.x, 2), 
torch.pixel_shuffle(self._x, 2)) + def test_sort(self): + x = mp.array([5, -1, 2, 0]) + x_ = np.array([5, -1, 2, 0]) + self.assertEqualVar(expr.sort(x), np.sort(x_)) + def test_raster(self): + x = mp.array([[1, 2], [3, 4]]) + x_ = np.array([[1, 2], [3, 4]]) + self.assertEqualVar(expr.raster([x], [0, 1, 1, 2, 0, 1, 2, 1, 1, 2, 2], [2, 2]), x_.transpose()) def test_detection_post_process(self): pass # test cv @@ -643,6 +651,40 @@ class UnitTest(unittest.TestCase): x = cv.threshold(self.imgf, 50, 20, cv.THRESH_BINARY) y = cv2.threshold(self.imgf_, 50, 20, cv2.THRESH_BINARY)[1] self.assertEqualImg(x, y) + # draw + def test_Draw(self): + x = self.img.copy() + y = self.img_.copy() + # 1. arrowedLine + cv.arrowedLine(x, [10, 10], [40, 40], [255, 0, 0]) + cv2.arrowedLine(y, [10, 10], [40, 40], [255, 0, 0]) + # 2. line + cv.line(x, [20, 30], [50, 60], [0, 0, 255]) + cv2.line(y, [20, 30], [50, 60], [0, 0, 255]) + # 3. circle + cv.circle(x, [70, 70], 30, [0, 255, 0]) + cv2.circle(y, [70, 70], 30, [0, 255, 0]) + # 4. rectangle + cv.rectangle(x, [80, 80], [120, 120], [0, 0, 255]) + cv2.rectangle(y, [80, 80], [120, 120], [0, 0, 255]) + # get contours + y_ = cv2.cvtColor(y, cv2.COLOR_BGR2GRAY) + y_ = cv2.threshold(y_, 127, 255, cv2.THRESH_BINARY)[1] + c_, _ = cv2.findContours(y_, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + c = [] + for a in c_: + ps = [] + for b in a: + ps.append(int(b[0,0])) + ps.append(int(b[0,1])) + c.append(ps) + # 5. fillPoly + cv.fillPoly(x, c, [255, 0, 0]) + cv2.fillPoly(y, c_, [255, 0, 0]) + # 6. drawContours + cv.drawContours(x, c, -1, [0, 0, 255]) + cv2.drawContours(y, c_, -1, [0, 0, 255]) + self.assertEqualImg(x, y) # structural def test_Structural(self): x = mp.array([[0,0,0,0,0,0,0,0,0,0,0,0,0], @@ -661,17 +703,20 @@ class UnitTest(unittest.TestCase): contours_, _ = cv2.findContours(x_, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) contour = contours[0] contour_ = contours_[0] - self.assertEqualPoints(contour, contour_) + self.assertEqualVar(contour, contour_) self.assertEqual(cv.contourArea(contour), cv2.contourArea(contour_)) hull = cv.convexHull(contour) hull_ = cv2.convexHull(contour_) - self.assertEqualPoints(hull, hull_) + if version_info.major < 3: hull_ = np.concatenate([hull_[-1::, :], hull_[:-1,:]]) + self.assertEqualVar(hull, hull_) rect = cv.minAreaRect(contour) rect_ = cv2.minAreaRect(contour_) - self.assertEqual(rect, rect_) - points = cv.boxPoints(rect), + if version_info.major >= 3: + self.assertEqual(rect, rect_) + points = cv.boxPoints(rect) points_ = cv2.boxPoints(rect_) - self.assertEqualPoints(points, points_) + if version_info.major >= 3: + self.assertEqualVar(points, points_) self.assertEqual(tuple(cv.boundingRect(contour)), cv2.boundingRect(contour_)) ret, labels, statsv, centroids = cv.connectedComponentsWithStats(x) ret_, labels_, statsv_, centroids_ = cv2.connectedComponentsWithStats(x_) @@ -689,6 +734,16 @@ class UnitTest(unittest.TestCase): x = cv.hconcat([self.img, self.img]) y = cv2.hconcat([self.img_, self.img_]) self.assertEqualImg(x, y) + def test_rotate(self): + x = cv.rotate(self.img, cv.ROTATE_90_CLOCKWISE) + y = cv2.rotate(self.img_, cv2.ROTATE_90_CLOCKWISE) + self.assertEqualImg(x, y) + x = cv.rotate(self.img, cv.ROTATE_180) + y = cv2.rotate(self.img_, cv2.ROTATE_180) + self.assertEqualImg(x, y) + x = cv.rotate(self.img, cv.ROTATE_90_COUNTERCLOCKWISE) + y = cv2.rotate(self.img_, cv2.ROTATE_90_COUNTERCLOCKWISE) + self.assertEqualImg(x, y) # numpy def test_from_shape_or_value(self): x = mp.zeros([2, 2]) @@ -724,6 +779,9 @@ class 
UnitTest(unittest.TestCase): self.assertEqualVar(mp.linspace(2.0, 3.0, num=5, endpoint=False), np.linspace(2.0, 3.0, num=5, endpoint=False)) self.assertEqualVar(mp.logspace(2.0, 3.0, num=4, endpoint=False), np.logspace(2.0, 3.0, num=4, endpoint=False)) self.assertEqualVar(mp.geomspace(1, 1000, num=4, endpoint=False), np.geomspace(1, 1000, num=4, endpoint=False)) + x = mp.arange(-5, 5., 0.1) + y = np.arange(-5, 5., 0.1) + self.assertEqualVars(mp.meshgrid(x, x), np.meshgrid(y, y)) def test_changing_array_shape(self): x = mp.zeros((3, 2)) x_ = np.zeros((3, 2)) @@ -916,6 +974,11 @@ class UnitTest(unittest.TestCase): self.assertEqualShape(mp.random.randn(2,3).shape, np.random.randn(2,3).shape) self.assertEqualShape(mp.random.rand(3,2).shape, np.random.rand(3,2).shape) self.assertEqualShape(mp.random.randint(0, 2, [2,3]).shape, np.random.randint(0, 2, [2,3]).shape) + def test_sorting(self): + x = mp.array([[1,0,3], [0,6,5]]) + x_ = np.array([[1,0,3], [0,6,5]]) + self.assertEqualVar(mp.sort(x), np.sort(x_)) + self.assertEqualVar(mp.argsort(x), np.argsort(x_)) def test_searching_counting(self): x = mp.array([[1,0,3], [0,6,5]]) x_ = np.array([[1,0,3], [0,6,5]]) @@ -980,10 +1043,12 @@ class UnitTest(unittest.TestCase): self.assertAlmostEqual(x.var(), x_.var()) self.assertEqualVar(x.var(0), x_.var(0)) self.assertEqual(len(x), len(x_)) - self.assertEqual(x[0,1].read_as_tuple()[0], x_[0,1]) + self.assertEqual(x[0,1], x_[0,1]) self.assertEqualVar(x[0], x_[0]) self.assertEqualVar(x[:], x_[:]) self.assertEqualVar(x[:1], x_[:1]) self.assertEqualVar(x[::-1], x_[::-1]) + self.assertEqualVar(x[x > 2], x_[x_ > 2]) + self.assertEqualVar(x[mp.array([1])], x_[np.array([1])]) if __name__ == '__main__': unittest.main() diff --git a/schema/current/UserDefine_generated.h b/schema/current/UserDefine_generated.h index a072be54..2143b607 100644 --- a/schema/current/UserDefine_generated.h +++ b/schema/current/UserDefine_generated.h @@ -376,13 +376,15 @@ struct ImageProcessParamT : public flatbuffers::NativeTable { int8_t paddingValue; std::vector shape; DataType outputType; + bool draw; ImageProcessParamT() : filterType(FilterType_NEAREST), sourceFormat(ImageFormatType_RGBA), destFormat(ImageFormatType_RGBA), wrap(WrapType_CLAMP_TO_EDGE), paddingValue(0), - outputType(DataType_DT_INVALID) { + outputType(DataType_DT_INVALID), + draw(false) { } }; @@ -421,6 +423,9 @@ struct ImageProcessParam FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { DataType outputType() const { return static_cast(GetField(22, 0)); } + bool draw() const { + return GetField(24, 0) != 0; + } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyField(verifier, 4) && @@ -437,6 +442,7 @@ struct ImageProcessParam FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyOffset(verifier, 20) && verifier.VerifyVector(shape()) && VerifyField(verifier, 22) && + VerifyField(verifier, 24) && verifier.EndTable(); } ImageProcessParamT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -477,6 +483,9 @@ struct ImageProcessParamBuilder { void add_outputType(DataType outputType) { fbb_.AddElement(22, static_cast(outputType), 0); } + void add_draw(bool draw) { + fbb_.AddElement(24, static_cast(draw), 0); + } explicit ImageProcessParamBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -500,7 +509,8 @@ inline flatbuffers::Offset CreateImageProcessParam( flatbuffers::Offset> transform = 0, int8_t paddingValue = 0, flatbuffers::Offset> shape = 0, - 
DataType outputType = DataType_DT_INVALID) { + DataType outputType = DataType_DT_INVALID, + bool draw = false) { ImageProcessParamBuilder builder_(_fbb); builder_.add_outputType(outputType); builder_.add_shape(shape); @@ -509,6 +519,7 @@ inline flatbuffers::Offset CreateImageProcessParam( builder_.add_mean(mean); builder_.add_destFormat(destFormat); builder_.add_sourceFormat(sourceFormat); + builder_.add_draw(draw); builder_.add_paddingValue(paddingValue); builder_.add_wrap(wrap); builder_.add_filterType(filterType); @@ -597,6 +608,7 @@ inline void ImageProcessParam::UnPackTo(ImageProcessParamT *_o, const flatbuffer { auto _e = paddingValue(); _o->paddingValue = _e; }; { auto _e = shape(); if (_e) { _o->shape.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->shape[_i] = _e->Get(_i); } } }; { auto _e = outputType(); _o->outputType = _e; }; + { auto _e = draw(); _o->draw = _e; }; } inline flatbuffers::Offset ImageProcessParam::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ImageProcessParamT* _o, const flatbuffers::rehasher_function_t *_rehasher) { @@ -617,6 +629,7 @@ inline flatbuffers::Offset CreateImageProcessParam(flatbuffer auto _paddingValue = _o->paddingValue; auto _shape = _o->shape.size() ? _fbb.CreateVector(_o->shape) : 0; auto _outputType = _o->outputType; + auto _draw = _o->draw; return MNN::CreateImageProcessParam( _fbb, _filterType, @@ -628,7 +641,8 @@ inline flatbuffers::Offset CreateImageProcessParam(flatbuffer _transform, _paddingValue, _shape, - _outputType); + _outputType, + _draw); } inline const flatbuffers::TypeTable *SampleModeTypeTable() { @@ -803,7 +817,8 @@ inline const flatbuffers::TypeTable *ImageProcessParamTypeTable() { { flatbuffers::ET_FLOAT, 1, -1 }, { flatbuffers::ET_CHAR, 0, -1 }, { flatbuffers::ET_INT, 1, -1 }, - { flatbuffers::ET_INT, 0, 3 } + { flatbuffers::ET_INT, 0, 3 }, + { flatbuffers::ET_BOOL, 0, -1 } }; static const flatbuffers::TypeFunction type_refs[] = { FilterTypeTypeTable, @@ -821,10 +836,11 @@ inline const flatbuffers::TypeTable *ImageProcessParamTypeTable() { "transform", "paddingValue", "shape", - "outputType" + "outputType", + "draw" }; static const flatbuffers::TypeTable tt = { - flatbuffers::ST_TABLE, 10, type_codes, type_refs, nullptr, names + flatbuffers::ST_TABLE, 11, type_codes, type_refs, nullptr, names }; return &tt; } diff --git a/schema/default/UserDefine.fbs b/schema/default/UserDefine.fbs index 2b7a0ed5..f07737a5 100644 --- a/schema/default/UserDefine.fbs +++ b/schema/default/UserDefine.fbs @@ -62,4 +62,5 @@ table ImageProcessParam { paddingValue:byte = 0; shape:[int]; // shape: [N, C, H, W] outputType:DataType; + draw:bool = false; } diff --git a/source/backend/cpu/CPUBackend.cpp b/source/backend/cpu/CPUBackend.cpp index f4bf4059..cf01b44f 100644 --- a/source/backend/cpu/CPUBackend.cpp +++ b/source/backend/cpu/CPUBackend.cpp @@ -170,7 +170,7 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p mPrecisionMode = precision; mCoreFunctions = MNNGetCoreFunctions(); mInt8CoreFunctions = MNNGetInt8CoreFunctions(); - mCache = new CPUResizeCache(this); + mCache = new CPUResizeCache; } CPUBackend::~CPUBackend() { diff --git a/source/backend/cpu/CPUImageProcess.cpp b/source/backend/cpu/CPUImageProcess.cpp index ca30a42a..cdc0d9d9 100644 --- a/source/backend/cpu/CPUImageProcess.cpp +++ b/source/backend/cpu/CPUImageProcess.cpp @@ -87,6 +87,19 @@ BLITTER CPUImageProcess::choose(ImageFormatType source, ImageFormatType dest) { return nullptr; } +BLITTER 
CPUImageProcess::choose(int channelByteSize) { + switch (channelByteSize) { + case 4: + return MNNC4blitH; + case 3: + return MNNC3blitH; + case 1: + return MNNC1blitH; + default: + return nullptr; + } +} + SAMPLER CPUImageProcess::choose(ImageFormatType format, FilterType type, bool identity) { if (identity) { switch (format) { @@ -271,10 +284,21 @@ static std::pair _computeClip(CV::Point* points, int iw, int ih, const } ErrorCode CPUImageProcess::onResize(const std::vector &inputs, const std::vector &outputs) { - auto input = inputs[0], output = outputs[0]; - ih = input->height(); - iw = input->width(); - ic = input->channel(); + auto input = inputs[0]; + if (input->dimensions() == 3) { + ih = input->length(0); + iw = input->length(1); + ic = input->length(2); + } else { + ih = input->height(); + iw = input->width(); + ic = input->channel(); + } + if (draw) { + blitter = choose(ic * inputs[0]->getType().bytes()); + return NO_ERROR; + } + auto output = outputs[0]; oh = output->height(); ow = output->width(); oc = output->channel(); @@ -321,15 +345,37 @@ ErrorCode CPUImageProcess::onResize(const std::vector &inputs, const s ErrorCode CPUImageProcess::onExecute(const std::vector &inputs, const std::vector &outputs) { auto source = inputs[0]->host(); - auto dest = outputs[0]->host(); + void* dest = nullptr; CV::Point points[2]; - int tileCount = UP_DIV(ow, CACHE_SIZE); auto destBytes = dtype.bytes(); - for (int dy = 0; dy < oh; ++dy) { + int tileCount = UP_DIV(ow, CACHE_SIZE); + const int* regions = nullptr; + if (draw) { + // change input to output + dest = source; + oh = inputs[1]->length(0); + ow = iw; + oc = ic; + destBytes = inputs[0]->getType().bytes(); + // draw one + tileCount = 1; + // src is color + samplerDest = inputs[2]->host(); + // get region info ptr + regions = inputs[1]->host(); + } else { + dest = outputs[0]->host(); + } + for (int i = 0; i < oh; ++i) { + int dy = draw ? 
regions[3 * i] : i; auto dstY = (uint8_t*)dest + dy * destBytes * ow * oc; for (int tIndex = 0; tIndex < tileCount; ++tIndex) { int xStart = tIndex * CACHE_SIZE; int count = std::min(CACHE_SIZE, ow - xStart); + if (draw) { + xStart = regions[3 * i + 1]; + count = regions[3 * i + 2] - xStart + 1; + } auto dstStart = dstY + destBytes * oc * xStart; if (!blitFloat) { @@ -340,7 +386,7 @@ ErrorCode CPUImageProcess::onExecute(const std::vector &inputs, const } // Sample - { + if (!draw) { // Compute position points[0].fX = xStart; points[0].fY = dy; diff --git a/source/backend/cpu/CPUImageProcess.hpp b/source/backend/cpu/CPUImageProcess.hpp index 91071f8c..ea8349c5 100644 --- a/source/backend/cpu/CPUImageProcess.hpp +++ b/source/backend/cpu/CPUImageProcess.hpp @@ -23,6 +23,10 @@ typedef void (*SAMPLER)(const unsigned char* source, unsigned char* dest, CV::Po class CPUImageProcess : public Execution { public: CPUImageProcess(CV::ImageProcess::Config config, const CoreFunctions* coreFunctions) : Execution(nullptr), coreFunctions(coreFunctions) { + if (config.draw) { + draw = true; + return; + } filterType = (FilterType)config.filterType; wrap = (WrapType)config.wrap; sourceFormat = (ImageFormatType)config.sourceFormat; @@ -40,6 +44,11 @@ public: paddingValue = val; } CPUImageProcess(Backend *bn, const ImageProcessParam* process) : Execution(bn) { + coreFunctions = static_cast(backend())->functions(); + draw = process->draw(); + if (draw) { + return; + } filterType = process->filterType(); wrap = process->wrap(); sourceFormat = process->sourceFormat(); @@ -53,12 +62,12 @@ public: transform.set(i, process->transform()->Get(i)); } transform.invert(&transformInvert); - coreFunctions = static_cast(backend())->functions(); } virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; private: BLITTER choose(ImageFormatType source, ImageFormatType dest); + BLITTER choose(int channelByteSize); BLIT_FLOAT choose(ImageFormatType format, int dstBpp = 0); SAMPLER choose(ImageFormatType format, FilterType type, bool identity); private: @@ -78,6 +87,7 @@ private: std::unique_ptr samplerBuffer, blitBuffer; uint8_t* samplerDest = nullptr, *blitDest = nullptr; const CoreFunctions* coreFunctions = nullptr; + bool draw = false; }; }; // namespace MNN diff --git a/source/backend/cpu/CPUNonMaxSuppressionV2.cpp b/source/backend/cpu/CPUNonMaxSuppressionV2.cpp index b50e49a0..41992e6d 100644 --- a/source/backend/cpu/CPUNonMaxSuppressionV2.cpp +++ b/source/backend/cpu/CPUNonMaxSuppressionV2.cpp @@ -117,6 +117,9 @@ ErrorCode CPUNonMaxSuppressionV2::onExecute(const std::vector& inputs, const auto scores = inputs[1]->host(); NonMaxSuppressionSingleClasssImpl(inputs[0], scores, maxDetections, iouThreshold, scoreThreshold, &selected); std::copy_n(selected.begin(), selected.size(), outputs[0]->host()); + for (int i = selected.size(); i < outputs[0]->elementSize(); i++) { + outputs[0]->host()[i] = -1; + } return NO_ERROR; } diff --git a/source/backend/cpu/CPUResizeCache.hpp b/source/backend/cpu/CPUResizeCache.hpp index d8b4dad6..aff4523d 100644 --- a/source/backend/cpu/CPUResizeCache.hpp +++ b/source/backend/cpu/CPUResizeCache.hpp @@ -6,11 +6,11 @@ #include "MNN_generated.h" namespace MNN { -class CPUBackend; -class CPUResizeCache { +// FIXME: Move outside +class MNN_PUBLIC CPUResizeCache { public: - CPUResizeCache(const CPUBackend* backend) { - mBackend = backend; + CPUResizeCache() { + // Do nothing } ~ 
CPUResizeCache() {
        // Do nothing
@@ -21,7 +21,6 @@ public:
     void reset();
 private:
     std::map<std::pair<const Tensor*, MNN_DATA_FORMAT>, std::shared_ptr<Tensor>> mFormatCache;
-    const CPUBackend* mBackend;
 };
 }
diff --git a/source/backend/cpu/CPUScatterNd.cpp b/source/backend/cpu/CPUScatterNd.cpp
index 7cf11755..94ac0e49 100644
--- a/source/backend/cpu/CPUScatterNd.cpp
+++ b/source/backend/cpu/CPUScatterNd.cpp
@@ -45,7 +45,7 @@ void ScatterNdImpl(const Tensor* indices, const Tensor* updates, const Tensor* s
         }
         if (valid) {
             for (int k = 0; k < accNumber; ++k) {
-                outputPtr[pos + k] += updatesPtr[i * accNumber + k];
+                outputPtr[pos + k] = updatesPtr[i * accNumber + k];
             }
         }
     }
@@ -59,7 +59,12 @@ ErrorCode CPUScatterNd::onExecute(const std::vector& inputs, const std:
     const int outputSize = output->size();
     auto outputRawPtr = output->host<uint8_t>();
-    memset(outputRawPtr, 0, outputSize);
+    if (inputs.size() < 4) {
+        memset(outputRawPtr, 0, outputSize);
+    } else {
+        auto inputRawPtr = inputs[3]->host<uint8_t>();
+        memcpy(outputRawPtr, inputRawPtr, outputSize);
+    }
     auto updatesDataType = updates->getType();
     if (updatesDataType == halide_type_of<float>()) {
diff --git a/source/backend/cpu/compute/ImageProcessFunction.cpp b/source/backend/cpu/compute/ImageProcessFunction.cpp
index 206a1743..4fa30af3 100644
--- a/source/backend/cpu/compute/ImageProcessFunction.cpp
+++ b/source/backend/cpu/compute/ImageProcessFunction.cpp
@@ -1065,3 +1065,21 @@ void MNNSamplerNV12Nearest(const unsigned char* source, unsigned char* dest, MNN
     auto countC2 = ((count + 1) / 2);
     _swapUV(destUV, destUV, countC2);
 }
+
+void MNNC3blitH(const unsigned char* source, unsigned char* dest, size_t count) {
+    for (int i = 0; i < count; i++) {
+        memcpy(dest + 3 * i, source, 3);
+    }
+}
+
+void MNNC4blitH(const unsigned char* source, unsigned char* dest, size_t count) {
+    for (int i = 0; i < count; i++) {
+        memcpy(dest + 4 * i, source, 4);
+    }
+}
+
+void MNNC1blitH(const unsigned char* source, unsigned char* dest, size_t count) {
+    for (int i = 0; i < count; i++) {
+        memcpy(dest + i, source, 1);
+    }
+}
diff --git a/source/backend/cpu/compute/ImageProcessFunction.hpp b/source/backend/cpu/compute/ImageProcessFunction.hpp
index 13b54b7d..23c8d90a 100644
--- a/source/backend/cpu/compute/ImageProcessFunction.hpp
+++ b/source/backend/cpu/compute/ImageProcessFunction.hpp
@@ -132,4 +132,8 @@ void MNNSamplerNV12Copy(const unsigned char* source, unsigned char* dest, MNN::C
                         size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
 void MNNSamplerNV12Nearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
                         size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
+// draw blit
+void MNNC1blitH(const unsigned char* source, unsigned char* dest, size_t count);
+void MNNC3blitH(const unsigned char* source, unsigned char* dest, size_t count);
+void MNNC4blitH(const unsigned char* source, unsigned char* dest, size_t count);
 #endif /* ImageProcessFunction_hpp */
diff --git a/source/backend/cpu/x86_x64/CMakeLists.txt b/source/backend/cpu/x86_x64/CMakeLists.txt
index bfaa4efe..2ded7bdd 100644
--- a/source/backend/cpu/x86_x64/CMakeLists.txt
+++ b/source/backend/cpu/x86_x64/CMakeLists.txt
@@ -1,29 +1,72 @@
+# Process asm files on Windows, then substitute *.S with *.S.obj as the source files of add_library.
+# If the MNN_ASSEMBLER env var is not set, *.S files are ignored, which may reduce performance.
+set(EXTRA_OBJS "")
+IF(MSVC AND (DEFINED ENV{MNN_ASSEMBLER}) AND "${CMAKE_SIZEOF_VOID_P}" STREQUAL "8")
+    set(WIN_USE_ASM ON)
+ENDIF()
+message(STATUS "WIN_USE_ASM: ${WIN_USE_ASM}")
+function(process_asm TARGET_NAME FILE_SRCS)
+    if(NOT MSVC)
+        return()
+    endif()
+    set(FILE_DESTS "")
+    foreach(SRC ${${FILE_SRCS}})
+        get_filename_component(SRC_EXT ${SRC} EXT)
+        if(NOT ${SRC_EXT} STREQUAL ".S")
+            list(APPEND FILE_DESTS ${SRC})
+            continue()
+        elseif(NOT WIN_USE_ASM)
+            continue()
+        endif()
+        string(REPLACE ${CMAKE_CURRENT_SOURCE_DIR} "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${TARGET_NAME}.dir" DEST ${SRC})
+        add_custom_command(
+            OUTPUT ${DEST}.obj
+            # *.S -> *.S.i: preprocess (#define / #ifdef macros) with cl.exe
+            COMMAND "${CMAKE_C_COMPILER}" /DWIN32 /experimental:preprocessor /P /Fi"${DEST}.i" "${SRC}"
+            # *.S.i -> *.S.obj: assemble with the GNU assembler, which supports AT&T syntax
+            COMMAND "$ENV{MNN_ASSEMBLER}" -o "${DEST}.obj" "${DEST}.i"
+        )
+        list(APPEND EXTRA_OBJS ${DEST}.obj)
+    endforeach()
+    set(${FILE_SRCS} ${FILE_DESTS} PARENT_SCOPE)
+    set(EXTRA_OBJS ${EXTRA_OBJS} PARENT_SCOPE)
+endfunction()
+
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)|(i686)")
     message(STATUS "${CMAKE_SYSTEM_PROCESSOR}: Open SSE")
     target_compile_options(MNNCPU PRIVATE -DMNN_USE_SSE)
     option(MNN_AVX512_VNNI "Enable AVX512 VNNI" ON)
     FILE(GLOB MNN_X8664_SRC ${CMAKE_CURRENT_LIST_DIR}/*)
-    if (MSVC)
-        FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*.cpp)
-        FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*.cpp)
-    else()
-        FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*)
-        FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*)
-        message(STATUS "MNN_AVX512:${MNN_AVX512}")
-        if (MNN_AVX512)
-            FILE(GLOB MNN_AVX512_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/*)
-            SET(MNNAVX512_VNNI_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/GemmInt8_VNNI.cpp)
-            LIST(REMOVE_ITEM MNN_AVX512_SRC ${MNNAVX512_VNNI_SRC})
-            add_library(MNNAVX512 OBJECT ${MNN_AVX512_SRC})
-            target_compile_options(MNNAVX512 PRIVATE -DMNN_USE_SSE -DMNN_X86_USE_ASM -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma)
-            if (MNN_AVX512_VNNI)
-                target_compile_options(MNNAVX512 PRIVATE -DMNN_AVX512_VNNI)
-                add_library(MNNAVX512_VNNI OBJECT ${MNNAVX512_VNNI_SRC})
-                target_compile_options(MNNAVX512_VNNI PRIVATE -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512vnni -DMNN_AVX512_VNNI)
+    FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*)
+    FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*)
+    message(STATUS "MNN_AVX512:${MNN_AVX512}")
+    if (MNN_AVX512 AND ((NOT MSVC) OR WIN_USE_ASM))
+        FILE(GLOB MNN_AVX512_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/*)
+        SET(MNNAVX512_VNNI_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/GemmInt8_VNNI.cpp)
+        LIST(REMOVE_ITEM MNN_AVX512_SRC ${MNNAVX512_VNNI_SRC})
+        process_asm(MNNAVX512 MNN_AVX512_SRC)
+        add_library(MNNAVX512 OBJECT ${MNN_AVX512_SRC})
+        target_compile_options(MNNAVX512 PRIVATE -DMNN_USE_SSE -DMNN_X86_USE_ASM)
+        if (MSVC)
+            target_compile_options(MNNAVX512 PRIVATE /arch:AVX512)
+        else()
+            target_compile_options(MNNAVX512 PRIVATE -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma)
+        endif()
+        if (MNN_AVX512_VNNI)
+            target_compile_options(MNNAVX512 PRIVATE -DMNN_AVX512_VNNI)
+            add_library(MNNAVX512_VNNI OBJECT ${MNNAVX512_VNNI_SRC})
+            target_compile_options(MNNAVX512_VNNI PRIVATE -DMNN_AVX512_VNNI)
+            if (MSVC)
+                target_compile_options(MNNAVX512_VNNI PRIVATE /arch:AVX512)
+            else()
+                target_compile_options(MNNAVX512_VNNI PRIVATE -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512vnni)
             endif()
         endif()
     endif()
     FILE(GLOB MNN_SSE_SRC ${CMAKE_CURRENT_LIST_DIR}/sse/*)
+    process_asm(MNNAVX MNN_AVX_SRC)
+    process_asm(MNNAVXFMA MNN_AVXFMA_SRC)
+    process_asm(MNNSSE MNN_SSE_SRC)
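+# Usage sketch (the paths below are hypothetical examples, not shipped defaults): the asm
+# path above only activates for 64-bit MSVC builds when MNN_ASSEMBLER points at a GNU
+# assembler that accepts AT&T syntax, e.g. in PowerShell before configuring:
+#   $env:MNN_ASSEMBLER = "C:/msys64/mingw64/bin/as.exe"
+#   cmake .. -G "Visual Studio 16 2019" -A x64 -DMNN_AVX512=ON
+# process_asm() then preprocesses each *.S with cl.exe and assembles the result into
+# CMakeFiles/<target>.dir/*.S.obj, which is linked in through EXTRA_OBJS.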
add_library(MNNX8664 OBJECT ${MNN_X8664_SRC}) add_library(MNNAVX OBJECT ${MNN_AVX_SRC}) add_library(MNNAVXFMA OBJECT ${MNN_AVXFMA_SRC}) @@ -34,7 +77,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64) target_compile_options(MNNAVXFMA PRIVATE -DMNN_USE_SSE) if(MSVC) target_compile_options(MNNAVX PRIVATE /arch:AVX) - target_compile_options(MNNAVXFMA PRIVATE /arch:AVX) + target_compile_options(MNNAVXFMA PRIVATE /arch:AVX2) else() target_compile_options(MNNSSE PRIVATE -msse4.1) target_compile_options(MNNAVX PRIVATE -mavx2 -DMNN_X86_USE_ASM) @@ -47,7 +90,12 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64) endif() endif() list(APPEND MNN_OBJECTS_TO_LINK $ $ $ $) - if (MNN_AVX512) + if (MSVC AND WIN_USE_ASM) + target_compile_options(MNNAVX PRIVATE -DMNN_X86_USE_ASM) + target_compile_options(MNNAVXFMA PRIVATE -DMNN_X86_USE_ASM) + list(APPEND MNN_OBJECTS_TO_LINK ${EXTRA_OBJS}) + endif() + if (MNN_AVX512 AND ((NOT MSVC) OR WIN_USE_ASM)) target_compile_options(MNNCPU PRIVATE -DMNN_AVX512) target_compile_options(MNNX8664 PRIVATE -DMNN_AVX512) if (MNN_AVX512_VNNI) diff --git a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S b/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S index 473048d0..73cbcc02 100644 --- a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S +++ b/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S @@ -24,12 +24,13 @@ asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain // SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post -// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter +// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides pushq %rbp movq %rsp, %rbp #ifdef WIN32 -movq 48(%rsp), %r10 +#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + callq + shadow_space +movq (push_registers_bytes)(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -41,6 +42,17 @@ movq %r9, %rcx movq %r10, %r9 pushq %r14 pushq %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 @@ -304,6 +316,17 @@ addq $64, %rsp End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r15 popq %r14 popq %r13 diff --git a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S b/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S index 91265c54..cb6a7690 100644 --- a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S +++ b/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S @@ -24,12 +24,13 @@ asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1 // SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post -// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter +// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides pushq %rbp movq %rsp, %rbp #ifdef WIN32 -movq 48(%rsp), %r10 +#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + 
callq + shadow_space +movq (push_registers_bytes)(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -41,6 +42,17 @@ movq %r9, %rcx movq %r10, %r9 pushq %r14 pushq %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 @@ -190,6 +202,17 @@ addq $64, %rsp End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r15 popq %r14 popq %r13 diff --git a/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx1EFMA_ASM.S b/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx1EFMA_ASM.S index fb2a3d96..514a4d00 100644 --- a/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx1EFMA_ASM.S +++ b/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx1EFMA_ASM.S @@ -25,15 +25,39 @@ asm_function _AVX_MNNPackedSparseMatMulEpx1EFMA_ASM // SystemV Auto: rdi: packedParas, rsi: bias, rdx: parameter, rcx: postParameters +// Microsoft x64 Auto: rcx:packedParas, rdx:bias, r8:parameter, r9:postParameters // all callee save regs: // %rbx, %rbp, %r12~%r15 // unused para regs: %r8, %r9 // can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax - - pushq %rbp movq %rsp, %rbp + +#ifdef WIN32 +pushq %rdi +pushq %rsi +movq %rcx, %rdi +movq %rdx, %rsi +movq %r8, %rdx +movq %r9, %rcx +pushq %rbx +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) +#else pushq %rax pushq %rbx pushq %r8 @@ -42,7 +66,7 @@ pushq %r12 pushq %r13 pushq %r14 pushq %r15 - +#endif movq (%rdi), %rax // %rax C movq 8(%rdi), %rbx // %rbx A @@ -215,6 +239,27 @@ LoopE24H1: jmp LoopE24H1 End: + +#ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp +popq %r15 +popq %r14 +popq %r13 +popq %r12 +popq %rbx +popq %rsi +popq %rdi +#else popq %r15 popq %r14 popq %r13 @@ -223,6 +268,8 @@ popq %r9 popq %r8 popq %rbx popq %rax +#endif + popq %rbp retq diff --git a/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx4EFMA_ASM.S b/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx4EFMA_ASM.S index 8fd31b41..02191c15 100644 --- a/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx4EFMA_ASM.S +++ b/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx4EFMA_ASM.S @@ -30,10 +30,33 @@ asm_function _AVX_MNNPackedSparseMatMulEpx4EFMA_ASM // %rbx, %rbp, %r12~%r15 // unused para regs: %r8, %r9 // can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax - - pushq %rbp movq %rsp, %rbp + +#ifdef WIN32 +pushq %rdi 
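+// Win64 ABI note: integer arguments arrive in rcx/rdx/r8/r9 (SystemV uses rdi/rsi/rdx/rcx),
+// rdi/rsi are callee-saved, and xmm6-xmm15 are nonvolatile. The surrounding pushes and moves
+// remap the arguments onto the SystemV registers the shared loop body expects, and the
+// vmovdqu spills below preserve xmm6-xmm15 so the kernel is free to clobber all vector regs.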
+pushq %rsi +movq %rcx, %rdi +movq %rdx, %rsi +movq %r8, %rdx +movq %r9, %rcx +pushq %rbx +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) +#else pushq %rax pushq %rbx pushq %r8 @@ -42,6 +65,7 @@ pushq %r12 pushq %r13 pushq %r14 pushq %r15 +#endif movq (%rdi), %rax // %rax C movq 8(%rdi), %rbx // %rbx A @@ -216,6 +240,26 @@ LoopE24H4: jmp LoopE24H4 End: +#ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp +popq %r15 +popq %r14 +popq %r13 +popq %r12 +popq %rbx +popq %rsi +popq %rdi +#else popq %r15 popq %r14 popq %r13 @@ -224,6 +268,8 @@ popq %r9 popq %r8 popq %rbx popq %rax +#endif + popq %rbp retq diff --git a/source/backend/cpu/x86_x64/avx512/SparseKernelFunction.hpp b/source/backend/cpu/x86_x64/avx512/SparseKernelFunction.hpp index 9bac9e53..5bf63b92 100644 --- a/source/backend/cpu/x86_x64/avx512/SparseKernelFunction.hpp +++ b/source/backend/cpu/x86_x64/avx512/SparseKernelFunction.hpp @@ -26,23 +26,29 @@ constexpr int AVX512F32 = 16; _mm_store_ps(dest + AVX512F32 * packCUnit * ablock + 4 * packCUnit * aSegment + packCUnit * 3, m128_3); \ } -#define STORE_VECTOR_AS_COLUMN(dest, ablock, packCUnit, vacc) \ - dest[AVX512F32 * packCUnit * ablock + 0] = vacc[0]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit] = vacc[1]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 2] = vacc[2]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 3] = vacc[3]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 4] = vacc[4]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 5] = vacc[5]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 6] = vacc[6]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 7] = vacc[7]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 8] = vacc[8]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 9] = vacc[9]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 10] = vacc[10]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 11] = vacc[11]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 12] = vacc[12]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 13] = vacc[13]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 14] = vacc[14]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 15] = vacc[15]; +inline void STORE_VECTOR_AS_COLUMN(float* dest, size_t ablock, size_t packCUnit, __m512 vacc) { + union { + __m512 v; + float f[16]; + } vacc_u; + vacc_u.v = vacc; + dest[AVX512F32 * packCUnit * ablock + 0] = vacc_u.f[0]; + dest[AVX512F32 * packCUnit * ablock + packCUnit] = vacc_u.f[1]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 2] = vacc_u.f[2]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 3] = vacc_u.f[3]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 4] = vacc_u.f[4]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 5] = vacc_u.f[5]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 6] = vacc_u.f[6]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 7] = vacc_u.f[7]; + dest[AVX512F32 
* packCUnit * ablock + packCUnit * 8] = vacc_u.f[8]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 9] = vacc_u.f[9]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 10] = vacc_u.f[10]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 11] = vacc_u.f[11]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 12] = vacc_u.f[12]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 13] = vacc_u.f[13]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 14] = vacc_u.f[14]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 15] = vacc_u.f[15]; +} #define TRANSPOSE4x8_STORE(dest, ablock, aSegment, packCUnit, v0, v3, v6, v9, v12, v15, v18, v21) { \ auto m0 = _mm512_extractf32x4_ps(v0, aSegment); \ @@ -125,14 +131,20 @@ constexpr int AVX512F32 = 16; _mm256_storeu_ps(dest + packCUnit * 7, t7); \ } -#define STORE_M256_VECTOR_AS_COLUMN(dest, packCUnit, vacc) \ - dest[0] = vacc[0]; \ - dest[packCUnit] = vacc[1]; \ - dest[packCUnit * 2] = vacc[2]; \ - dest[packCUnit * 3] = vacc[3]; \ - dest[packCUnit * 4] = vacc[4]; \ - dest[packCUnit * 5] = vacc[5]; \ - dest[packCUnit * 6] = vacc[6]; \ - dest[packCUnit * 7] = vacc[7]; +inline void STORE_M256_VECTOR_AS_COLUMN(float* dest, size_t packCUnit, __m256 vacc) { + union { + __m256 v; + float f[8]; + } vacc_u; + vacc_u.v = vacc; + dest[0] = vacc_u.f[0]; + dest[packCUnit] = vacc_u.f[1]; + dest[packCUnit * 2] = vacc_u.f[2]; + dest[packCUnit * 3] = vacc_u.f[3]; + dest[packCUnit * 4] = vacc_u.f[4]; + dest[packCUnit * 5] = vacc_u.f[5]; + dest[packCUnit * 6] = vacc_u.f[6]; + dest[packCUnit * 7] = vacc_u.f[7]; +} -#endif +#endif \ No newline at end of file diff --git a/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx1.cpp b/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx1.cpp index 4e788f04..adce20a1 100644 --- a/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx1.cpp +++ b/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx1.cpp @@ -228,9 +228,14 @@ void _AVX512_MNNPackedSparseMatMulEpx1(float* C, const float* A, const float* B, vacc0 = _mm256_min_ps(vacc0, _mm512_extractf32x8_ps(vmax, 0)); vacc0 = _mm256_max_ps(vacc0, _mm512_extractf32x8_ps(vmin, 0)); + union { + __m256 v; + float f[8]; + } vacc0_u; + vacc0_u.v = vacc0; // how to store faster: st4 / transpose for (auto iStore = 0; iStore < (taileSize & 0x07); iStore++) { - c[packCUnit * iStore] = vacc0[iStore]; + c[packCUnit * iStore] = vacc0_u.f[iStore]; } } // ie += taileSize; diff --git a/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx4.cpp b/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx4.cpp index e602718f..7cbd097a 100644 --- a/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx4.cpp +++ b/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx4.cpp @@ -647,10 +647,15 @@ void _AVX512_MNNPackedSparseMatMulEpx4(float* C, const float* A, const float* B, vacc0 = _mm_min_ps(vacc0, _mm512_extractf32x4_ps(vmax, 0)); vacc0 = _mm_max_ps(vacc0, _mm512_extractf32x4_ps(vmin, 0)); - c[0] = vacc0[0]; - c[packCUnit] = vacc0[1]; - c[packCUnit * 2] = vacc0[2]; - c[+packCUnit * 3] = vacc0[3]; + union { + __m128 v; + float f[4]; + } vacc0_u; + vacc0_u.v = vacc0; + c[0] = vacc0_u.f[0]; + c[packCUnit] = vacc0_u.f[1]; + c[packCUnit * 2] = vacc0_u.f[2]; + c[+packCUnit * 3] = vacc0_u.f[3]; } ie += 4; a += 4; @@ -735,8 +740,13 @@ void _AVX512_MNNPackedSparseMatMulEpx4(float* C, const float* A, const float* B, vacc0 = _mm_min_ps(vacc0, _mm512_extractf32x4_ps(vmax, 0)); vacc0 = _mm_max_ps(vacc0, _mm512_extractf32x4_ps(vmin, 0)); - c[0] = vacc0[0]; - c[packCUnit] = 
vacc0[1]; + union { + __m128 v; + float f[4]; + } vacc0_u; + vacc0_u.v = vacc0; + c[0] = vacc0_u.f[0]; + c[packCUnit] = vacc0_u.f[1]; } ie += 2; a += 2; diff --git a/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx8.cpp b/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx8.cpp index 3d22dd16..5f7ffa94 100644 --- a/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx8.cpp +++ b/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx8.cpp @@ -789,10 +789,15 @@ void _AVX512_MNNPackedSparseMatMulEpx8(float* C, const float* A, const float* B, vacc0 = _mm_min_ps(vacc0, vmax); vacc0 = _mm_max_ps(vacc0, vmin); - c[0] = vacc0[0]; - c[packCUnit] = vacc0[1]; - c[packCUnit * 2] = vacc0[2]; - c[packCUnit * 3] = vacc0[3]; + union { + __m128 v; + float f[4]; + } vacc0_u; + vacc0_u.v = vacc0; + c[0] = vacc0_u.f[0]; + c[packCUnit] = vacc0_u.f[1]; + c[packCUnit * 2] = vacc0_u.f[2]; + c[packCUnit * 3] = vacc0_u.f[3]; } ie += 4; a += 4; @@ -877,8 +882,13 @@ void _AVX512_MNNPackedSparseMatMulEpx8(float* C, const float* A, const float* B, vacc0 = _mm_min_ps(vacc0, vmax); vacc0 = _mm_max_ps(vacc0, vmin); - c[0] = vacc0[0]; - c[packCUnit] = vacc0[1]; + union { + __m128 v; + float f[4]; + } vacc0_u; + vacc0_u.v = vacc0; + c[0] = vacc0_u.f[0]; + c[packCUnit] = vacc0_u.f[1]; } ie += 2; a += 2; diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit16x8.S b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit16x8.S index 874076da..57e3c339 100644 --- a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit16x8.S +++ b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit16x8.S @@ -11,16 +11,15 @@ .align 4 asm_function _AVX512_MNNGemmFloatUnit16x8 -//void _AVX512_MNNGemmFloatUnit16x8(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4) +//void _AVX512_MNNGemmFloatUnit16x8(float* C, const float* A, const float* B, const size_t* parameter) -// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: hC4 +// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter // Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter pushq %rbp movq %rsp, %rbp pushq %rbx #ifdef WIN32 -movq 48(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -30,12 +29,21 @@ movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx movq %r9, %rcx -movq %r10, %r9 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 pushq %r14 -movq %r8, %r9 #endif movq 40(%rcx), %r10 // bExtraStride @@ -266,6 +274,17 @@ LoopDz: End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r14 popq %r13 popq %r12 diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit32x8.S b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit32x8.S index ada1c521..d3ff9575 100644 --- a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit32x8.S +++ b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit32x8.S @@ -11,16 +11,15 @@ .align 4 asm_function _AVX512_MNNGemmFloatUnit32x8 -//void 
_AVX512_MNNGemmFloatUnit32x8(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4) +//void _AVX512_MNNGemmFloatUnit32x8(float* C, const float* A, const float* B, const size_t* parameter) -// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: hC4 +// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter // Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter pushq %rbp movq %rsp, %rbp pushq %rbx #ifdef WIN32 -movq 48(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -30,12 +29,21 @@ movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx movq %r9, %rcx -movq %r10, %r9 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 pushq %r14 -movq %r8, %r9 #endif movq 40(%rcx), %r10 // bExtraStride @@ -301,6 +309,17 @@ LoopDz: End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r14 popq %r13 popq %r12 diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8.S b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8.S index 6fc81941..72068a1e 100644 --- a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8.S +++ b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8.S @@ -11,16 +11,15 @@ .align 4 asm_function _AVX512_MNNGemmFloatUnit48x8 -//void _AVX512_MNNGemmFloatUnit48x8(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4) +//void _AVX512_MNNGemmFloatUnit48x8(float* C, const float* A, const float* B, const size_t* parameter) -// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: hC4 +// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter // Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter pushq %rbp movq %rsp, %rbp pushq %rbx #ifdef WIN32 -movq 48(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -29,11 +28,20 @@ movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx movq %r9, %rcx -movq %r10, %r9 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 -movq %r8, %r9 #endif movq 40(%rcx), %r10 // bExtraStride @@ -336,10 +344,22 @@ LoopDz: End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r13 popq %r12 popq %rsi popq %rdi +popq %rbx popq %rbp #else popq %r13 diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8Fused.S b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8Fused.S index a5396e32..da85739b 100644 --- a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8Fused.S +++ 
b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8Fused.S @@ -14,9 +14,22 @@ asm_function _AVX512_MNNGemmFloatUnit48x8Fused //void _AVX512_MNNGemmFloatUnit48x8Fused(float* C, const float* A, const float* B, const size_t* parameter, const float* p, const float* bias) // SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: postParameters, r9:bias + +// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter +// stack: postParameters, bias pushq %rbp movq %rsp, %rbp +#ifdef WIN32 +pushq %rdi +pushq %rsi +movq %rcx, %rdi +movq %rdx, %rsi +movq %r8, %rdx +movq %r9, %rcx +#define push_registers_bytes ((3 + 1) * 8 + 32) // pushq + callq + shadow_space +movq (push_registers_bytes)(%rsp), %r8 // postParameters +movq (push_registers_bytes + 8)(%rsp), %r9 // bias pushq %rbx pushq %r12 pushq %r13 @@ -24,6 +37,26 @@ pushq %r14 pushq %r15 movq %r8, %r14 movq %r9, %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) +#else +pushq %rbx +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +movq %r8, %r14 +movq %r9, %r15 +#endif movq 40(%rcx), %r10 // bExtraStride movq 24(%rcx), %r8 // cStride @@ -402,12 +435,33 @@ LoopDz: End: +#ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx -popq %rbp +popq %rsi +popq %rdi +#else +popq %r15 +popq %r14 +popq %r13 +popq %r12 +popq %rbx +#endif +popq %rbp retq diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNPackedSparseMatMulEpx4.S b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNPackedSparseMatMulEpx4.S index cfca491b..384e80a6 100644 --- a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNPackedSparseMatMulEpx4.S +++ b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNPackedSparseMatMulEpx4.S @@ -12,8 +12,6 @@ #define AVX512F32 16 -#define push_registers_bytes ((9 + 1) * 8) // pushq + callq - // caution: asm version is a sub-loop of _AVX512_MNNPackedSparseMatMulEpx4() // void _AVX512_MNNPackedSparseMatMulEpx4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, // const float* postParameters, const float* bias, unsigned int* NNZMap, @@ -22,8 +20,29 @@ asm_function _AVX512_MNNPackedSparseMatMulEpx4_ASM // SystemV Auto: rdi: C, rsi: A, rdx:B, rcx: eSize, r8: parameter, r9: postparameter, // stack: bias, unsigned int* NNZMap, int* dataOffsetMap +// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:eSize +// stack: parameter, postParameters, bias, unsigned int* NNZMap, int* dataOffsetMap + pushq %rbp movq %rsp, %rbp + +#ifdef WIN32 +pushq %rdi +pushq %rsi +movq %rcx, %rdi +movq %rdx, %rsi +movq %r8, %rdx +movq %r9, %rcx +pushq %rbx +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +#define push_registers_bytes_ ((8 + 1) * 8 + 32) // pushq + callq + shadow_space +movq (push_registers_bytes_)(%rsp), %r8 // parameter +movq (push_registers_bytes_ + 8)(%rsp), %r9 // postparameter +#define push_registers_bytes (push_registers_bytes_ + 2 * 8) // pushq + callq + shadow_space + extra +#else pushq %rax pushq %rbx pushq %r8 @@ -32,7 +51,8 @@ 
pushq %r12 pushq %r13 pushq %r14 pushq %r15 - +#define push_registers_bytes ((9 + 1) * 8) // pushq + callq +#endif movq (%r8), %r10 // eP * sizeof shrq $(sizeof_value_lg2), %r10 @@ -65,8 +85,8 @@ vbroadcastss 8(%r9), %zmm10 vbroadcastss 12(%r9), %zmm11 movq %r10, %r14 -shrq $sparse_blockoc_log, %r14 -shlq $sparse_blockoc_log, %r14 // h even divid sparse_blockoc +shrq $(sparse_blockoc_log), %r14 +shlq $(sparse_blockoc_log), %r14 // h even divid sparse_blockoc movq (push_registers_bytes)(%rsp), %rdx // bias movq (push_registers_bytes + 8)(%rsp), %rdi // unsigned int* NNZMap, @@ -79,6 +99,20 @@ movq (push_registers_bytes + 16)(%rsp), %rsi // int* dataOffsetMap // movq %r8, %rdi // movq %r9, %rsi +#ifdef WIN32 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) +#endif + movslq (%rsi), %r15 leaq (%rax, %r15, 4), %rax // a = a + diff; addq $4, %rsi // dataOffsetMap++ @@ -90,7 +124,7 @@ je loop_e48h4_end loop_e48h4: movq %r8, %r9 movq %r8, %r12 - shrq $packC_unit_log, %r9 + shrq $(packC_unit_log), %r9 andq $15, %r12 // ih % packC_unit leaq (%rcx, %r12, sizeof_value), %r12 imulq %r11, %r9 // (ih >> packC_unit_log) * cStride @@ -246,7 +280,7 @@ loop_e48h4: subq $4, %rsi // dataOffsetMap-- movslq (%rsi), %r15 - addq $sparse_blockoc, %r8 + addq $(sparse_blockoc), %r8 addq $4, %rdi negq %r15 leaq (%rax, %r15, sizeof_value), %rax // a = a - diff; @@ -284,7 +318,7 @@ je loop_end loop_e48h1: movq %r8, %r9 movq %r8, %r12 - shrq $packC_unit_log, %r9 + shrq $(packC_unit_log), %r9 andq $15, %r12 // ih % packC_unit leaq (%rcx, %r12, sizeof_value), %r12 imulq %r11, %r9 // (ih >> packC_unit_log) * cStride @@ -433,15 +467,37 @@ loop_e48h1_end: loop_end: -popq %r15 -popq %r14 -popq %r13 -popq %r12 -popq %r9 -popq %r8 -popq %rbx -popq %rax -popq %rbp +#ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp +popq %r15 +popq %r14 +popq %r13 +popq %r12 +popq %rbx +popq %rsi +popq %rdi +#else +popq %r15 +popq %r14 +popq %r13 +popq %r12 +popq %r9 +popq %r8 +popq %rbx +popq %rax +#endif + +popq %rbp retq diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_TransposeMain.S b/source/backend/cpu/x86_x64/avx512/_AVX512_TransposeMain.S index 15b497e2..ab730d30 100644 --- a/source/backend/cpu/x86_x64/avx512/_AVX512_TransposeMain.S +++ b/source/backend/cpu/x86_x64/avx512/_AVX512_TransposeMain.S @@ -21,7 +21,6 @@ pushq %rbp movq %rsp, %rbp #ifdef WIN32 -movq 48(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -31,7 +30,17 @@ movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx movq %r9, %rcx -movq %r10, %r9 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 @@ -179,6 +188,17 @@ Loop: End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu 
(128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r14 popq %r13 popq %r12 diff --git a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA.S b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA.S index de252fb4..47d1d8d5 100644 --- a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA.S +++ b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA.S @@ -19,7 +19,8 @@ pushq %rbp movq %rsp, %rbp #ifdef WIN32 -movq 48(%rsp), %r10 +#define push_registers_bytes ((1 + 1) * 8 + 32) +movq (push_registers_bytes)(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -29,6 +30,17 @@ movq %rdx, %rsi movq %r8, %rdx movq %r9, %rcx movq %r10, %r9 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 @@ -216,6 +228,17 @@ LoopDz: End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r13 popq %r12 popq %rsi diff --git a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA6x16.S b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA6x16.S index e866bdb7..22e541aa 100644 --- a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA6x16.S +++ b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA6x16.S @@ -19,7 +19,8 @@ pushq %rbp movq %rsp, %rbp #ifdef WIN32 -movq 48(%rsp), %r10 +#define push_registers_bytes ((1 + 1) * 8 + 32) +movq (push_registers_bytes)(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -29,6 +30,17 @@ movq %rdx, %rsi movq %r8, %rdx movq %r9, %rcx movq %r10, %r9 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 @@ -191,6 +203,17 @@ LoopDz: End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r13 popq %r12 popq %rsi diff --git a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA_Fused.S b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA_Fused.S index 8cafcd72..74fe0857 100644 --- a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA_Fused.S +++ b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA_Fused.S @@ -18,12 +18,41 @@ asm_function _AVX_MNNGemmFloatUnitMainFMA_Fused pushq %rbp movq %rsp, %rbp +#ifdef WIN32 +pushq %rdi +pushq %rsi +movq %rcx, %rdi +movq %rdx, 
%rsi +movq %r8, %rdx +movq %r9, %rcx +#define push_registers_bytes ((3 + 1) * 8 + 32) // pushq + callq + shadow_space +movq (push_registers_bytes)(%rsp), %r8 +movq (push_registers_bytes + 8)(%rsp), %r9 pushq %r12 pushq %r13 pushq %r14 pushq %r15 movq %r8, %r14 movq %r9, %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) +#else +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +movq %r8, %r14 +movq %r9, %r15 +#endif movq 40(%rcx), %r10 // bExtraStride movq 24(%rcx), %r8 // cStride @@ -232,10 +261,30 @@ LoopDz: End: +#ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r15 popq %r14 popq %r13 popq %r12 +popq %rsi +popq %rdi +#else +popq %r15 +popq %r14 +popq %r13 +popq %r12 +#endif popq %rbp retq diff --git a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx1NFMA_ASM.S b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx1NFMA_ASM.S index 2dec6291..66fbc798 100644 --- a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx1NFMA_ASM.S +++ b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx1NFMA_ASM.S @@ -25,15 +25,39 @@ asm_function _AVX_MNNPackedSparseMatMulEpx1NFMA_ASM // SystemV Auto: rdi: packedParas, rsi: bias, rdx: parameter, rcx: postParameters +// Microsoft x64 Auto: rcx:packedParas, rdx:bias, r8:parameter, r9:postParameters // all callee save regs: // %rbx, %rbp, %r12~%r15 // unused para regs: %r8, %r9 // can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax - - pushq %rbp movq %rsp, %rbp + +#ifdef WIN32 +pushq %rdi +pushq %rsi +movq %rcx, %rdi +movq %rdx, %rsi +movq %r8, %rdx +movq %r9, %rcx +pushq %rbx +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) +#else pushq %rax pushq %rbx pushq %r8 @@ -42,6 +66,7 @@ pushq %r12 pushq %r13 pushq %r14 pushq %r15 +#endif movq (%rdi), %rax // %rax C @@ -203,6 +228,26 @@ LoopE24H1: jmp LoopE24H1 End: +#ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp +popq %r15 +popq %r14 +popq %r13 +popq %r12 +popq %rbx +popq %rsi +popq %rdi +#else popq %r15 popq %r14 popq %r13 @@ -211,6 +256,8 @@ popq %r9 popq %r8 popq %rbx popq %rax +#endif + popq %rbp retq diff --git a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx4NFMA_ASM.S b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx4NFMA_ASM.S index 9d97066f..e953d19d 100644 --- 
a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx4NFMA_ASM.S +++ b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx4NFMA_ASM.S @@ -25,15 +25,39 @@ asm_function _AVX_MNNPackedSparseMatMulEpx4NFMA_ASM // SystemV Auto: rdi: packedParas, rsi: bias, rdx: parameter, rcx: postParameters +// Microsoft x64 Auto: rcx:packedParas, rdx:bias, r8:parameter, r9:postParameters // all callee save regs: // %rbx, %rbp, %r12~%r15 // unused para regs: %r8, %r9 // can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax - - pushq %rbp movq %rsp, %rbp + +#ifdef WIN32 +pushq %rdi +pushq %rsi +movq %rcx, %rdi +movq %rdx, %rsi +movq %r8, %rdx +movq %r9, %rcx +pushq %rbx +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) +#else pushq %rax pushq %rbx pushq %r8 @@ -42,6 +66,7 @@ pushq %r12 pushq %r13 pushq %r14 pushq %r15 +#endif movq (%rdi), %rax // %rax C movq 8(%rdi), %rbx // %rbx A @@ -195,6 +220,26 @@ LoopE24H4: jmp LoopE24H4 End: +#ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp +popq %r15 +popq %r14 +popq %r13 +popq %r12 +popq %rbx +popq %rsi +popq %rdi +#else popq %r15 popq %r14 popq %r13 @@ -203,6 +248,8 @@ popq %r9 popq %r8 popq %rbx popq %rax +#endif + popq %rbp retq diff --git a/source/backend/cuda/CMakeLists.txt b/source/backend/cuda/CMakeLists.txt index 027051b3..53abc702 100644 --- a/source/backend/cuda/CMakeLists.txt +++ b/source/backend/cuda/CMakeLists.txt @@ -56,15 +56,15 @@ message(STATUS "message ${CUDA_NVCC_FLAGS} !!!!!!!!!!!") if(WIN32) cuda_add_library(MNN_CUDA STATIC Register.cpp ${MNN_CUDA_SRC}) - string(REPLACE "cublas.lib" "cudnn.lib" CUDNN_LIBRARIES ${CUDA_CUBLAS_LIBRARIES}) - set(MNN_CUDA_LIBS MNN_CUDA ${CUDNN_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_LIBRARIES} PARENT_SCOPE) + set(MNN_CUDA_LIBS MNN_CUDA ${CUDA_LIBRARIES} PARENT_SCOPE) else() cuda_add_library(MNN_Cuda_Main SHARED ${MNN_CUDA_SRC}) - set(MNN_CUDA_LIBS MNN_Cuda_Main cudnn cublas PARENT_SCOPE) + set(MNN_CUDA_LIBS MNN_Cuda_Main PARENT_SCOPE) add_library(MNN_CUDA OBJECT Register.cpp) endif() include_directories( + ${CMAKE_CURRENT_LIST_DIR}/ ${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/include/ ) diff --git a/source/backend/cuda/core/CUDABackend.cpp b/source/backend/cuda/core/CUDABackend.cpp index 5e26acdb..8e5cc7e6 100644 --- a/source/backend/cuda/core/CUDABackend.cpp +++ b/source/backend/cuda/core/CUDABackend.cpp @@ -14,6 +14,11 @@ #include "core/Macro.h" #include "shape/SizeComputer.hpp" #include "core/TensorUtils.hpp" +#include "execution/Raster.cuh" +#include "execution/Transpose.cuh" +#include "execution/MNNCUDADefine.hpp" + +// #define MNN_CUDA_COPY_DEBUG namespace MNN { namespace CUDA { @@ -30,22 +35,18 @@ public: // Do nothing } virtual ~ CUDARuntimeAllocator() = default; - virtual std::pair onAlloc(int size, int align) override { + virtual std::pair onAlloc(size_t size, size_t align) override { return std::make_pair(mRuntime->alloc(size), 0); } - virtual void onRelease(std::pair 
- virtual void onRelease(std::pair<void*, int> ptr) override { + virtual void onRelease(std::pair<void*, size_t> ptr) override { mRuntime->free(ptr.first); } private: CUDARuntime* mRuntime; }; CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, BackendConfig::PowerMode power) { - // Shader precision - if (precision == BackendConfig::Precision_Low) { - mCUDARuntime.reset(new CUDARuntime(true, -1)); - } else { - mCUDARuntime.reset(new CUDARuntime(false, -1)); - } + // TODO: Search CUDA Device info and use best one + mCUDARuntime.reset(new CUDARuntime(-1)); if (mCUDARuntime.get()) { if (mCUDARuntime->isCreateError() == true) { mIsCreateError = true; @@ -54,6 +55,7 @@ CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, B std::shared_ptr<BufferAllocator::Allocator> allocator(new CUDARuntimeAllocator(mCUDARuntime.get())); mBufferPool.reset(new BufferAllocator(allocator)); } + mDefaultPrecision = precision; } CUDARuntimeWrapper::~CUDARuntimeWrapper() { // Do nothing @@ -64,7 +66,12 @@ float CUDARuntimeWrapper::onGetMemoryInMB() { } Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config) const { - return new CUDABackend(mBufferPool, mCUDARuntime); + auto mode = mDefaultPrecision; + if (nullptr != config) { + mode = config->precision; + } + bool useFp16 = mode == BackendConfig::Precision_Low; + return new CUDABackend(mBufferPool, mCUDARuntime, useFp16); } void CUDARuntimeWrapper::onGabageCollect(int level) { @@ -72,11 +79,12 @@ void CUDARuntimeWrapper::onGabageCollect(int level) { } CUDABackend::CUDABackend(std::shared_ptr<BufferAllocator> st, - std::shared_ptr<CUDARuntime> rt) + std::shared_ptr<CUDARuntime> rt, bool useFp16AsFp32) : Backend(MNN_FORWARD_CUDA) { mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(st.get()))); mStaticBufferPool = st; mCUDARuntime = rt; + mUseFp16AsFp32 = useFp16AsFp32; } CUDABackend::~CUDABackend() { @@ -89,6 +97,9 @@ CUDARuntime* CUDABackend::getCUDARuntime() { MNN_ASSERT(nullptr != mCUDARuntime.get()); return mCUDARuntime.get(); } +bool CUDABackend::useFp16() const { + return mUseFp16AsFp32; +} class CUDAMemObj : public Backend::MemObj { public: @@ -103,12 +114,27 @@ private: BufferAllocator* mAllocator; std::pair<void*, size_t> mPoint; }; +int CUDABackend::getBytes(const Tensor* tensor) const { + auto bytes = tensor->getType().bytes(); + if (mUseFp16AsFp32) { + if (halide_type_float == tensor->getType().code) { + bytes = 2; + } + } + return bytes; +} +CPUResizeCache* CUDABackend::getCache() { + return &mCache; +} + Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType storageType) { #ifdef LOG_VERBOSE MNN_PRINT("Start CUDABackend::onAcquireBuffer !\n"); #endif BufferAllocator* allocator = nullptr; - int mallocSize = realSize(nativeTensor) * nativeTensor->getType().bytes(); + auto bytes = getBytes(nativeTensor); + size_t mallocSize = realSize(nativeTensor) * bytes; + std::pair<void*, size_t> buffer; if (storageType == DYNAMIC_SEPERATE) { buffer = mBufferPool->alloc(mallocSize, true); @@ -132,13 +158,23 @@ } bool CUDABackend::onClearBuffer() { + mCache.reset(); mBufferPool->release(true); return true; } size_t CUDABackend::realSize(const Tensor* tensor) { + auto dim = TensorUtils::getDescribe(tensor)->dimensionFormat; + int pack = 1; + if (dim == MNN_DATA_FORMAT_NC4HW4) { + pack = PACK_NUMBER; + } size_t res = 1; for (int i = 0; i < tensor->dimensions(); ++i) { - res *= tensor->length(i); + size_t l = tensor->length(i); + if (1 == i ) { + l = UP_DIV(l, pack) * pack; + } + res *= l; } return res; } @@ -186,47 +222,332 @@ void CUDABackend::onExecuteBegin() const {
void CUDABackend::onExecuteEnd() const { } +static void _computeStride(MNN_DATA_FORMAT srcDimensionFormat, int* srcStride, int batch, int plane, int channel, int srcPack) { + if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + srcStride[0] = plane * srcPack; + srcStride[1] = plane * batch * PACK_NUMBER; + srcStride[2] = srcPack; + } else if (srcDimensionFormat == MNN_DATA_FORMAT_NCHW) { + srcStride[0] = channel * plane; + srcStride[1] = plane * PACK_NUMBER; + srcStride[2] = 1; + } else { + srcStride[0] = channel * plane; + srcStride[1] = PACK_NUMBER; + srcStride[2] = channel; + } +} + +static void _computeBCA(int& batch, int& plane, int& channel, MNN_DATA_FORMAT srcDimensionFormat, const Tensor* srcTensor) { + if (srcDimensionFormat != MNN_DATA_FORMAT_NHWC) { + batch = srcTensor->length(0); + channel = srcTensor->length(1); + plane = 1; + for (int i=2; i<srcTensor->dimensions(); ++i) { + plane *= srcTensor->length(i); + } + } else { + batch = srcTensor->length(0); + channel = srcTensor->length(srcTensor->dimensions()-1); + plane = 1; + for (int i=1; i<srcTensor->dimensions()-1; ++i) { + plane *= srcTensor->length(i); + } + } +} + +static PackInfo _computePackInfo(MNN_DATA_FORMAT srcDimensionFormat, int batch, int plane, int channel) { + PackInfo pack; + pack.inside = plane; + pack.axis = channel; + pack.unit = PACK_NUMBER; + pack.outside = batch; + if (srcDimensionFormat == MNN_DATA_FORMAT_NHWC) { + pack.axisStride = 1; + pack.insideStride = channel; + } else { + pack.axisStride = plane; + pack.insideStride = 1; + } + return pack; +} void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const { auto srcDimensionFormat = TensorUtils::getDescribe(srcTensor)->dimensionFormat; - auto srcDevice = srcTensor->deviceId() != 0; - auto dstDimensionFormat = TensorUtils::getDescribe(dstTensor)->dimensionFormat; + auto srcDevice = srcTensor->deviceId() != 0; auto dstDevice = dstTensor->deviceId() != 0; - if (srcDevice && srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { - srcDimensionFormat = MNN_DATA_FORMAT_NCHW; + MNN_ASSERT(srcDevice || dstDevice); + uint8_t* srcPtr = nullptr; + std::pair<void*, size_t> tempSrcStorage; + auto bytes = getBytes(srcTensor); + auto type = srcTensor->getType(); +#ifdef MNN_CUDA_COPY_DEBUG + MNN_PRINT("CUDA Bn copy: %d -> %d, format %d -> %d, dims: [", srcDevice, dstDevice, srcDimensionFormat, dstDimensionFormat); + for (int i=0; i<srcTensor->dimensions(); ++i) { + MNN_PRINT("%d ", srcTensor->length(i)); } - if (dstDevice && dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { - dstDimensionFormat = MNN_DATA_FORMAT_NCHW; + MNN_PRINT("]\n"); +#endif + bool directCopy = (srcDimensionFormat == dstDimensionFormat && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) || srcTensor->dimensions() <= 1; + if (mUseFp16AsFp32) { + if ((!srcDevice) || (!dstDevice)) { + if (type.code == halide_type_float) { + directCopy = false; + } + } } - auto needSize = realSize(srcTensor) * srcTensor->getType().bytes(); - std::shared_ptr<Tensor> srcTempTensor; - std::shared_ptr<Tensor> dstTempTensor; - - if (srcTensor->deviceId() != 0 && dstTensor->deviceId() != 0) { - mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), needSize, - MNNMemcpyDeviceToDevice, true); + if (directCopy) { + auto gpuSize = realSize(srcTensor) * getBytes(srcTensor); + if (srcDevice && dstDevice) { + mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), gpuSize, + MNNMemcpyDeviceToDevice, true); + } else if (srcDevice && (!dstDevice)) { +
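/* device-to-host: read the GPU buffer straight into the host tensor */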
mCUDARuntime->memcpy((void*)(dstTensor->host()), (void*)(srcTensor->deviceId()), gpuSize, + MNNMemcpyDeviceToHost, true); + } else if ((!srcDevice) && (dstDevice)) { + mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->host()), gpuSize, + MNNMemcpyHostToDevice, true); + } + return; + } + if (!srcDevice) { + auto cpuSize = srcTensor->size(); + tempSrcStorage = mStaticBufferPool->alloc(cpuSize); + srcPtr = (uint8_t*)tempSrcStorage.first + tempSrcStorage.second; + mCUDARuntime->memcpy(srcPtr, srcTensor->host(), cpuSize, MNNMemcpyHostToDevice, + true); + } else { + srcPtr = (uint8_t*)srcTensor->deviceId(); + } + uint8_t* dstPtr = nullptr; + std::pair<void*, size_t> tempDstStorage; + if (!dstDevice) { + auto cpuSize = dstTensor->size(); + tempDstStorage = mStaticBufferPool->alloc(cpuSize); + dstPtr = (uint8_t*)tempDstStorage.first + tempDstStorage.second; + } else { + dstPtr = (uint8_t*)dstTensor->deviceId(); } - if (srcTensor->deviceId() != 0 && dstTensor->deviceId() == 0) { - if(srcDimensionFormat != dstDimensionFormat) { - dstTempTensor.reset(new Tensor(srcTensor, srcTensor->getDimensionType(), true)); - mCUDARuntime->memcpy(dstTempTensor->host(), (void*)(srcTensor->deviceId()), needSize, MNNMemcpyDeviceToHost, - true); - MNNCPUCopyBuffer(dstTempTensor.get(), dstTensor); + // Format convert + FuseRegion reg; + int* size = reg.size; + int* srcStride = reg.srcStride; + int* dstStride = reg.dstStride; + int offset[PACK_NUMBER * 8]; + int offsetNumber = 0; + auto offsetGpuStorage = mStaticBufferPool->alloc(PACK_NUMBER * 8 * sizeof(int)); + auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second; + auto regionStorage = mStaticBufferPool->alloc(sizeof(FuseRegion)); + auto regionGpu = (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second); + + do { + if (srcTensor->deviceId() != 0 && dstTensor->deviceId() != 0) { + if (srcTensor->dimensions() <= 1 || srcDimensionFormat == dstDimensionFormat) { + auto gpuSize = realSize(srcTensor) * getBytes(srcTensor); + mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), gpuSize, + MNNMemcpyDeviceToDevice, true); + } else { + int batch, plane, channel; + _computeBCA(batch, plane, channel, srcDimensionFormat, srcTensor); + PackInfo pack; + auto func = PackBuffer; + if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + pack = _computePackInfo(srcDimensionFormat, batch, plane, channel); + func = PackBuffer; + } else if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + pack = _computePackInfo(dstDimensionFormat, batch, plane, channel); + func = UnpackBuffer; + } else { + FUNC_PRINT(1); + } + func((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), &pack, getBytes(srcTensor), mCUDARuntime.get()); + } + break; + } + auto convertFunction = FuseRasterBlitFloatToFloat; + if (mUseFp16AsFp32) { + if (!srcDevice) { + convertFunction = FuseRasterBlitFloatToHalf; + } else { + convertFunction = FuseRasterBlitHalfToFloat; + } + } + if (srcTensor->dimensions() <= 1) { + size[2] = srcTensor->elementSize(); + srcStride[2] = 1; + dstStride[2] = 1; + offset[0] = 1; + offset[1] = 1; + offset[2] = size[2]; + offset[3] = 0; + offset[4] = 1; + offset[5] = 1; + offset[6] = size[2]; + offset[7] = 0; + offsetNumber = 1; } else { - mCUDARuntime->memcpy(dstTensor->host(), (void*)(srcTensor->deviceId()), needSize, MNNMemcpyDeviceToHost, - true); + // Compute batch, plane, channel + int batch, plane, channel; + _computeBCA(batch, plane, channel, srcDimensionFormat, srcTensor);
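+ // Fast path: when exactly one side is a device-resident NC4HW4 tensor, a single pack (or unpack) blit below handles the layout change and, in fp16 mode, the float/half conversion in one pass.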
+ if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4 && srcDimensionFormat != MNN_DATA_FORMAT_NC4HW4 && dstDevice) { + PackInfo pack = _computePackInfo(srcDimensionFormat, batch, plane, channel); + if (mUseFp16AsFp32) { + if (type.code == halide_type_float) { + if (dstDevice) { + PackFP32ToFP16(dstPtr, srcPtr, &pack, mCUDARuntime.get()); + break; + } else { + PackFP16ToFP32(dstPtr, srcPtr, &pack, mCUDARuntime.get()); + break; + } + } + } else { + PackBuffer(dstPtr, srcPtr, &pack, bytes, mCUDARuntime.get()); + } + break; + } + if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4 && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4 && srcDevice) { + PackInfo pack = _computePackInfo(dstDimensionFormat, batch, plane, channel); + if (mUseFp16AsFp32) { + if (type.code == halide_type_float) { + if (dstDevice) { + UnpackFP32ToFP16(dstPtr, srcPtr, &pack, mCUDARuntime.get()); + break; + } else { + UnpackFP16ToFP32(dstPtr, srcPtr, &pack, mCUDARuntime.get()); + break; + } + } + } else { + UnpackBuffer(dstPtr, srcPtr, &pack, bytes, mCUDARuntime.get()); + } + break; + } + //MNN_PRINT("host/device: %d -> %d, format %d -> %d, b, p, c: %d - %d - %d\n", srcDevice, dstDevice, srcDimensionFormat, dstDimensionFormat, batch, plane, channel); + // Set region + if (srcDimensionFormat != MNN_DATA_FORMAT_NC4HW4 && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) { + size[0] = batch; + size[1] = channel; + size[2] = plane; + offsetNumber = 1; + offset[0] = batch; + offset[1] = channel; + offset[2] = plane; + offset[3] = 0; + offset[4] = batch; + offset[5] = channel; + offset[6] = plane; + offset[7] = 0; + if (srcDimensionFormat == MNN_DATA_FORMAT_NHWC) { + srcStride[0] = channel * plane; + srcStride[1] = 1; + srcStride[2] = channel; + } else { + srcStride[0] = channel * plane; + srcStride[1] = plane; + srcStride[2] = 1; + } + if (dstDimensionFormat == MNN_DATA_FORMAT_NHWC) { + dstStride[0] = channel * plane; + dstStride[1] = 1; + dstStride[2] = channel; + } else { + dstStride[0] = channel * plane; + dstStride[1] = plane; + dstStride[2] = 1; + } + } else { + offsetNumber = PACK_NUMBER; + size[0] = batch; + size[1] = UP_DIV(channel, PACK_NUMBER); + size[2] = plane; + int srcPack = 1; + int dstPack = 1; + int srcChannelLimit = channel; + if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + if (srcDevice) { + srcPack = PACK_NUMBER; + srcChannelLimit = UP_DIV(channel, PACK_NUMBER) * PACK_NUMBER; + } else { + srcPack = 4; + srcChannelLimit = UP_DIV(channel, 4) * 4; + } + } + int dstChannelLimit = channel; + if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + if (dstDevice) { + dstPack = PACK_NUMBER; + dstChannelLimit = UP_DIV(channel, PACK_NUMBER) * PACK_NUMBER; + } else { + dstPack = 4; + dstChannelLimit = UP_DIV(channel, 4) * 4; + } + } + // Compute Stride + _computeStride(srcDimensionFormat, srcStride, batch, plane, channel, srcPack); + _computeStride(dstDimensionFormat, dstStride, batch, plane, channel, dstPack); + + // Compute Offset + for (int i=0; i<offsetNumber; ++i) { … } + } + reg.fuseNumber = offsetNumber; + mCUDARuntime->memcpy(regionGpu, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); + mCUDARuntime->memcpy(offsetGpu, offset, offsetNumber * 8 * sizeof(int), MNNMemcpyHostToDevice, true); +#ifdef MNN_CUDA_COPY_DEBUG + MNN_PRINT("Reg.size: %d - %d - %d\n", reg.size[0], reg.size[1], reg.size[2]); + MNN_PRINT("Reg.srcStride: %d - %d - %d\n", reg.srcStride[0], reg.srcStride[1], reg.srcStride[2]); + MNN_PRINT("Reg.dstStride: %d - %d - %d\n", reg.dstStride[0], reg.dstStride[1], reg.dstStride[2]); + MNN_PRINT("FuseNum: %d\n", reg.fuseNumber); + for (int i=0; i<offsetNumber; ++i) { … } +#endif + convertFunction(dstPtr, srcPtr, regionGpu, offsetGpu, mCUDARuntime.get()); + } while (false); + mStaticBufferPool->free(offsetGpuStorage); + mStaticBufferPool->free(regionStorage); + if 
(!srcDevice) { + mStaticBufferPool->free(tempSrcStorage); } - if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) { - if (srcDimensionFormat != dstDimensionFormat) { - srcTempTensor.reset(new Tensor(dstTensor, dstTensor->getDimensionType(), true)); - MNNCPUCopyBuffer(srcTensor, srcTempTensor.get()); - srcTensor = srcTempTensor.get(); - } - mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), srcTensor->host(), needSize, MNNMemcpyHostToDevice, + if (!dstDevice) { + auto cpuSize = dstTensor->size(); + mCUDARuntime->memcpy(dstTensor->host(), dstPtr, cpuSize, MNNMemcpyDeviceToHost, true); + mStaticBufferPool->free(tempDstStorage); } return; } diff --git a/source/backend/cuda/core/CUDABackend.hpp b/source/backend/cuda/core/CUDABackend.hpp index 86e01a5c..4fb0c50b 100644 --- a/source/backend/cuda/core/CUDABackend.hpp +++ b/source/backend/cuda/core/CUDABackend.hpp @@ -17,6 +17,7 @@ #include "core/Macro.h" #include "core/ConvolutionCommon.hpp" #include "core/BufferAllocator.hpp" +#include "backend/cpu/CPUResizeCache.hpp" namespace MNN { namespace CUDA { class MNN_PUBLIC CUDARuntimeWrapper : public Runtime { @@ -37,11 +38,12 @@ private: std::shared_ptr mBufferPool; std::shared_ptr mCUDARuntime; bool mIsCreateError{false}; + BackendConfig::PrecisionMode mDefaultPrecision; }; class CUDABackend : public Backend { public: - CUDABackend(std::shared_ptr st, std::shared_ptr rt); + CUDABackend(std::shared_ptr st, std::shared_ptr rt, bool useFp16AsFp32); ~CUDABackend(); CUDARuntime *getCUDARuntime(); @@ -74,11 +76,15 @@ public: return mStaticBufferPool.get(); } static size_t realSize(const Tensor *tensor); - + int getBytes(const Tensor* tensor) const; + CPUResizeCache* getCache(); + bool useFp16() const; private: std::shared_ptr mBufferPool; std::shared_ptr mStaticBufferPool; std::shared_ptr mCUDARuntime; + CPUResizeCache mCache; + bool mUseFp16AsFp32 = false; }; template diff --git a/source/backend/cuda/core/runtime/CUDARuntime.cpp b/source/backend/cuda/core/runtime/CUDARuntime.cpp index 10b17da1..72c78fd8 100644 --- a/source/backend/cuda/core/runtime/CUDARuntime.cpp +++ b/source/backend/cuda/core/runtime/CUDARuntime.cpp @@ -15,17 +15,11 @@ #include #include #include "core/Macro.h" +// #define MNN_CUDA_USE_BLAS //#define MNN_OPEN_TIME_TRACE #include #define STR_HELPER(x) #x #define STR(x) STR_HELPER(x) -// #define LOG_VERBOSE -#define CUDNN_VERSION_STR STR(CUDNN_MAJOR) "." STR(CUDNN_MINOR) "." STR(CUDNN_PATCHLEVEL) - -#pragma message "compile with cuda " STR(CUDART_VERSION) " " -#pragma message "compile with cuDNN " CUDNN_VERSION_STR " " - -static_assert(!(CUDNN_MAJOR == 5 && CUDNN_MINOR == 1), "cuDNN 5.1.x series has bugs. Use 5.0.x instead."); #undef STR #undef STR_HELPER @@ -36,7 +30,7 @@ bool CUDARuntime::isCreateError() const { return mIsCreateError; } -CUDARuntime::CUDARuntime(bool permitFloat16, int device_id) { +CUDARuntime::CUDARuntime(int device_id) { #ifdef LOG_VERBOSE MNN_PRINT("start CUDARuntime !\n"); #endif @@ -49,42 +43,39 @@ CUDARuntime::CUDARuntime(bool permitFloat16, int device_id) { mDeviceId = id; cuda_check(cudaGetDeviceProperties(&mProp, id)); MNN_ASSERT(mProp.maxThreadsPerBlock > 0); - +#ifdef MNN_CUDA_USE_BLAS cublas_check(cublasCreate(&mCublasHandle)); - - // Set stream for cuDNN and cublas handles. - - // Note that all cublas scalars (alpha, beta) and scalar results such as dot - // output resides at device side. 
cublas_check(cublasSetPointerMode(mCublasHandle, CUBLAS_POINTER_MODE_HOST)); - cudnn_check(cudnnCreate(&mCudnnHandle)); +#endif } CUDARuntime::~CUDARuntime() { #ifdef LOG_VERBOSE MNN_PRINT("start ~CUDARuntime !\n"); #endif +#ifdef MNN_CUDA_USE_BLAS cublas_check(cublasDestroy(mCublasHandle)); - cudnn_check(cudnnDestroy(mCudnnHandle)); - +#endif #ifdef LOG_VERBOSE MNN_PRINT("end ~CUDARuntime !\n"); #endif } -int CUDARuntime::blocks_num(const int total_threads) { - int maxNum = mProp.maxThreadsPerBlock; - if(total_threads / 32 > maxNum) { - mThreadPerBlock = maxNum; - } else if(total_threads / 16 > maxNum) { - mThreadPerBlock = maxNum / 2; - } else if(total_threads / 8 > maxNum) { - mThreadPerBlock = maxNum / 4; - } else if(total_threads / 4 > maxNum) { - mThreadPerBlock = maxNum / 8; - } else { - mThreadPerBlock = 128; - } +size_t CUDARuntime::blocks_num(const size_t total_threads) { + // size_t maxNum = mProp.maxThreadsPerBlock; + // if(total_threads / 32 > maxNum) { + // mThreadPerBlock = maxNum; + // } else if(total_threads / 16 > maxNum) { + // mThreadPerBlock = maxNum / 2; + // } else if(total_threads / 8 > maxNum) { + // mThreadPerBlock = maxNum / 4; + // } else if(total_threads / 4 > maxNum) { + // mThreadPerBlock = maxNum / 8; + // } else { + // mThreadPerBlock = 128; + // } + + mThreadPerBlock = 128; return (total_threads + mThreadPerBlock - 1) / mThreadPerBlock; } @@ -148,13 +139,4 @@ void CUDARuntime::memcpy(void *dst, const void *src, size_t size_in_bytes, MNNMe void CUDARuntime::memset(void *dst, int value, size_t size_in_bytes) { cuda_check(cudaMemset(dst, value, size_in_bytes)); } - -cublasHandle_t CUDARuntime::cublas_handle() { - return mCublasHandle; -} - -cudnnHandle_t CUDARuntime::cudnn_handle() { - return mCudnnHandle; -} - } // namespace MNN diff --git a/source/backend/cuda/core/runtime/CUDARuntime.hpp b/source/backend/cuda/core/runtime/CUDARuntime.hpp index 1594ed60..f217c031 100644 --- a/source/backend/cuda/core/runtime/CUDARuntime.hpp +++ b/source/backend/cuda/core/runtime/CUDARuntime.hpp @@ -16,19 +16,14 @@ #include #include -#include #include #include -#include #include #include #include #include #include "Type_generated.h" #include "core/Macro.h" -#if CUDA_VERSION >= 10010 -#include -#endif typedef enum { CUDA_FLOAT32 = 0, @@ -49,40 +44,30 @@ typedef enum { } \ } while (0) -#define cublas_check(_x) \ - do { \ - cublasStatus_t _err = (_x); \ - if (_err != CUBLAS_STATUS_SUCCESS) { \ - MNN_CHECK(_err, #_x); \ - } \ - } while (0) - -#define cudnn_check(_x) \ - do { \ - cudnnStatus_t _err = (_x); \ - if (_err != CUDNN_STATUS_SUCCESS) { \ - MNN_CHECK(_err, #_x); \ - } \ - } while (0) - -#define cusolver_check(_x) \ - do { \ - cusolverStatus_t _err = (_x); \ - if (_err != CUSOLVER_STATUS_SUCCESS) { \ - MNN_CHECK(_err, #_x); \ - } \ - } while (0) - #define after_kernel_launch() \ do { \ cuda_check(cudaGetLastError()); \ } while (0) +#ifdef DEBUG +#define checkKernelErrors\ + do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + printf("File:%s Line %d: failed: %s\n", __FILE__, __LINE__,\ + cudaGetErrorString(__err)); \ + abort(); \ + } \ + } while (0) +#else +#define checkKernelErrors +#endif + namespace MNN { class CUDARuntime { public: - CUDARuntime(bool permitFloat16, int device_id); + CUDARuntime(int device_id); ~CUDARuntime(); CUDARuntime(const CUDARuntime &) = delete; CUDARuntime &operator=(const CUDARuntime &) = delete; @@ -105,16 +90,14 @@ public: void memcpy(void *dst, const void *src, size_t size_in_bytes, MNNMemcpyKind_t kind, bool 
sync = false); void memset(void *dst, int value, size_t size_in_bytes); - cublasHandle_t cublas_handle(); - cudnnHandle_t cudnn_handle(); - int threads_num() { + size_t threads_num() { return mThreadPerBlock; } int major_sm() const { return mProp.major; } - int blocks_num(const int total_threads); + size_t blocks_num(const size_t total_threads); const cudaDeviceProp& prop() const { return mProp; } @@ -123,15 +106,12 @@ private: cudaDeviceProp mProp; int mDeviceId; - cublasHandle_t mCublasHandle; - cudnnHandle_t mCudnnHandle; - bool mIsSupportedFP16 = false; bool mSupportDotInt8 = false; bool mSupportDotAccInt8 = false; float mFlops = 4.0f; bool mIsCreateError{false}; - int mThreadPerBlock = 128; + size_t mThreadPerBlock = 128; }; } // namespace MNN diff --git a/source/backend/cuda/execution/BatchMatMulExecution.cu b/source/backend/cuda/execution/BatchMatMulExecution.cu deleted file mode 100644 index dc6d235a..00000000 --- a/source/backend/cuda/execution/BatchMatMulExecution.cu +++ /dev/null @@ -1,119 +0,0 @@ -#include "BatchMatMulExecution.hpp" -namespace MNN { -namespace CUDA { - -template -__global__ void add_bias(T *input, T *output, const T* bias, int batch, int e, int h) { - for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch * e * h; index += blockDim.x * gridDim.x) { - int i = index % (e*h); - int b = index / (e*h); - int y = i % h; - output[index] = input[index] + bias[b * h + y]; - } - return; -} -BatchMatMulExecution::BatchMatMulExecution(bool transposeA, bool transposeB, Backend *backend) : Execution(backend) { - mTransposeA = transposeA; - mTransposeB = transposeB; -} -BatchMatMulExecution::~ BatchMatMulExecution() { - // do nothing -} - -ErrorCode BatchMatMulExecution::onResize(const std::vector &inputs, const std::vector &outputs) { - auto C = outputs[0]; - - auto dimensions = C->dimensions(); - int batch = 1; - for (int i = 0; i < dimensions - 2; ++i) { - batch *= C->length(i); - } - auto e = C->length(dimensions-2); - auto h = C->length(dimensions-1); - if(inputs.size() > 2) { - mTempOutput.reset(Tensor::createDevice({batch*h*e})); - auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC); - if (!res) { - return OUT_OF_MEMORY; - } - backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC); - } - return NO_ERROR; -} - -ErrorCode BatchMatMulExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { - auto runtime = static_cast(backend())->getCUDARuntime(); - auto blasHandle = runtime->cublas_handle(); - const Tensor* A = inputs[0]; - const Tensor* B = inputs[1]; - - auto dimensions = A->dimensions(); - int batch = 1; - for (int i = 0; i < dimensions - 2; ++i) { - batch *= A->length(i); - } - - auto w0 = inputs[0]->length(dimensions-1); - auto h0 = inputs[0]->length(dimensions-2); - auto C = outputs[0]; - - auto e = C->length(dimensions-2); - auto h = C->length(dimensions-1); - auto l = w0; - if (mTransposeA) { - l = h0; - } - auto APtr = (const float*)A->deviceId(); - auto BPtr = (const float*)B->deviceId(); - auto CDestPtr = (float*)C->deviceId(); - - float alpha = 1.0f; - float beta = 0.0f; - - auto tranB = CUBLAS_OP_N; - auto ldB = h; - if (mTransposeB) { - ldB = l; - tranB = CUBLAS_OP_T; - } - auto tranA = CUBLAS_OP_N; - auto ldA = l; - if (mTransposeA) { - ldA = e; - tranA = CUBLAS_OP_T; - } - - // [b, e, l] x [b, l, h] -> [b, e, h] - if(inputs.size() == 2) { - auto status = cublasSgemmStridedBatched(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, l*h, APtr, ldA, e*l, &beta, CDestPtr, h, e*h, batch); - 
cublas_check(status); - //cudaThreadSynchronize(); - - } else { - auto CPtr = (float*)mTempOutput->deviceId(); - auto status = cublasSgemmStridedBatched(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, l*h, APtr, ldA, e*l, &beta, CPtr, h, e*h, batch); - cublas_check(status); - //cudaThreadSynchronize(); - - //add bias: [b, e, h] + [b, h] -> [b, e, h] - int block_num = runtime->blocks_num(batch*e*h); - int threads_num = runtime->threads_num(); - add_bias<<>>((float*)CPtr, (float*)CDestPtr, (const float*)inputs[2]->deviceId(), batch, e, h); - } - - return NO_ERROR; -} - -class BatchMatMulCreator : public CUDABackend::Creator { -public: - virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, - const MNN::Op* op, Backend* backend) const override { - auto param = op->main_as_BatchMatMulParam(); - return new BatchMatMulExecution(param->adjX(), param->adjY(), backend); - } -}; - -static CUDACreatorRegister __init(OpType_BatchMatMul); - -} -} diff --git a/source/backend/cuda/execution/BatchMatMulExecution.hpp b/source/backend/cuda/execution/BatchMatMulExecution.hpp deleted file mode 100644 index d3630d1b..00000000 --- a/source/backend/cuda/execution/BatchMatMulExecution.hpp +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef BatchMatMulExecution_hpp -#define BatchMatMulExecution_hpp -#include -#include "backend/cuda/core/CUDABackend.hpp" -#include "core/Execution.hpp" -namespace MNN { -namespace CUDA { -class BatchMatMulExecution : public Execution { -public: - BatchMatMulExecution(bool transposeA, bool transposeB, Backend *backend); - virtual ~BatchMatMulExecution(); - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - -private: - std::shared_ptr mTempOutput; - bool mTransposeA; - bool mTransposeB; -}; -} // namespace CUDA -} // namespace MNN - -#endif diff --git a/source/backend/cuda/execution/BinaryExecution.cu b/source/backend/cuda/execution/BinaryExecution.cu index 77005f76..8f0ec238 100644 --- a/source/backend/cuda/execution/BinaryExecution.cu +++ b/source/backend/cuda/execution/BinaryExecution.cu @@ -50,11 +50,16 @@ ErrorCode BinaryExecution::onExecute(const std::vector &inputs, const int stride0[3] = {0, 0, s0}; int stride1[3] = {0, 0, s1}; int stride2[3] = {0, 0, 1}; + auto type = outputs[0]->getType(); + if (type.code == halide_type_float) { + // Use Half or float + type.bits = static_cast(backend())->getBytes(inputs[0]) * 8; + } auto computeFunction = [&](Tensor* input0T, Tensor* input1T, Tensor* outputT) { auto input0 = (uint8_t*)input0T->deviceId(); auto input1 = (uint8_t*)input1T->deviceId(); auto output = (uint8_t*)outputT->deviceId(); - BinaryBlit(output, input0, input1, size, stride0, stride1, stride2, outputT->getType(), runtime, mType); + BinaryBlit(output, input0, input1, size, stride0, stride1, stride2, type, runtime, mType); }; computeFunction(inputs[0], inputs[1], outputs[0]); for (int i=2; i +#include "MNNCUDADefine.hpp" +#include "MNNCUDAFunction.cuh" + namespace MNN { namespace CUDA { -struct constBuffer { - int pad[2]; - int kernelSize[2]; - int stride[2]; - int dilate[2]; - int inputSize[2]; - int outputSize[2]; - int channel; - int subChannel; - int total; - int activationType; -} uConstant; +#define PACK_NUMBER_C2 (PACK_NUMBER/2) -ConvDepthWiseExecution::ConvDepthWiseExecution(const Op* op, Backend* bn) : Execution(bn) { +#define MNN_CUDA_HALF2_MAX(a, b) \ + do { \ + (a).x = __hgt((a).x, (b).x) ? 
(a).x : (b).x; \ + (a).y = __hgt((a).y, (b).y) ? (a).y : (b).y; \ + } while (0) + +#define MNN_CUDA_HALF2_MIN(a, b) \ + do { \ + (a).x = __hlt((a).x, (b).x) ? (a).x : (b).x; \ + (a).y = __hlt((a).y, (b).y) ? (a).y : (b).y; \ + } while (0) + + +__global__ void CONV_DW_HALF(const half2* input, const half2* kernel, const half2* bias, half2 *output, const constBuffer* uConstant) { + half2 maxV = half2(uConstant->maxValue, uConstant->maxValue); + half2 minV = half2(uConstant->minValue, uConstant->minValue); + int iw = uConstant->inputSize[0]; + int ih = uConstant->inputSize[1]; + int c = uConstant->channel; + int ow = uConstant->outputSize[0]; + int oh = uConstant->outputSize[1]; + int kw = uConstant->kernelSize[0]; + int kh = uConstant->kernelSize[1]; + int dw = uConstant->dilate[0]; + int dh = uConstant->dilate[1]; + int sw = uConstant->stride[0]; + int sh = uConstant->stride[1]; + int pw = uConstant->pad[0]; + int ph = uConstant->pad[1]; + + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) { + int i = index / PACK_NUMBER_C2; + int zR = index % PACK_NUMBER_C2; + int oz = i / (ow * oh); + int tmp = i % (ow * oh); + int oy = tmp / ow; + int ox = tmp % ow; + int kz = oz / uConstant->batch; + + int ix = ox * sw - pw; + int iy = oy * sh - ph; + half2 color = bias[kz * PACK_NUMBER_C2 + zR]; + int fxSta = max(0, (UP_DIV(-ix, dw))); + int fySta = max(0, (UP_DIV(-iy, dh))); + int fxEnd = min(kw, UP_DIV(iw - ix, dw)); + int fyEnd = min(kh, UP_DIV(ih - iy, dh)); + int fx, fy, fz; + for (fy=fySta; fymaxValue; + float minV = uConstant->minValue; + int iw = uConstant->inputSize[0]; + int ih = uConstant->inputSize[1]; + int c = uConstant->channel; + int ow = uConstant->outputSize[0]; + int oh = uConstant->outputSize[1]; + int kw = uConstant->kernelSize[0]; + int kh = uConstant->kernelSize[1]; + int dw = uConstant->dilate[0]; + int dh = uConstant->dilate[1]; + int sw = uConstant->stride[0]; + int sh = uConstant->stride[1]; + int pw = uConstant->pad[0]; + int ph = uConstant->pad[1]; + + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) { + int i = index / PACK_NUMBER; + int zR = index % PACK_NUMBER; + int oz = i / (ow * oh); + int tmp = i % (ow * oh); + int oy = tmp / ow; + int ox = tmp % ow; + int kz = oz / uConstant->batch; + + int ix = ox * sw - pw; + int iy = oy * sh - ph; + float color = bias[kz * PACK_NUMBER + zR]; + int fxSta = max(0, (UP_DIV(-ix, dw))); + int fySta = max(0, (UP_DIV(-iy, dh))); + int fxEnd = min(kw, UP_DIV(iw - ix, dw)); + int fyEnd = min(kh, UP_DIV(ih - iy, dh)); + int fx, fy, fz; + for (fy=fySta; fymaxValue; + float minV = uConstant->minValue; + int iw = uConstant->inputSize[0]; + int ih = uConstant->inputSize[1]; + int kw = uConstant->kernelSize[0]; + int kh = uConstant->kernelSize[1]; + int sw = uConstant->stride[0]; + int sh = uConstant->stride[1]; + int pw = uConstant->pad[0]; + int ph = uConstant->pad[1]; + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) { + int i = index >> 4; + int zR = index & 15; + int oz, tmp, oy, ox, kz, unuse; + d_owh.divmod(i, oz, tmp); + d_ow.divmod(tmp, oy, ox); + d_ob.divmod(oz, kz, unuse); + + int ix = ox * sw - pw; + int iy = oy * sh - ph; + float color = bias[(kz << 4) + zR]; + int fxSta = max(0, -ix); + int fySta = max(0, -iy); + int fxEnd = min(kw, iw - ix); + int fyEnd = min(kh, ih - iy); + int fx, fy, fz; + for (fy=fySta; fy 
_makeResource(const Op* op, Backend* bn) { + std::shared_ptr<ConvDepthWiseExecution::Resource> res(new ConvDepthWiseExecution::Resource); + auto pool = static_cast<CUDABackend*>(bn)->getStaticBufferPool(); + auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime(); + auto conv = op->main_as_Convolution2D(); + auto convCommon = conv->common(); + int kernelX = convCommon->kernelX(); + int kernelY = convCommon->kernelY(); + int depth = convCommon->outputCount(); + int depthC = UP_DIV(depth, PACK_NUMBER); + res->weightTensor.reset(Tensor::createDevice({kernelX * kernelY * depthC * PACK_NUMBER})); + bool success = bn->onAcquireBuffer(res->weightTensor.get(), Backend::STATIC); + if (!success) { + return nullptr; + } + res->mFilter = (void *)res->weightTensor.get()->buffer().device; + FuseRegion reg; + int offset[8 * PACK_NUMBER]; + auto regionStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion)); + auto offsetGpuStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(offset)); + auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second; + //weight host->device + const float* filterDataPtr = nullptr; + int weightSize = 0; + std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon; + ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize); + auto tempWeightStorage = pool->alloc(weightSize * sizeof(float)); + auto tempWeight = (uint8_t*)tempWeightStorage.first + tempWeightStorage.second; + cuda_check(cudaMemcpy(tempWeight, filterDataPtr, weightSize*sizeof(float), cudaMemcpyHostToDevice)); + reg.size[0] = 1; + reg.size[1] = depthC; + reg.size[2] = kernelX * kernelY; + reg.srcStride[0] = 0; + reg.srcStride[1] = PACK_NUMBER * kernelX * kernelY; + reg.srcStride[2] = 1; + reg.dstStride[0] = 0; + reg.dstStride[1] = kernelX * kernelY * PACK_NUMBER; + reg.dstStride[2] = PACK_NUMBER; + reg.fuseNumber = PACK_NUMBER; + for (int v=0; v<PACK_NUMBER; ++v) { … } + runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); + runtime->memcpy(offsetGpu, offset, 8 * PACK_NUMBER * sizeof(int), MNNMemcpyHostToDevice, true); + FuseRasterBlitFloatToHalf((uint8_t*)res->mFilter, (uint8_t*)tempWeight, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime); + pool->free(tempWeightStorage); + res->biasTensor.reset(Tensor::createDevice({depthC * PACK_NUMBER})); + success = bn->onAcquireBuffer(res->biasTensor.get(), Backend::STATIC); + res->mBias = (void *)res->biasTensor.get()->buffer().device; + if (!success) { + return nullptr; + } + if(conv->bias() != nullptr) { + auto tempBiasStorage = pool->alloc(depth * sizeof(float)); + auto tempBias = (uint8_t*)tempBiasStorage.first + tempBiasStorage.second; + cuda_check(cudaMemcpy(tempBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); + reg.size[0] = 1; + reg.size[1] = 1; + reg.size[2] = depthC * PACK_NUMBER; + reg.srcStride[0] = 0; + reg.srcStride[1] = 0; + reg.srcStride[2] = 1; + reg.dstStride[0] = 0; + reg.dstStride[1] = 0; + reg.dstStride[2] = 1; + offset[0] = 1; + offset[1] = 1; + offset[2] = conv->bias()->size(); + offset[3] = 0; + offset[4] = 1; + offset[5] = 1; + offset[6] = reg.size[2]; + offset[7] = 0; + reg.fuseNumber = 1; + runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); + runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true); + FuseRasterBlitFloatToHalf((uint8_t*)res->mBias, (uint8_t*)tempBias, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
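+ // Filter and bias are converted to channel-packed FP16 once here, at resource-creation time, so per-inference launches can read them directly.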
+ pool->free(tempBiasStorage); + } + static_cast(bn)->getStaticBufferPool()->free(regionStorage); + static_cast(bn)->getStaticBufferPool()->free(offsetGpuStorage); + return res; +} + +ConvDepthWiseExecution::ConvDepthWiseExecution(const Op* op, Backend* bn, std::shared_ptr resource) : Execution(bn) { mOp = op; + mResource = resource; auto pool = static_cast(bn)->getStaticBufferPool(); mConstBuffer = pool->alloc(sizeof(constBuffer)); - - auto conv = mOp->main_as_Convolution2D(); - //weight host->device - if(nullptr != conv->weight()) { - int weightSize = conv->weight()->size(); - weightTensor.reset(Tensor::createDevice({weightSize})); - backend()->onAcquireBuffer(weightTensor.get(), Backend::STATIC); - mFilter = (void *)weightTensor.get()->buffer().device; - cuda_check(cudaMemcpy(mFilter, conv->weight()->data(), conv->weight()->size()*sizeof(float), cudaMemcpyHostToDevice)); - - mBias = nullptr; - if(conv->bias()->size() != 0) { - int biasSize = conv->bias()->size(); - biasTensor.reset(Tensor::createDevice({biasSize})); - backend()->onAcquireBuffer(biasTensor.get(), Backend::STATIC); - mBias = (void *)biasTensor.get()->buffer().device; - cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); - use_bias_ = true; - } - } } ConvDepthWiseExecution::~ ConvDepthWiseExecution() { auto pool = static_cast(backend())->getStaticBufferPool(); pool->free(mConstBuffer); - if (nullptr != weightTensor) { - backend()->onReleaseBuffer(weightTensor.get(), Backend::STATIC); - } - if(use_bias_ && nullptr != biasTensor) { - backend()->onReleaseBuffer(biasTensor.get(), Backend::STATIC); - } } ErrorCode ConvDepthWiseExecution::onResize(const std::vector &inputs, const std::vector &outputs) { auto pad = ConvolutionCommon::convolutionPad(inputs[0], outputs[0], mOp->main_as_Convolution2D()->common()); auto conv = mOp->main_as_Convolution2D(); auto convCommon = mOp->main_as_Convolution2D()->common(); - constBuffer parameters; + int channel = inputs[0]->channel(); + int channelDiv = UP_DIV(channel, PACK_NUMBER); parameters.pad[0] = pad.first; parameters.pad[1] = pad.second; parameters.kernelSize[0] = convCommon->kernelX(); @@ -66,233 +329,82 @@ ErrorCode ConvDepthWiseExecution::onResize(const std::vector &inputs, parameters.dilate[1] = convCommon->dilateY(); parameters.inputSize[0] = inputs[0]->width(); parameters.inputSize[1] = inputs[0]->height(); - parameters.channel = inputs[0]->batch() * inputs[0]->channel(); + parameters.channel = inputs[0]->batch() * channelDiv; parameters.outputSize[0] = outputs[0]->width(); parameters.outputSize[1] = outputs[0]->height(); - parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0]; - parameters.subChannel = inputs[0]->channel(); - parameters.activationType = convCommon->relu() ? 1 : (convCommon->relu6() ? 
2 : 0); + if (static_cast<CUDABackend*>(backend())->useFp16()) { + parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0] * PACK_NUMBER_C2; + } else { + parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0] * PACK_NUMBER; + parameters.minValue = -FLT_MAX; + parameters.maxValue = FLT_MAX; + } + parameters.batch = inputs[0]->batch(); + if (convCommon->relu()) { + parameters.minValue = 0.0f; + } + if (convCommon->relu6()) { + parameters.minValue = 0.0f; + parameters.maxValue = 6.0f; + } auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime(); runtime->memcpy((uint8_t*)mConstBuffer.first + mConstBuffer.second, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice); mTotalCount = parameters.total; - + //printf("%d-%d-%d-%d, %d-%d-%d-%d-%d\n", parameters.kernelSize[0], parameters.kernelSize[1], parameters.stride[0], parameters.stride[1], parameters.inputSize[0], parameters.inputSize[1], channel, parameters.outputSize[0], parameters.outputSize[1]); return NO_ERROR; } -__global__ void CONV_DW(const float* input, const float* kernel, const float* bias, float *output, const constBuffer* uConstant) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < uConstant->total; i += blockDim.x * gridDim.x) { - { - int iw = uConstant->inputSize[0]; - int ih = uConstant->inputSize[1]; - int c = uConstant->channel; - int ow = uConstant->outputSize[0]; - int oh = uConstant->outputSize[1]; - int kw = uConstant->kernelSize[0]; - int kh = uConstant->kernelSize[1]; - int dw = uConstant->dilate[0]; - int dh = uConstant->dilate[1]; - int sw = uConstant->stride[0]; - int sh = uConstant->stride[1]; - int pw = uConstant->pad[0]; - int ph = uConstant->pad[1]; - int acttype = uConstant->activationType; - - int oz = i / (ow * oh); - int tmp = i % (ow * oh); - int oy = tmp / ow; - int ox = tmp % ow; - int kz = oz % uConstant->subChannel; - - int ix = ox * sw - pw; - int iy = oy * sh - ph; - float color = 0.0; - if (bias != nullptr) { - color = bias[kz]; - } - - int fx, fy, fz; - for (fy=0; fy<kh; ++fy) { - int sy = fy*dh + iy; - if (sy >= ih || sy < 0) { - continue; - } - for (fx=0; fx<kw; ++fx) { - int sx = fx*dw + ix; - if (sx >= iw || sx < 0) { - continue; - } - float inputValue = input[0 - + sx - + sy * iw - + oz * iw * ih - ]; - float k = kernel[0 - + fx - + fy * kw - + kz * kw * kh - ]; - color += k*inputValue; - } - } - color = (acttype==1) ? max(0.0, color) : (acttype==2 ? 
(min(max(0.0, color), 6.0)) : color); - output[0 - + ox - + oy * ow - + oz * ow * oh - ] = color; - } - } - return; -} - - ErrorCode ConvDepthWiseExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { auto runtime = static_cast(backend())->getCUDARuntime(); auto& prop = runtime->prop(); - int threads_num = prop.maxThreadsPerBlock; + int limitThreads = UP_DIV(mTotalCount, prop.multiProcessorCount); + int threads_num = ALIMIN(prop.maxThreadsPerBlock, limitThreads); int block_num = prop.multiProcessorCount; auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second; - if (inputs.size() == 1) { - CONV_DW<<>>((const float*)inputs[0]->deviceId(), (const float*)mFilter, - (const float*)mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr)); - } else if (inputs.size() == 3) { - CONV_DW<<>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(), - (const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr); - } else { - MNN_ASSERT(inputs.size() == 2); - CONV_DW<<>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(), - nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr); + if (static_cast(backend())->useFp16()) { + if (inputs.size() == 1) { + CONV_DW_HALF<<>>((const half2*)inputs[0]->deviceId(), (const half2*)mResource->mFilter, + (const half2*)mResource->mBias, (half2*)outputs[0]->deviceId(), (const constBuffer*)(constPtr)); + } + return NO_ERROR; } - return NO_ERROR; -} - - -__global__ void DECONV_DW(const float* input, const float* kernel, const float* bias, float *output, const constBuffer* uConstant) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < uConstant->total; i += blockDim.x * gridDim.x) { - { - int iw = uConstant->inputSize[0]; - int ih = uConstant->inputSize[1]; - int c = uConstant->channel; - int ow = uConstant->outputSize[0]; - int oh = uConstant->outputSize[1]; - int kw = uConstant->kernelSize[0]; - int kh = uConstant->kernelSize[1]; - int dw = uConstant->dilate[0]; - int dh = uConstant->dilate[1]; - int sw = uConstant->stride[0]; - int sh = uConstant->stride[1]; - int pw = uConstant->pad[0]; - int ph = uConstant->pad[1]; - - int oz = i / (ow * oh); - int tmp = i % (ow * oh); - int oy = tmp / ow; - int ox = tmp % ow; - int kz = oz % uConstant->subChannel; + if (inputs.size() == 1) { + // block_num = runtime->blocks_num(mTotalCount); + // threads_num = runtime->threads_num(); + if(parameters.dilate[0] == 1 && parameters.dilate[1] == 1) { + const int area = parameters.outputSize[0] * parameters.outputSize[1]; + DivModFast d_owh(area); + DivModFast d_ow(parameters.outputSize[0]); + DivModFast d_ob(outputs[0]->batch()); - int ix = ox + pw; - int iy = oy + ph; - float color = 0.0; - if (bias != nullptr) { - color = bias[kz]; - } - - int fx, fy, fz; - for (fy=0; fy= 0 && y < ih) { - for (int fx=0; fx= 0 && x < iw) { - float inputValue = input[0 - + x - + y * iw - + oz * iw * ih - ]; - float k = kernel[0 - + fx - + fy * kw - + kz * kw * kh - ]; - color += k*inputValue; - } - } - } - } - output[0 - + ox - + oy * ow - + oz * ow * oh - ] = color; + CONV_DW_OPT<<>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter, + (const half*)mResource->mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr), + d_owh, d_ow, d_ob); + } else { + CONV_DW<<>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter, + (const half*)mResource->mBias, (float*)outputs[0]->deviceId(), (const 
constBuffer*)(constPtr)); } } - return; -} - - -ErrorCode DeconvDepthWiseExecution::onResize(const std::vector &inputs, const std::vector &outputs) { - auto convCommon = mOp->main_as_Convolution2D()->common(); - auto pad = ConvolutionCommon::convolutionTransposePad(inputs[0], outputs[0], convCommon); - constBuffer parameters; - parameters.pad[0] = pad.first; - parameters.pad[1] = pad.second; - parameters.kernelSize[0] = convCommon->kernelX(); - parameters.kernelSize[1] = convCommon->kernelY(); - parameters.stride[0] = convCommon->strideX(); - parameters.stride[1] = convCommon->strideY(); - parameters.dilate[0] = convCommon->dilateX(); - parameters.dilate[1] = convCommon->dilateY(); - parameters.inputSize[0] = inputs[0]->width(); - parameters.inputSize[1] = inputs[0]->height(); - parameters.channel = inputs[0]->batch() * inputs[0]->channel(); - parameters.outputSize[0] = outputs[0]->width(); - parameters.outputSize[1] = outputs[0]->height(); - parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0]; - parameters.subChannel = inputs[0]->channel(); - auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second; - - auto runtime = static_cast(backend())->getCUDARuntime(); - runtime->memcpy(constPtr, ¶meters, sizeof(constBuffer), MNNMemcpyHostToDevice); - mTotalCount = parameters.total; return NO_ERROR; } -ErrorCode DeconvDepthWiseExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { - auto runtime = static_cast(backend())->getCUDARuntime(); - int block_num = runtime->blocks_num(mTotalCount); - int threads_num = runtime->threads_num(); - auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second; - if (inputs.size() > 2) { - DECONV_DW<<>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(), - (const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr); - } else { - DECONV_DW<<>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(), - nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr); - } - return NO_ERROR; -} - - class ConvDepthWiseExecutionCreator : public CUDABackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { - if (OpType_ConvolutionDepthwise == op->type()) { - return new ConvDepthWiseExecution(op, backend); - } - if (inputs.size() == 1) { - MNN_PRINT("deconv depthwise not support 1 input yet\n"); + if (inputs.size() > 1) { return nullptr; } - return new DeconvDepthWiseExecution(op, backend); + auto res = _makeResource(op, backend); + if (nullptr == res) { + return nullptr; + } + return new ConvDepthWiseExecution(op, backend, res); } }; static CUDACreatorRegister __init(OpType_ConvolutionDepthwise); -static CUDACreatorRegister __init2(OpType_DeconvolutionDepthwise); } } \ No newline at end of file diff --git a/source/backend/cuda/execution/ConvDepthWiseExecution.hpp b/source/backend/cuda/execution/ConvDepthWiseExecution.hpp index 35ebcbb0..5bce3f72 100644 --- a/source/backend/cuda/execution/ConvDepthWiseExecution.hpp +++ b/source/backend/cuda/execution/ConvDepthWiseExecution.hpp @@ -14,9 +14,30 @@ #include "core/Execution.hpp" namespace MNN { namespace CUDA { + +struct constBuffer { + int pad[2]; + int kernelSize[2]; + int stride[2]; + int dilate[2]; + int inputSize[2]; + int outputSize[2]; + int channel; + int total; + int batch; + float minValue = -65504.0f; + float maxValue = 65504.0f; +} uConstant; + class 
ConvDepthWiseExecution : public Execution { public: - ConvDepthWiseExecution(const Op *op, Backend *bn); + struct Resource { + std::shared_ptr weightTensor; + std::shared_ptr biasTensor; + void* mFilter; + void* mBias; + }; + ConvDepthWiseExecution(const Op *op, Backend *bn, std::shared_ptr resource); virtual ~ConvDepthWiseExecution(); virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; @@ -25,17 +46,13 @@ protected: std::pair mConstBuffer; const Op *mOp; int mTotalCount; - - void* mFilter; - void* mBias; - std::shared_ptr weightTensor; - std::shared_ptr biasTensor; - bool use_bias_=false; + constBuffer parameters; + std::shared_ptr mResource; }; class DeconvDepthWiseExecution : public ConvDepthWiseExecution { public: - DeconvDepthWiseExecution(const Op *op, Backend *bn) : ConvDepthWiseExecution(op, bn) { + DeconvDepthWiseExecution(const Op *op, Backend *bn, std::shared_ptr resource) : ConvDepthWiseExecution(op, bn, resource) { // Do nothing } virtual ~DeconvDepthWiseExecution() { diff --git a/source/backend/cuda/execution/ConvSingleInputExecution.cu b/source/backend/cuda/execution/ConvSingleInputExecution.cu index a022f0cb..1653e787 100644 --- a/source/backend/cuda/execution/ConvSingleInputExecution.cu +++ b/source/backend/cuda/execution/ConvSingleInputExecution.cu @@ -7,55 +7,52 @@ // #include "ConvSingleInputExecution.hpp" +#include "Raster.cuh" +#include "MNNCUDADefine.hpp" +#include "MNNCUDAFunction.cuh" +// 16 / sizeof(int4) namespace MNN { namespace CUDA { -__global__ void Im2Col(const ConvolutionCommon::Im2ColParameter* param, - const MatMulParam* matmulParam, - const float* A, - __half* AP) { - int eAlign = matmulParam->elhPack[0] * MATMULPACK; - int lAlign = matmulParam->elhPack[1] * MATMULPACK; - int maxCount = eAlign * lAlign; - int kernelCount = param->kernelX * param->kernelY; - for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { - int eIndex = index % eAlign; - int lIndex = index / eAlign; - // Compute for dest - int eU = eIndex / MATMULPACK; - int eR = eIndex % MATMULPACK; - int lU = lIndex / MATMULPACK; - int lR = lIndex % MATMULPACK; - auto dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lU * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR; - if (eIndex >= matmulParam->elh[0] || lIndex >= matmulParam->elh[1]) { - AP[dstOffset] = 0.0; +__global__ void KernelReorder(const float* B, half* BP, int kw, int kh, int ic, int oc, int ocPack) { + int icC4 = UP_DIV(ic, PACK_NUMBER); + int kernelCount = kw * kh; + int l = icC4 * kernelCount * PACK_NUMBER; + int h = oc; + int lDiv = UP_DIV(l, MATMULPACK); + int lAlign = lDiv * MATMULPACK; + int hAlign = UP_DIV(h, ocPack) * ocPack; + int maxCount = hAlign * lAlign; + + for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { + int lR = indexO % MATMULPACK; + int tmp = indexO / MATMULPACK; + int hR = tmp % ocPack; + int tmp2 = tmp / ocPack; + int lC = tmp2 % lDiv; + int hC = tmp2 / lDiv; + half* dst = BP + indexO; + int sH = hC * ocPack + hR; + int sL = lC * MATMULPACK + lR; + if (sH >= oc) { + *dst = 0.0; continue; } - // Compute for source - int ox = eIndex % param->ow; - int oy = eIndex / param->ow; - int ob = oy / param->oh; - oy = oy % param->oh; - int sz = lIndex / kernelCount; - int kI = lIndex % kernelCount; - int ksx = kI % param->kernelX; - int ksy = kI / 
param->kernelX; - - int sx = ox * param->strideX + ksx * param->dilateX - param->padX; - int sy = oy * param->strideY + ksy * param->dilateY - param->padY; - if (sx >= 0 && sx < param->iw) { - if (sy >=0 && sy < param->ih) { - __half value = A[sz * param->ih * param->iw + ob * param->iw * param->ih * param->icDiv4 + sy * param->iw + sx]; - AP[dstOffset] = value; - continue; - } + int sLR = sL % PACK_NUMBER; + int sLC = sL / PACK_NUMBER; + int iLC = sLC / (kernelCount); + int ik = sLC % kernelCount; + int iz = iLC * PACK_NUMBER + sLR; + if (iz >= ic) { + *dst = 0.0; + continue; } - AP[dstOffset] = 0.0; + const float* src = B + sH * kernelCount * ic + ik + iz * kernelCount; + *dst = *src; } } - ConvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { mBackend = bn; auto runtime = static_cast(bn)->getCUDARuntime(); @@ -78,40 +75,91 @@ ConvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize); mKernelInfo.kernelN = common->outputCount(); mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY; + int icDiv = UP_DIV(mKernelInfo.kernelC, PACK_NUMBER); MatMulParam param; int e = 0; - int l = mKernelInfo.kernelX * mKernelInfo.kernelY * mKernelInfo.kernelC; + int l = mKernelInfo.kernelX * mKernelInfo.kernelY * icDiv * MATMULPACK; int h = mKernelInfo.kernelN; param.elh[0] = e; param.elh[1] = l; param.elh[2] = h; - param.elhPack[0] = UP_DIV(e, 16); - param.elhPack[1] = UP_DIV(l, 16); - param.elhPack[2] = UP_DIV(h, 16); + param.elhPack[0] = UP_DIV(e, MATMULPACK); + param.elhPack[1] = UP_DIV(l, MATMULPACK); + param.elhPack[2] = UP_DIV(h, MATMULPACK); param.bStride[0] = 0; param.bStride[1] = 1; param.bStride[2] = l; - auto gpuParam = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(MatMulParam)); - auto tempCacheBuffer = static_cast(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float)); - float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second); - runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice); - runtime->memcpy((uint8_t*)gpuParam.first + gpuParam.second, ¶m, sizeof(MatMulParam), MNNMemcpyHostToDevice); + FuseRegion reg; + int maxOffsetNumber = 8; + std::vector offset(maxOffsetNumber); + auto regionStorage = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion)); + auto offsetGpuStorage = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(int) * maxOffsetNumber); + auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second; + // Reorder weight - weightTensor.reset(Tensor::createDevice({param.elhPack[1] * param.elhPack[2] * (MATMULPACK * MATMULPACK)})); - bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC); - mFilter = (void *)weightTensor.get()->buffer().device; - GemmPrepareRerange(runtime, ¶m, (const MatMulParam*)((uint8_t*)gpuParam.first + gpuParam.second), nullptr, nullptr, cacheWeight, (__half*)mFilter); - static_cast(bn)->getStaticBufferPool()->free(tempCacheBuffer); - static_cast(bn)->getStaticBufferPool()->free(gpuParam); + { + auto tempCacheBuffer = static_cast(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float)); + float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second); + runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice); + weightTensor.reset(Tensor::createDevice({param.elhPack[1] * param.elhPack[2] * (MATMULPACK * 
MATMULPACK)})); + bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC); + mFilter = (void *)weightTensor.get()->buffer().device; + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + int threadNumbers = prop.maxThreadsPerBlock; + if (param.elhPack[2] % 2 == 0) { + KernelReorder<<>>((float*)cacheWeight, (half*)mFilter, + mKernelInfo.kernelX, mKernelInfo.kernelY, mKernelInfo.kernelC, mKernelInfo.kernelN, 32); + mUsePack = true; + } else { + KernelReorder<<>>((float*)cacheWeight, (half*)mFilter, + mKernelInfo.kernelX, mKernelInfo.kernelY, mKernelInfo.kernelC, mKernelInfo.kernelN, MATMULPACK); + } + static_cast(bn)->getStaticBufferPool()->free(tempCacheBuffer); + } // Copy Bias int biasSize = conv->bias()->size(); biasTensor.reset(Tensor::createDevice({biasSize})); bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC); + + auto tempBiasStorage = static_cast(bn)->getStaticBufferPool()->alloc(conv->bias()->size()*sizeof(float)); + auto biasTemp = (float*)((uint8_t*)tempBiasStorage.first + tempBiasStorage.second); + cuda_check(cudaMemcpy(biasTemp, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); + + // FP32 -> FP16 mBias = (void *)biasTensor.get()->buffer().device; - cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); + int alignSize = UP_DIV(conv->bias()->size(), PACK_NUMBER) * PACK_NUMBER; + reg.size[0] = 1; + reg.size[1] = 1; + reg.size[2] = alignSize; + reg.srcStride[0] = 0; + reg.srcStride[1] = 0; + reg.srcStride[2] = 1; + reg.dstStride[0] = 0; + reg.dstStride[1] = 0; + reg.dstStride[2] = 1; + offset[0] = 1; + offset[1] = 1; + offset[2] = conv->bias()->size(); + offset[3] = 0; + offset[4] = 1; + offset[5] = 1; + offset[6] = reg.size[2]; + offset[7] = 0; + reg.fuseNumber = 1; + runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, ®, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); + runtime->memcpy(offsetGpu, offset.data(), 8 * sizeof(int), MNNMemcpyHostToDevice, true); + if (static_cast(bn)->useFp16()) { + FuseRasterBlitFloatToHalf((uint8_t*)mBias, (uint8_t*)biasTemp, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime); + } else { + FuseRasterBlitCommon((uint8_t*)mBias, (uint8_t*)biasTemp, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime, 4); + } + static_cast(bn)->getStaticBufferPool()->free(regionStorage); + static_cast(bn)->getStaticBufferPool()->free(offsetGpuStorage); + static_cast(bn)->getStaticBufferPool()->free(tempBiasStorage); } ConvSingleInputExecution::Resource::~Resource() { @@ -146,14 +194,16 @@ bool ConvSingleInputExecution::onClone(Backend* bn, const Op* op, Execution** ds ErrorCode ConvSingleInputExecution::onResize(const std::vector &inputs, const std::vector &outputs) { auto runtime = static_cast(backend())->getCUDARuntime(); auto input = inputs[0], output = outputs[0]; - const int UNIT = 1; + const int UNIT = PACK_NUMBER; auto convCommon = mOp->main_as_Convolution2D()->common(); auto pads = ConvolutionCommon::convolutionPadFull(input, output, mOp->main_as_Convolution2D()->common()); + int ic = input->channel(); + int icDiv = UP_DIV(ic, PACK_NUMBER); mIm2ColParamter.dilateX = convCommon->dilateX(); mIm2ColParamter.dilateY = convCommon->dilateY(); mIm2ColParamter.strideX = convCommon->strideX(); mIm2ColParamter.strideY = convCommon->strideY(); - mIm2ColParamter.icDiv4 = input->channel(); + mIm2ColParamter.icDiv4 = icDiv; mIm2ColParamter.kernelX = 
mIm2ColParamter.kernelY = convCommon->kernelY(); mIm2ColParamter.padX = std::get<0>(pads); @@ -169,21 +219,21 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs, runtime->memcpy((uint8_t*)mGpuIm2ColParam.first + mGpuIm2ColParam.second, &mIm2ColParamter, sizeof(ConvolutionCommon::Im2ColParameter), MNNMemcpyHostToDevice); + //MNN_PRINT("conv size:%d-%d-%d, %d-%d-%d\n", input->height(), input->width(), input->channel(), output->height(), output->width(), output->channel()); int e = output->height() * output->width() * output->batch(); - int l = input->channel() * mIm2ColParamter.kernelX * mIm2ColParamter.kernelY; + int l = icDiv * mIm2ColParamter.kernelX * mIm2ColParamter.kernelY * MATMULPACK; int h = output->channel(); mMatMulParam.elh[0] = e; mMatMulParam.elh[1] = l; mMatMulParam.elh[2] = h; - mMatMulParam.elhPack[0] = UP_DIV(e, 16); - mMatMulParam.elhPack[1] = UP_DIV(l, 16); - mMatMulParam.elhPack[2] = UP_DIV(h, 16); + mMatMulParam.elhPack[0] = UP_DIV(e, MATMULPACK); + mMatMulParam.elhPack[1] = UP_DIV(l, MATMULPACK); + mMatMulParam.elhPack[2] = UP_DIV(h, MATMULPACK); mMatMulParam.cStride[0] = mIm2ColParamter.ow * mIm2ColParamter.oh * h; mMatMulParam.cStride[1] = 1; mMatMulParam.cStride[2] = mIm2ColParamter.ow * mIm2ColParamter.oh; - mMatMulParam.split[0] = 1; - mMatMulParam.split[1] = 1; - mMatMulParam.split[2] = mIm2ColParamter.ow * mIm2ColParamter.oh; + mMatMulParam.minValue = -FLT_MAX; + mMatMulParam.maxValue = FLT_MAX; if (convCommon->relu()) { mMatMulParam.minValue = 0.0f; } @@ -191,12 +241,14 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs, mMatMulParam.minValue = 0.0f; mMatMulParam.maxValue = 6.0f; } + //MNN_PRINT("Im2Col temp size:%d!!!\n\n", mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK); runtime->memcpy((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second, &mMatMulParam, sizeof(MatMulParam), MNNMemcpyHostToDevice); auto pool = static_cast<CUDABackend*>(backend())->getBufferPool(); - auto buffer = pool->alloc(sizeof(__half) * mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK); + auto buffer = pool->alloc((size_t)sizeof(__half) * (size_t)mMatMulParam.elhPack[0] * (size_t)mMatMulParam.elhPack[1] * (size_t)MATMULPACK * (size_t)MATMULPACK); mIm2ColBuffer = (__half*)((uint8_t*)buffer.first + buffer.second); pool->free(buffer); + return NO_ERROR; } @@ -204,21 +256,28 @@ ErrorCode ConvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs //MNN_PRINT("cuda convSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_); MNN_ASSERT(inputs.size() == 1); MNN_ASSERT(outputs.size() == 1); + auto input = inputs[0]; + auto output = outputs[0]; auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime(); + auto bytes = static_cast<CUDABackend*>(backend())->getBytes(input); const void *input_addr = (const void*)inputs[0]->deviceId(); const void *filter_addr = mResource->mFilter; const void *bias_addr = mResource->mBias; - + auto bn = backend(); void *output_addr = (void*)outputs[0]->deviceId(); - auto& prop = runtime->prop(); - int threads_num = prop.maxThreadsPerBlock; - int cores = prop.multiProcessorCount; + auto gpuIm2Col = (const ConvolutionCommon::Im2ColParameter*)((uint8_t*)mGpuIm2ColParam.first + mGpuIm2ColParam.second); auto gpuMatMul = (const MatMulParam*)((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second); - //runtime->memset(mIm2ColBuffer, 0, mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * sizeof(__half) * (MATMULPACK * MATMULPACK));
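+    // Flow below (annotation, inferred from this diff): Im2ColMain packs the
+    // input into 16x16 half tiles sized by elhPack; a tensor-core GEMM then
+    // consumes them, taking the GemmPacked16x32 fast path when the weight
+    // reorder set mUsePack (even output-channel pack count) and falling back
+    // to GemmPackedFullMain otherwise.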
- Im2Col<<<cores, threads_num>>>(gpuIm2Col, gpuMatMul, (const float*)input_addr, mIm2ColBuffer); - GemmPackedMain(runtime, &mMatMulParam, gpuMatMul, (float*)output_addr, (const __half*)mIm2ColBuffer, (const __half*)filter_addr, (const float*)bias_addr); + // Im2Col func + Im2ColMain(runtime, &mMatMulParam, gpuMatMul, &mIm2ColParamter, gpuIm2Col, (const float*)input_addr, mIm2ColBuffer, bytes); + + if (mResource->mUsePack) { + GemmPacked16x32(runtime, &mMatMulParam, gpuMatMul, (float*)output_addr, (const __half*)mIm2ColBuffer, (const __half*)filter_addr, (const half*)bias_addr, bytes); + } else { + //printf("NotPack:%d-%d-%d-%d-%d, %d-%d-%d\n", mIm2ColParamter.icDiv4, mIm2ColParamter.ih, mIm2ColParamter.iw, mIm2ColParamter.oh, mIm2ColParamter.ow, mMatMulParam.elhPack[0], mMatMulParam.elhPack[1], mMatMulParam.elhPack[2]); + GemmPackedFullMain(runtime, &mMatMulParam, gpuMatMul, (float*)output_addr, (const __half*)mIm2ColBuffer, (const __half*)filter_addr, (const half*)bias_addr, bytes); + } return NO_ERROR; } diff --git a/source/backend/cuda/execution/ConvSingleInputExecution.hpp b/source/backend/cuda/execution/ConvSingleInputExecution.hpp index 2e70ce09..52c29aef 100644 --- a/source/backend/cuda/execution/ConvSingleInputExecution.hpp +++ b/source/backend/cuda/execution/ConvSingleInputExecution.hpp @@ -11,7 +11,9 @@ #include "backend/cuda/core/CUDABackend.hpp" #include "core/Execution.hpp" -#include "TensorCoreGemm.cuh" +#include "TensorCoreGemmPacked.cuh" +#include "ImageColumn.cuh" + namespace MNN { namespace CUDA { @@ -40,6 +42,7 @@ public: std::shared_ptr<Tensor> biasTensor; KernelInfo mKernelInfo; Backend* mBackend = nullptr; + bool mUsePack = false; }; ConvSingleInputExecution(Backend* backend, const MNN::Op* op, std::shared_ptr<Resource> res); virtual ~ConvSingleInputExecution(); @@ -58,6 +61,7 @@ private: std::pair<void*, int> mGpuIm2ColParam; __half* mIm2ColBuffer; + std::pair<void*, int> mGpuKernelParam; }; } // namespace CUDA diff --git a/source/backend/cuda/execution/DeconvSingleInputExecution.cu b/source/backend/cuda/execution/DeconvSingleInputExecution.cu index 73d2f98b..c2d87d32 100644 --- a/source/backend/cuda/execution/DeconvSingleInputExecution.cu +++ b/source/backend/cuda/execution/DeconvSingleInputExecution.cu @@ -11,263 +11,302 @@ namespace MNN { namespace CUDA { -template <typename T> -__global__ void cutPad(const size_t size, const T* input, const int old_height, - const int old_width, const int height, const int width, const int pad_top, - const int pad_left, T* output) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { - int block_num = pos / (width*height); - int left = pos % (width*height); - const int out_w = left % width; - const int out_h = left / width % height; +__global__ void DeconvInputRerange(const int count, + const InputReorderParameter* param, + const float* Inp, + __half* InpRe + ) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) { + int l = param->l_size; + int h = param->h_size; + int lIndex = i % l; + int hIndex = i / l; + int lU = lIndex / 16; + int lR = lIndex % 16; + int hU = hIndex / 16; + int hR = hIndex % 16; - output[pos] = input[(block_num * old_height + out_h + pad_top) * old_width + out_w + pad_left]; + int bIndex = hIndex / param->hw_size; + int hwIndex = hIndex % param->hw_size; + + float value = Inp[bIndex * param->ib_stride + lIndex * param->ic_stride + hwIndex]; + //inpRe[lIndex * param->oc_stride + bIndex * param->ob_stride + hwIndex] = value; +
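+        // Layout note (annotation, inferred from the indexing below): the store
+        // packs value into 16x16 (h, l) tiles laid out h-major: hU selects a row
+        // of lpack_size tiles, lU the tile within it, and inside a tile h gives
+        // the row (hR * 16) and l the column (lR), presumably the operand layout
+        // GemmPackedMain reads for the reranged input. The commented line is the
+        // transposed, l-major alternative.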
+        //__half* dst = InpRe + lU * param->hpack_size * 16 * 16 + hU * 16 * 16 + hR + lR * 16; + __half* dst = InpRe + hU * param->lpack_size * 16 * 16 + lU * 16 * 16 + lR + hR * 16; + dst[0] = value; + } - return; } -DeconvSingleInputExecution::DeconvSingleInputExecution(Backend* backend, const MNN::Op* op) : Execution(backend), mOp(op) { - //MNN_PRINT("cuda DeconvSingleInput onInit in\n"); +template <typename Dtype> +__global__ void Col2Im(const int n, const Dtype* data_col, + const int batch, const int height, const int width, const int channels, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int height_col, const int width_col, + const Dtype* bias, Dtype* data_im) { + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (n); index += blockDim.x * gridDim.x) { + Dtype val = 0; + const int b_im = index / (channels * width * height); + const int chw = index % (channels * width * height); + const int w_im = chw % width + pad_w; + const int h_im = (chw / width) % height + pad_h; + const int c_im = chw / (width * height); + int kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + int kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + // compute the start and end of the output + const int w_col_start = + (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; + const int w_col_end = min(w_im / stride_w + 1, width_col); + const int h_col_start = + (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1; + const int h_col_end = min(h_im / stride_h + 1, height_col); + // TODO: use LCM of stride and dilation to avoid unnecessary loops + for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { + int h_k = (h_im - h_col * stride_h); + int w_k = (w_im - w_col * stride_w); + if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { + h_k /= dilation_h; + w_k /= dilation_w; + int data_col_index = ((((c_im * kernel_h + h_k) * kernel_w + w_k) * batch + b_im) * + height_col + h_col) * width_col + w_col; + val += data_col[data_col_index]; + } + } + } + + if(nullptr != bias) { + val += bias[c_im]; + } + data_im[index] = val; + } +} + + +DeconvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { + mBackend = bn; + auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime(); + auto conv = op->main_as_Convolution2D(); auto common = conv->common(); - - mKernelInfo.groups = common->group(); mKernelInfo.kernelX = common->kernelX(); mKernelInfo.kernelY = common->kernelY(); - mKernelInfo.padMode = common->padMode(); - mKernelInfo.padX = common->padX(); - mKernelInfo.padY = common->padY(); - - if (nullptr != common->pads()) { - mKernelInfo.padX = common->pads()->data()[1]; - mKernelInfo.padY = common->pads()->data()[0]; - } - pad_left_ = mKernelInfo.padX; - pad_right_ = mKernelInfo.padX; - pad_top_ = mKernelInfo.padY; - pad_bottom_ = mKernelInfo.padY; - + mKernelInfo.groups = common->group(); mKernelInfo.strideX = common->strideX(); mKernelInfo.strideY = common->strideY(); mKernelInfo.dilateX = common->dilateX(); mKernelInfo.dilateY = common->dilateY(); mKernelInfo.activationType = common->relu() ? 1 : (common->relu6() ? 2 : 0);
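+    // Note (annotation, inferred from this diff): the Resource only records
+    // kernel geometry, the reranged weight and the bias; onExecute then runs
+    // DeconvInputRerange (pack input into half tiles), GemmPackedMain with the
+    // weight as the (e = oc * kh * kw, l = ic) operand, and finally the Col2Im
+    // gather above with the bias fused in.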
- use_relu_ = (mKernelInfo.activationType == 1); - use_relu6_ = (mKernelInfo.activationType == 2); - - cudnn_handle_ = nullptr; - input_desc_ = nullptr; - output_desc_ = nullptr; - filter_desc_ = nullptr; - conv_desc_ = nullptr; - padded_desc_ = nullptr; - cudnn_data_type_ = CUDNN_DATA_FLOAT; - cudnn_data_type_len_ = 0; - - auto runtime = static_cast<CUDABackend*>(backend)->getCUDARuntime(); - cudnn_handle_ = runtime->cudnn_handle(); - cudnn_check(cudnnCreateTensorDescriptor(&input_desc_)); - cudnn_check(cudnnCreateTensorDescriptor(&output_desc_)); - cudnn_check(cudnnCreateTensorDescriptor(&padded_desc_)); - cudnn_check(cudnnCreateTensorDescriptor(&bias_desc_)); - cudnn_check(cudnnCreateFilterDescriptor(&filter_desc_)); - cudnn_check(cudnnCreateConvolutionDescriptor(&conv_desc_)); - cudnn_check(cudnnCreateActivationDescriptor(&act_desc_)); - - //weight host->device const float* filterDataPtr = nullptr; int weightSize = 0; std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon; ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize); - weightTensor.reset(Tensor::createDevice<float>({weightSize})); - backend->onAcquireBuffer(weightTensor.get(), Backend::STATIC); + mKernelInfo.kernelN = common->outputCount(); + mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY; + + MatMulParam param; + int e = mKernelInfo.kernelN * mKernelInfo.kernelX * mKernelInfo.kernelY; + int l = mKernelInfo.kernelC; + int h = 0; + param.elh[0] = e; + param.elh[1] = l; + param.elh[2] = h; + param.elhPack[0] = UP_DIV(e, 16); + param.elhPack[1] = UP_DIV(l, 16); + param.elhPack[2] = UP_DIV(h, 16); + + param.aStride[0] = 1; + param.aStride[1] = e; + param.aStride[2] = 0; + + auto gpuParam = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(MatMulParam)); + auto tempCacheBuffer = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float)); + float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second); + runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice); + runtime->memcpy((uint8_t*)gpuParam.first + gpuParam.second, &param, sizeof(MatMulParam), MNNMemcpyHostToDevice); + + // Reorder weight + weightTensor.reset(Tensor::createDevice<int16_t>({param.elhPack[0] * param.elhPack[1] * (MATMULPACK * MATMULPACK)})); + bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC); mFilter = (void *)weightTensor.get()->buffer().device; - cuda_check(cudaMemcpy(mFilter, filterDataPtr, weightSize*sizeof(float), cudaMemcpyHostToDevice)); + GemmPrepareRerange(runtime, &param, (const MatMulParam*)((uint8_t*)gpuParam.first + gpuParam.second), cacheWeight, (__half*)mFilter, nullptr, nullptr, 4); + static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempCacheBuffer); + static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(gpuParam); + // Copy Bias + int biasSize = conv->bias()->size(); + biasTensor.reset(Tensor::createDevice<float>({biasSize})); + bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC); + mBias = (void *)biasTensor.get()->buffer().device; + cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); + +} - if(conv->bias()->size() != 0) { - int biasSize = conv->bias()->size(); - biasTensor.reset(Tensor::createDevice<float>({biasSize})); - backend->onAcquireBuffer(biasTensor.get(), Backend::STATIC); - mBias = (void *)biasTensor.get()->buffer().device; - - cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); - - int bias_size = conv->bias()->size(); - 
int dim_bias[] = {1, bias_size, 1, 1}; - int stride_bias[] = {bias_size, 1, 1, 1}; - if(cudnn_data_type_ == CUDNN_DATA_FLOAT) { - cudnn_check(cudnnSetTensorNdDescriptor(bias_desc_, CUDNN_DATA_FLOAT, 4, dim_bias, stride_bias)); - } - else if(cudnn_data_type_ == CUDNN_DATA_HALF) { - cudnn_check(cudnnSetTensorNdDescriptor(bias_desc_, CUDNN_DATA_HALF, 4, dim_bias, stride_bias)); - } else { - MNN_PRINT("only supports fp32/fp16 data type!!!\n"); - } - use_bias_ = true; - } +DeconvSingleInputExecution::Resource::~Resource() { + // Do nothing +} +DeconvSingleInputExecution::DeconvSingleInputExecution(Backend* backend, const MNN::Op* op, std::shared_ptr res) : Execution(backend), mOp(op) { + mResource = res; + auto runtime = static_cast(backend)->getCUDARuntime(); + auto staticPool = static_cast(backend)->getStaticBufferPool(); + mGpuMatMulParam = staticPool->alloc(sizeof(MatMulParam)); + mGpuCol2ImParam = staticPool->alloc(sizeof(Col2ImParameter)); + mGpuInpReorderParam = staticPool->alloc(sizeof(InputReorderParameter)); } DeconvSingleInputExecution::~DeconvSingleInputExecution() { - cudnn_check(cudnnDestroyConvolutionDescriptor(conv_desc_)); - cudnn_check(cudnnDestroyFilterDescriptor(filter_desc_)); - cudnn_check(cudnnDestroyTensorDescriptor(padded_desc_)); - cudnn_check(cudnnDestroyTensorDescriptor(output_desc_)); - cudnn_check(cudnnDestroyTensorDescriptor(input_desc_)); - cudnn_check(cudnnDestroyTensorDescriptor(bias_desc_)); - cudnn_check(cudnnDestroyActivationDescriptor(act_desc_)); + auto staticPool = static_cast(backend())->getStaticBufferPool(); + staticPool->free(mGpuMatMulParam); + staticPool->free(mGpuCol2ImParam); + staticPool->free(mGpuInpReorderParam); +} +bool DeconvSingleInputExecution::onClone(Backend* bn, const Op* op, Execution** dst) { + if (!mValid) { + return false; + } + if (nullptr == dst) { + return true; + } + auto dstExe = new DeconvSingleInputExecution(bn, op, mResource); + *dst = dstExe; + return true; } + ErrorCode DeconvSingleInputExecution::onResize(const std::vector &inputs, const std::vector &outputs) { - // prepare - //MNN_PRINT("cuda DeconvSingleInput onResize in, pad:%d\n", mKernelInfo.padX); + auto runtime = static_cast(backend())->getCUDARuntime(); auto input = inputs[0], output = outputs[0]; + const int UNIT = 1; + auto convCommon = mOp->main_as_Convolution2D()->common(); - mIOInfo.iw = input->width(); - mIOInfo.ih = input->height(); - mIOInfo.ic = input->channel(); - mIOInfo.ib = input->batch(); - - mIOInfo.ow = output->width(); - mIOInfo.oh = output->height(); - mIOInfo.oc = output->channel(); - mIOInfo.ob = output->batch(); + // Input Rerange Param + mInpReorderParameter.hw_size = input->height() * input->width(); + mInpReorderParameter.ic_stride = mInpReorderParameter.hw_size; + mInpReorderParameter.ib_stride = mInpReorderParameter.hw_size * input->channel(); + mInpReorderParameter.oc_stride = mInpReorderParameter.ib_stride; + mInpReorderParameter.ob_stride = mInpReorderParameter.hw_size; + mInpReorderParameter.l_size = input->channel(); + mInpReorderParameter.h_size = input->batch() * mInpReorderParameter.hw_size; + mInpReorderParameter.lpack_size = UP_DIV(mInpReorderParameter.l_size, 16); + mInpReorderParameter.hpack_size = UP_DIV(mInpReorderParameter.h_size, 16); - mKernelInfo.kernelN = output->channel(); - mKernelInfo.kernelC = input->channel() / mKernelInfo.groups; + runtime->memcpy((uint8_t*)mGpuInpReorderParam.first + mGpuInpReorderParam.second, &mInpReorderParameter, sizeof(InputReorderParameter), MNNMemcpyHostToDevice); - std::vector in_shape = 
{mIOInfo.ib, mIOInfo.ic, mIOInfo.ih, mIOInfo.iw}; - std::vector output_shape = {mIOInfo.ob, mIOInfo.oc, mIOInfo.oh, mIOInfo.ow}; - std::vector filter_shape = {mKernelInfo.kernelC, mKernelInfo.kernelN, mKernelInfo.kernelY, mKernelInfo.kernelX};//deconv (ic oc kh kw) - - // printf("filter:%d %d %d %d\n", filter_shape[0], filter_shape[1], filter_shape[2], filter_shape[3]); - // printf("input:%d %d %d %d\n", in_shape[0], in_shape[1], in_shape[2], in_shape[3]); - // printf("output:%d %d %d %d\n", output_shape[0], output_shape[1], output_shape[2], output_shape[3]); - cudnn_check(cudnnSetTensor4dDescriptor(input_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, in_shape[0], - in_shape[1], in_shape[2], in_shape[3])); - - cudnn_check(cudnnSetFilter4dDescriptor(filter_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, filter_shape[0], - filter_shape[1], filter_shape[2], filter_shape[3])); - cudnn_check(cudnnSetTensor4dDescriptor(output_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, output_shape[0], - output_shape[1], output_shape[2], output_shape[3])); + // Col2Im Param + auto pad = ConvolutionCommon::convolutionTransposePad(input, output, mOp->main_as_Convolution2D()->common()); + mCol2ImParamter.dilateX = convCommon->dilateX(); + mCol2ImParamter.dilateY = convCommon->dilateY(); + mCol2ImParamter.strideX = convCommon->strideX(); + mCol2ImParamter.strideY = convCommon->strideY(); + mCol2ImParamter.ic = input->channel(); + mCol2ImParamter.oc = output->channel(); + mCol2ImParamter.kernelX = convCommon->kernelX(); + mCol2ImParamter.kernelY = convCommon->kernelY(); + mCol2ImParamter.padX = pad.first; + mCol2ImParamter.padY = pad.second; + + mCol2ImParamter.ih = input->height(); + mCol2ImParamter.iw = input->width(); + mCol2ImParamter.oh = output->height(); + mCol2ImParamter.ow = output->width(); + mCol2ImParamter.ob = output->batch(); + + runtime->memcpy((uint8_t*)mGpuCol2ImParam.first + mGpuCol2ImParam.second, &mCol2ImParamter, sizeof(Col2ImParameter), MNNMemcpyHostToDevice); + + // Matmul Param + int e = output->channel() * mCol2ImParamter.kernelX * mCol2ImParamter.kernelY; + int l = input->channel(); + int h = input->height() * input->width() * output->batch(); + + mMatMulParam.elh[0] = e; + mMatMulParam.elh[1] = l; + mMatMulParam.elh[2] = h; + mMatMulParam.elhPack[0] = UP_DIV(e, 16); + mMatMulParam.elhPack[1] = UP_DIV(l, 16); + mMatMulParam.elhPack[2] = UP_DIV(h, 16); + + mMatMulParam.bStride[0] = 0; + mMatMulParam.bStride[1] = input->height() * input->width(); + mMatMulParam.bStride[2] = 1; + + mMatMulParam.cStride[0] = h; + mMatMulParam.cStride[1] = 1; + mMatMulParam.cStride[2] = 1; + if (convCommon->relu()) { + mMatMulParam.minValue = 0.0f; + } + if (convCommon->relu6()) { + mMatMulParam.minValue = 0.0f; + mMatMulParam.maxValue = 6.0f; + } + runtime->memcpy((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second, &mMatMulParam, sizeof(MatMulParam), MNNMemcpyHostToDevice); + // Alloc temp cuda memory + auto pool = static_cast(backend())->getBufferPool(); + auto buffer1 = pool->alloc(sizeof(float) * mMatMulParam.elh[0] * mMatMulParam.elh[2]); + auto buffer2 = pool->alloc(sizeof(__half) * mMatMulParam.elhPack[1] * mMatMulParam.elhPack[2] * MATMULPACK * MATMULPACK); - cudnnTensorDescriptor_t input_descriptor_real = nullptr; + mIm2ColBuffer = (float*)((uint8_t*)buffer1.first + buffer1.second); + mInputBuffer = (__half*)((uint8_t*)buffer2.first + buffer2.second); - if (mKernelInfo.padMode == PadMode_SAME) { - int kernelWidthSize = (mKernelInfo.kernelX - 1) * mKernelInfo.dilateX + 1; - int kernelHeightSize = 
(mKernelInfo.kernelY - 1) * mKernelInfo.dilateY + 1; - int pw = (mIOInfo.iw - 1) * mKernelInfo.strideX + kernelWidthSize - mIOInfo.ow; - int ph = (mIOInfo.ih - 1) * mKernelInfo.strideY + kernelHeightSize - mIOInfo.oh; - pad_left_ = pw/2; - pad_right_ = pw - pad_left_; - pad_top_ = ph/2; - pad_bottom_ = ph - pad_top_; - } + pool->free(buffer2); + pool->free(buffer1); - use_pad_ = (pad_left_!=0 || pad_right_!=0 || pad_top_!=0 || pad_bottom_!=0 ) ? true : false; - - if(use_pad_) { - int totalSize = output_shape[0]*output_shape[1]*(output_shape[2]+pad_top_+pad_bottom_)*(output_shape[3]+pad_left_+pad_right_); - padTensor.reset(Tensor::createDevice({totalSize})); - backend()->onAcquireBuffer(padTensor.get(), Backend::DYNAMIC); - mPadPtr = (void *)padTensor.get()->buffer().device; - - //dynamic memory release - backend()->onReleaseBuffer(padTensor.get(), Backend::DYNAMIC); - - cudnn_check(cudnnSetTensor4dDescriptor(padded_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, output_shape[0], output_shape[1], - output_shape[2] + +pad_top_+pad_bottom_, output_shape[3] + pad_left_+pad_right_)); - } - input_descriptor_real = use_pad_ ? padded_desc_ : input_desc_; - - cudnn_check(cudnnSetConvolution2dDescriptor(conv_desc_, 0, 0, mKernelInfo.strideY, mKernelInfo.strideX, - mKernelInfo.dilateY, mKernelInfo.dilateX, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT)); - if (cudnn_data_type_ == CUDNN_DATA_HALF) { - cudnn_check(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH)); - } - //set group num - cudnn_check(cudnnSetConvolutionGroupCount(conv_desc_, mKernelInfo.groups)); - - // algorithm - constexpr int requested_algo_count = 1; - int returned_algo_count; - cudnnConvolutionBwdDataAlgoPerf_t perf_results; - cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnn_handle_, filter_desc_, input_descriptor_real, conv_desc_, - output_desc_, requested_algo_count, &returned_algo_count, &perf_results)); - conv_bwd_algo_ = perf_results.algo; - - // workspace - cudnn_check(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle_, filter_desc_, input_descriptor_real, conv_desc_, output_desc_, - conv_bwd_algo_, &workspace_size_)); - - if (workspace_size_ != 0) { - int workspaceSize = workspace_size_; - workspaceTensor.reset(Tensor::createDevice({workspaceSize})); - //cudnn not support workspace memory reuse - backend()->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC); - mWorkSpace = (void *)workspaceTensor.get()->buffer().device; - } - - if(use_relu_) { - cudnn_check(cudnnSetActivationDescriptor(act_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - } else if(use_relu6_) { - cudnn_check(cudnnSetActivationDescriptor(act_desc_, CUDNN_ACTIVATION_CLIPPED_RELU, CUDNN_NOT_PROPAGATE_NAN, 6.0)); - } else { - //do nothing - } - //MNN_PRINT("cuda DeconvSingleInput onResize out\n"); return NO_ERROR; } ErrorCode DeconvSingleInputExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { - //MNN_PRINT("cuda DeconvSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_); - + //MNN_PRINT("cuda convSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_); MNN_ASSERT(inputs.size() == 1); MNN_ASSERT(outputs.size() == 1); + auto bytes = static_cast(backend())->getBytes(inputs[0]); auto runtime = static_cast(backend())->getCUDARuntime(); const void *input_addr = (const void*)inputs[0]->deviceId(); - const void *filter_addr = mFilter; - const void *bias_addr = mBias; - + const void *filter_addr = mResource->mFilter; + const void *bias_addr = 
mResource->mBias; void *output_addr = (void*)outputs[0]->deviceId(); - void *workspace_addr = nullptr; - if (workspace_size_ != 0) { - workspace_addr = mWorkSpace; - } - const float alpha = 1; - const float beta = 0; + auto gpuInpReorder = (const InputReorderParameter*)((uint8_t*)mGpuInpReorderParam.first + mGpuInpReorderParam.second); + auto gpuCol2Im = (const Col2ImParameter*)((uint8_t*)mGpuCol2ImParam.first + mGpuCol2ImParam.second); + auto gpuMatMul = (const MatMulParam*)((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second); + const int rerangeCount = mInpReorderParameter.ib_stride * inputs[0]->batch(); + int inp_block_num = runtime->blocks_num(rerangeCount); + int inp_thread_num = runtime->threads_num(); - if(use_pad_) { - cudnn_check(cudnnConvolutionBackwardData(cudnn_handle_, &alpha, filter_desc_, filter_addr, input_desc_, input_addr, conv_desc_, - conv_bwd_algo_, workspace_addr, workspace_size_, &beta, padded_desc_, mPadPtr)); + // Do input Rerange + runtime->memset(mInputBuffer, 0, mMatMulParam.elhPack[2] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK * sizeof(__half)); + DeconvInputRerange<<<inp_block_num, inp_thread_num>>>(rerangeCount, gpuInpReorder, (const float*)input_addr, mInputBuffer); - std::vector<int> out_shape = {mIOInfo.ob, mIOInfo.oc, mIOInfo.oh, mIOInfo.ow}; + // Do Gemm operation + GemmPackedMain(runtime, &mMatMulParam, gpuMatMul, (float*)mIm2ColBuffer, (const half*)filter_addr, (const half*)mInputBuffer, nullptr, bytes, false, false); - int size = out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]; - int block_num = runtime->blocks_num(size); - int threads_num = runtime->threads_num(); + // Do Col2Im trans + int height_col = mCol2ImParamter.ih; + int width_col = mCol2ImParamter.iw; + int num_kernels = mCol2ImParamter.ob * mCol2ImParamter.oc * mCol2ImParamter.oh * mCol2ImParamter.ow; - cutPad<<<block_num, threads_num>>>(size, (float*)mPadPtr, out_shape[2]+pad_top_+pad_bottom_, out_shape[3]+pad_left_+pad_right_, - out_shape[2], out_shape[3], pad_top_, pad_left_, (float*)output_addr); - } - else { - cudnn_check(cudnnConvolutionBackwardData(cudnn_handle_, &alpha, filter_desc_, filter_addr, input_desc_, input_addr, conv_desc_, - conv_bwd_algo_, workspace_addr, workspace_size_, &beta, output_desc_, output_addr)); - } + int col2im_block_num = runtime->blocks_num(num_kernels); + int col2im_thread_num = runtime->threads_num(); + + // printf("col2im:%d, %d-%d-%d-%d-%d-%d\n %d-%d-%d-%d-%d-%d\n %d-%d\n", mCol2ImParamter.ob, mCol2ImParamter.oh, mCol2ImParamter.ow, mCol2ImParamter.oc, \ + // mCol2ImParamter.ih, mCol2ImParamter.iw, mCol2ImParamter.ic, \ + // mCol2ImParamter.padX, mCol2ImParamter.padY, mCol2ImParamter.kernelX, mCol2ImParamter.kernelY, mCol2ImParamter.strideX, mCol2ImParamter.strideY, \ + // col2im_block_num, col2im_thread_num); + + Col2Im<<<col2im_block_num, col2im_thread_num>>>( + num_kernels, (const float*)mIm2ColBuffer, mCol2ImParamter.ob, mCol2ImParamter.oh, mCol2ImParamter.ow, mCol2ImParamter.oc, + mCol2ImParamter.kernelY, mCol2ImParamter.kernelX, mCol2ImParamter.padY, mCol2ImParamter.padX, + mCol2ImParamter.strideY, mCol2ImParamter.strideX, mCol2ImParamter.dilateY, mCol2ImParamter.dilateX, + height_col, width_col, (const float*)bias_addr, (float *)output_addr); - if(use_bias_) { - cudnn_check(cudnnAddTensor(cudnn_handle_, &alpha, bias_desc_, bias_addr, &alpha, output_desc_, output_addr)); - } - if(use_relu_ || use_relu6_) { - cudnn_check(cudnnActivationForward(cudnn_handle_, act_desc_, &alpha, output_desc_, output_addr, &beta, output_desc_, output_addr)); - } return NO_ERROR; } @@ -287,7 +326,8 @@ public: MNN_PRINT("Deconv inputs 
size:3 not support\n"); return nullptr; } else if(inputs.size() == 1) { - return new DeconvSingleInputExecution(backend, op); + std::shared_ptr resource(new DeconvSingleInputExecution::Resource(backend, op)); + return new DeconvSingleInputExecution(backend, op, resource); } else { MNN_PRINT("Deconv inputs size:%d not support", (int)inputs.size()); return nullptr; @@ -295,7 +335,7 @@ public: } }; -CUDACreatorRegister __DeConvExecution(OpType_Deconvolution); +//CUDACreatorRegister __DeConvExecution(OpType_Deconvolution); }// namespace CUDA }// namespace MNN diff --git a/source/backend/cuda/execution/DeconvSingleInputExecution.hpp b/source/backend/cuda/execution/DeconvSingleInputExecution.hpp index f20ef02f..dec1b951 100644 --- a/source/backend/cuda/execution/DeconvSingleInputExecution.hpp +++ b/source/backend/cuda/execution/DeconvSingleInputExecution.hpp @@ -11,7 +11,7 @@ #include "backend/cuda/core/CUDABackend.hpp" #include "core/Execution.hpp" -#include "half.hpp" +#include "TensorCoreGemm.cuh" namespace MNN { namespace CUDA { @@ -26,9 +26,6 @@ struct KernelInfo { int kernelC = 0; int kernelX = 0; int kernelY = 0; - PadMode padMode = PadMode_CAFFE; - int padX = 0; - int padY = 0; int strideX = 0; int strideY = 0; int dilateX = 0; @@ -36,59 +33,71 @@ struct KernelInfo { int activationType = 0; };// +struct Col2ImParameter { + int padX; + int padY; + int dilateX; + int dilateY; + int strideX; + int strideY; + int kernelX; + int kernelY; + int oc; + int ic; + int iw; + int ih; + int ow; + int oh; + int ob; +}; + +struct InputReorderParameter { + int ic_stride; + int ib_stride; + int oc_stride; + int ob_stride; + int hw_size; + int l_size; + int h_size; + int lpack_size; + int hpack_size; +}; + + extern "C" class DeconvSingleInputExecution : public Execution { public: - DeconvSingleInputExecution(Backend* backend, const MNN::Op* op); + struct Resource { + Resource(Backend* bn, const MNN::Op* op); + ~ Resource(); + void* mFilter; + void* mBias; + std::shared_ptr weightTensor; + std::shared_ptr biasTensor; + KernelInfo mKernelInfo; + Backend* mBackend = nullptr; + }; + DeconvSingleInputExecution(Backend* backend, const MNN::Op* op, std::shared_ptr res); virtual ~DeconvSingleInputExecution(); virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; private: - cudnnHandle_t cudnn_handle_; - cudnnTensorDescriptor_t input_desc_; - cudnnTensorDescriptor_t output_desc_; - cudnnFilterDescriptor_t filter_desc_; - cudnnConvolutionBwdDataAlgo_t conv_bwd_algo_; - cudnnConvolutionDescriptor_t conv_desc_; - cudnnTensorDescriptor_t bias_desc_; - cudnnTensorDescriptor_t padded_desc_; - cudnnActivationDescriptor_t act_desc_; + std::shared_ptr mResource; - cudnnDataType_t cudnn_data_type_; - int cudnn_data_type_len_; - bool use_pad_ = false; - int pad_top_ = 0; - int pad_bottom_ = 0; - int pad_left_ = 0; - int pad_right_ = 0; + const Op* mOp = nullptr; + MatMulParam mMatMulParam; + std::pair mGpuMatMulParam; - bool use_bias_ = false; - bool use_relu_ = false; - bool use_relu6_ = false; + Col2ImParameter mCol2ImParamter; + std::pair mGpuCol2ImParam; - void* mPadPtr; - void* mFilter; - void* mBias; - void* mWorkSpace; - std::shared_ptr weightTensor; - std::shared_ptr biasTensor; - std::shared_ptr padTensor; - std::shared_ptr workspaceTensor; + InputReorderParameter mInpReorderParameter; + std::pair 
mGpuInpReorderParam; - std::shared_ptr mPad; - std::shared_ptr mWorkspaceForward; - - size_t input_size_; - size_t filter_size_; - size_t output_size_; - size_t padded_size_; - size_t workspace_size_; - - const MNN::Op* mOp; - KernelInfo mKernelInfo; - IOInfo mIOInfo; - std::shared_ptr mTempInput; + float* mIm2ColBuffer; + __half* mInputBuffer; }; } // namespace CUDA diff --git a/source/backend/cuda/execution/ImageColumn.cu b/source/backend/cuda/execution/ImageColumn.cu new file mode 100644 index 00000000..a50b22bc --- /dev/null +++ b/source/backend/cuda/execution/ImageColumn.cu @@ -0,0 +1,705 @@ +#include "ImageColumn.cuh" +#include "MNNCUDADefine.hpp" +#include "MNNCUDAFunction.cuh" +#include "Raster.cuh" + +#define BLOCK_INT4 2 + +namespace MNN { +namespace CUDA { + +__global__ void Im2Col1x1(const ConvolutionCommon::Im2ColParameter* param, + const MatMulParam* matmulParam, + const float* A, + half* AP, + DivModFast eAlignD, + DivModFast owD, + DivModFast ohD + ) { + int eAlign = matmulParam->elhPack[0] * MATMULPACK; + int lAlign = matmulParam->elhPack[1]; + int maxCount = eAlign * lAlign * BLOCK_INT4; + int kernelCount = 1; + for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { + int index = indexO >> 1; + int lR = indexO & 1; + int eIndex, lIndex; + eAlignD.divmod(index, lIndex, eIndex); + int eU = eIndex >> 4; + int eR = eIndex & 15; + int dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lIndex * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR * 8; + int4* dst = (int4*)(AP + dstOffset); + if (eIndex >= matmulParam->elh[0]) { + *dst = {0, 0, 0, 0}; + continue; + } + // Compute for source + int ox, oy, ob; + owD.divmod(eIndex, oy, ox); + ohD.divmod(oy, ob, oy); + int sz = lIndex; + int sx = ox * param->strideX - param->padX; + int sy = oy * param->strideY - param->padY; + if (sx >= 0 && sx < param->iw) { + if (sy >=0 && sy < param->ih) { + int offset = sz * param->srcZStep + (ob * param->iw * param->ih + sy * param->iw + sx) * PACK_NUMBER + lR * 8; + float2* srcF = (float2*)(A + offset); + half2* dstH = (half2*)dst; + dstH[0] = __float22half2_rn(srcF[0]); + dstH[1] = __float22half2_rn(srcF[1]); + dstH[2] = __float22half2_rn(srcF[2]); + dstH[3] = __float22half2_rn(srcF[3]); + continue; + } + } + *dst = {0, 0, 0, 0}; + } +} + +__global__ void Im2Col1x1_OPT(const ConvolutionCommon::Im2ColParameter* param, + const MatMulParam* matmulParam, + const int maxCount, + const float* A, + half* AP, + DivModFast eAlignD, + DivModFast owD, + DivModFast ohD + ) { + for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { + int index = indexO >> 3; + int lR = indexO & 7; + int eIndex, lIndex; + eAlignD.divmod(index, lIndex, eIndex); + int eU = eIndex >> 4; + int eR = eIndex & 15; + int dstOffset = ((eU * matmulParam->elhPack[1] + lIndex) << 8) + (eR << 4) + (lR << 1); + + int offset = lIndex * param->srcZStep + (eIndex << 4) + (lR << 1); + float2* srcF = (float2*)(A + offset); + half2* dstH = (half2*)(AP + dstOffset); + dstH[0] = __float22half2_rn(srcF[0]); + } +} + +__global__ void Im2Col(const ConvolutionCommon::Im2ColParameter* param, + const MatMulParam* matmulParam, + const float* A, + half* AP) { + int eAlign = matmulParam->elhPack[0] * MATMULPACK; + int lAlign = matmulParam->elhPack[1]; + int maxCount = eAlign * lAlign * BLOCK_INT4; + int kernelCount = param->kernelX * param->kernelY; + for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < 
maxCount; indexO += blockDim.x * gridDim.x) { + int index = indexO / BLOCK_INT4; + int lR = indexO % BLOCK_INT4; + int eIndex = index % eAlign; + int lIndex = index / eAlign; + int eU = eIndex / MATMULPACK; + int eR = eIndex % MATMULPACK; + int dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lIndex * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR * 8; + int4* dst = (int4*)(AP + dstOffset); + if (eIndex >= matmulParam->elh[0]) { + *dst = {0, 0, 0, 0}; + continue; + } + // Compute for source + int ox = eIndex % param->ow; + int oy = eIndex / param->ow; + int ob = oy / param->oh; + oy = oy % param->oh; + int sz = lIndex / kernelCount; + int kI = lIndex % kernelCount; + int ksx = kI % param->kernelX; + int ksy = kI / param->kernelX; + + int sx = ox * param->strideX + ksx * param->dilateX - param->padX; + int sy = oy * param->strideY + ksy * param->dilateY - param->padY; + if (sx >= 0 && sx < param->iw) { + if (sy >=0 && sy < param->ih) { + int offset = sz * param->srcZStep + (ob * param->iw * param->ih + sy * param->iw + sx) * PACK_NUMBER + lR * 8; + float2* srcF = (float2*)(A + offset); + half2* dstH = (half2*)dst; + dstH[0] = __float22half2_rn(srcF[0]); + dstH[1] = __float22half2_rn(srcF[1]); + dstH[2] = __float22half2_rn(srcF[2]); + dstH[3] = __float22half2_rn(srcF[3]); + continue; + } + } + *dst = {0, 0, 0, 0}; + } +} + +__global__ void Im2Col1x1_half(const ConvolutionCommon::Im2ColParameter* param, + const MatMulParam* matmulParam, + const half* A, + half* AP, + DivModFast eAlignD, + DivModFast owD, + DivModFast ohD + ) { +int eAlign = matmulParam->elhPack[0] * MATMULPACK; +int lAlign = matmulParam->elhPack[1]; +int maxCount = eAlign * lAlign * BLOCK_INT4; +int kernelCount = 1; +for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { + int index = indexO / BLOCK_INT4; + int lR = indexO % BLOCK_INT4; + int eIndex, lIndex; + eAlignD.divmod(index, lIndex, eIndex); + int eU = eIndex / MATMULPACK; + int eR = eIndex % MATMULPACK; + int dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lIndex * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR * 8; + int4* dst = (int4*)(AP + dstOffset); + if (eIndex >= matmulParam->elh[0]) { + *dst = {0, 0, 0, 0}; + continue; + } + // Compute for source + int ox, oy, ob; + owD.divmod(eIndex, oy, ox); + ohD.divmod(oy, ob, oy); + int sz = lIndex; + int sx = ox * param->strideX - param->padX; + int sy = oy * param->strideY - param->padY; + if (sx >= 0 && sx < param->iw) { + if (sy >=0 && sy < param->ih) { + int offset = sz * param->srcZStep + (ob * param->iw * param->ih + sy * param->iw + sx) * PACK_NUMBER + lR * 8; + int4* src = (int4*)(A + offset); + *dst = *src; + continue; + } + } + *dst = {0, 0, 0, 0}; +} +} + +__global__ void Im2Col1x1_half_OPT(const ConvolutionCommon::Im2ColParameter* param, +const MatMulParam* matmulParam, +const int maxCount, +const half* A, +half* AP, +DivModFast eAlignD, +DivModFast owD, +DivModFast ohD +) { +for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { + int index = indexO >> 3; + int lR = indexO & 7; + int eIndex, lIndex; + eAlignD.divmod(index, lIndex, eIndex); + int eU = eIndex >> 4; + int eR = eIndex & 15; + int dstOffset = ((eU * matmulParam->elhPack[1] + lIndex) << 8) + (eR << 4) + (lR << 1); + + int offset = lIndex * param->srcZStep + (eIndex << 4) + (lR << 1); + int* srcF = (int*)(A + offset); + int* dstH = (int*)(AP + dstOffset); + dstH[0] = srcF[0]; +} +} 
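+// Layout note (annotation, derived from the index math in this file): AP is a
+// grid of elhPack[0] x elhPack[1] tiles of 16x16 halves, addressed as
+// AP[((eU * elhPack[1] + lIndex) * 16 + eR) * 16 + lane] with eU = eIndex / 16,
+// eR = eIndex % 16 and lane the vector offset within the tile row. Threads
+// store vectorized slices of one row (int4 = 8 halves, int2 = 4 halves,
+// int = 2 halves); the generic paths zero-fill rows past elh[0] and padded
+// pixels, while the _OPT variants assume 16-aligned e and zero padding.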
+ +__global__ void Im2Col_half(const ConvolutionCommon::Im2ColParameter* param, + const MatMulParam* matmulParam, + const int maxCount, + const half* A, + half* AP, + DivModFast d_eA, + DivModFast d_ow, + DivModFast d_oh, + DivModFast d_fxy, + DivModFast d_fx + ) { +int eAlign = matmulParam->elhPack[0] << 4; +int lAlign = matmulParam->elhPack[1]; +int kernelCount = param->kernelX * param->kernelY; +for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { + size_t index = indexO >> 1; + size_t lR = indexO & 1; + int eIndex, lIndex; + d_eA.divmod(index, lIndex, eIndex); + size_t eU = eIndex >> 4; + size_t eR = eIndex & 15; + size_t dstOffset = ((((eU * matmulParam->elhPack[1] + lIndex) << 4) + eR) << 4) + (lR << 3); + int4* dst = (int4*)(AP + dstOffset); + if (eIndex >= matmulParam->elh[0]) { + *dst = {0, 0, 0, 0}; + continue; + } + // Compute for source + int ox, oby, ob, oy, sz, kI, ksx, ksy; + d_ow.divmod(eIndex, oby, ox); + d_oh.divmod(oby, ob, oy); + d_fxy.divmod(lIndex, sz, kI); + d_fx.divmod(kI, ksy, ksx); + + size_t sx = ox * param->strideX + ksx * param->dilateX - param->padX; + size_t sy = oy * param->strideY + ksy * param->dilateY - param->padY; + if (sx >= 0 && sx < param->iw) { + if (sy >=0 && sy < param->ih) { + size_t offset = sz * param->srcZStep + (((ob * param->ih + sy) * param->iw + sx) << 4) + lR * 8; + int4* src = (int4*)(A + offset); + *dst = *src; + continue; + } + } + *dst = {0, 0, 0, 0}; +} +} + +__global__ void Im2Col_half_OPT(const ConvolutionCommon::Im2ColParameter* param, + const MatMulParam* matmulParam, + const size_t maxCount, + const half* A, + half* AP, + DivModFast d_eA, + DivModFast d_ow, + DivModFast d_oh, + DivModFast d_fxy, + DivModFast d_fx +) { +size_t eAlign = matmulParam->elhPack[0] << 4; +size_t lAlign = matmulParam->elhPack[1]; +size_t kernelCount = param->kernelX * param->kernelY; +for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { + size_t index = indexO >> 2; + size_t lR = indexO & 3; + int eIndex, lIndex; + d_eA.divmod(index, lIndex, eIndex); + size_t eU = eIndex >> 4; + size_t eR = eIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex) << 4) + eR) << 4) + (lR << 2); + int2* dst = (int2*)(AP + dstOffset); + if (eIndex >= matmulParam->elh[0]) { + *dst = {0, 0}; + continue; + } + + // Compute for source + int ox, oby, ob, oy, sz, kI, ksx, ksy; + d_ow.divmod(eIndex, oby, ox); + d_oh.divmod(oby, ob, oy); + d_fxy.divmod(lIndex, sz, kI); + d_fx.divmod(kI, ksy, ksx); + + size_t sx = ox * param->strideX + ksx * param->dilateX - param->padX; + size_t sy = oy * param->strideY + ksy * param->dilateY - param->padY; + if (sx >= 0 && sx < param->iw) { + if (sy >=0 && sy < param->ih) { + size_t offset = sz * param->srcZStep + (((ob * param->ih + sy) * param->iw + sx) << 4) + (lR << 2); + int2* src = (int2*)(A + offset); + *dst = *src; + continue; + } + } + *dst = {0, 0}; +} +} + + +__global__ void Im2Col_half_3x3S1D1P1_OPT2(const ConvolutionCommon::Im2ColParameter* param, +const MatMulParam* matmulParam, +const size_t maxCount, +const half* A, +half* AP, +DivModFast d_eA, +DivModFast d_ow, +DivModFast d_oh +) { +for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { +size_t index = indexO >> 3; +size_t lR = indexO & 7; +int eIndex, lIndex; +d_eA.divmod(index, lIndex, eIndex); + +int ix, oby, ob, iy; +d_ow.divmod(eIndex, oby, ix); +d_oh.divmod(oby, 
ob, iy); +size_t sz = lIndex; + +size_t offset = sz * param->srcZStep + (((ob * param->ih + iy) * param->iw + ix) << 4) + (lR << 1); +int src = *((int*)(A + offset)); + +// Pixel (iy-1, ix-1) +if(iy-1 >=0 && ix-1 >=0) { + size_t oeIndex = (ob * param->ih * param->iw + (iy-1) * param->iw + (ix-1)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 8) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy-1 ==0) { + size_t index[3] = {0, 1, 2}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix-1 ==0) { + size_t index[3] = {0, 3, 6}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +// Pixel (iy-1, ix+0) +if(iy-1 >=0) { + size_t oeIndex = (ob * param->ih * param->iw + (iy-1) * param->iw + (ix+0)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 7) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy-1 ==0) { + size_t index[3] = {0, 1, 2}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix ==0) { + size_t index[3] = {0, 3, 6}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix == param->iw-1) { + size_t index[3] = {2, 5, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +// Pixel (iy-1, ix+1) +if(iy-1 >=0 && ix+1 < param->iw) { + size_t oeIndex = (ob * param->ih * param->iw + (iy-1) * param->iw + (ix+1)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 6) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy-1 ==0) { + size_t index[3] = {0, 1, 2}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix+1 == param->iw-1) { + size_t index[3] = {2, 5, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +// Pixel (iy+0, ix-1) +if(ix-1 >=0) { + size_t oeIndex = (ob * param->ih * param->iw + (iy+0) * param->iw + (ix-1)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 5) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy ==0) { + size_t index[3] = {0, 1, 2}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 
index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(iy == param->ih-1) { + size_t index[3] = {6, 7, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix-1 ==0) { + size_t index[3] = {0, 3, 6}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +// Pixel (iy, ix) +if(1) { + size_t oeIndex = (ob * param->ih * param->iw + (iy+0) * param->iw + (ix+0)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 4) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy ==0) { + size_t index[3] = {0, 1, 2}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(iy == param->ih-1) { + size_t index[3] = {6, 7, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix ==0) { + size_t index[3] = {0, 3, 6}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix == param->iw-1) { + size_t index[3] = {2, 5, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +// Pixel (iy, ix+1) +if(ix+1 < param->iw) { + size_t oeIndex = (ob * param->ih * param->iw + (iy+0) * param->iw + (ix+1)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 3) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy ==0) { + size_t index[3] = {0, 1, 2}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(iy == param->ih-1) { + size_t index[3] = {6, 7, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix+1 == param->iw-1) { + size_t index[3] = {2, 5, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +// Pixel (iy+1, ix-1) +if(iy+1 < param->ih && ix-1 >=0) { + size_t oeIndex = (ob * param->ih * param->iw + (iy+1) * param->iw + (ix-1)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 2) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy+1 == param->ih-1) { + size_t index[3] = {6, 7, 8}; + 
for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix-1 ==0) { + size_t index[3] = {0, 3, 6}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +// Pixel (iy+1, ix) +if(iy+1 < param->ih) { + size_t oeIndex = (ob * param->ih * param->iw + (iy+1) * param->iw + (ix+0)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 1) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy+1 == param->ih-1) { + size_t index[3] = {6, 7, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix ==0) { + size_t index[3] = {0, 3, 6}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix == param->iw-1) { + size_t index[3] = {2, 5, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +//Pixel (iy+1, ix+1) +if(iy+1 < param->ih && ix+1 < param->iw) { + size_t oeIndex = (ob * param->ih * param->iw + (iy+1) * param->iw + (ix+1)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 0) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy+1 == param->ih-1) { + size_t index[3] = {6, 7, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix+1 == param->iw-1) { + size_t index[3] = {2, 5, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} +} +} + + + +void Im2ColMain(CUDARuntime* runtime, const MatMulParam* cpuMatlMul, const MatMulParam* gpuMatMul, const ConvolutionCommon::Im2ColParameter* cpuIm2Col, const ConvolutionCommon::Im2ColParameter* gpuIm2Col,\ + const void* input_addr, __half* mIm2ColBuffer, int bytes) { + + size_t eAlign = cpuMatlMul->elhPack[0] * MATMULPACK; + size_t lAlign = cpuMatlMul->elhPack[1]; + + DivModFast eAlignD(eAlign); + DivModFast owD(cpuIm2Col->ow); + DivModFast ohD(cpuIm2Col->oh); + + if (cpuIm2Col->kernelX == 1 && cpuIm2Col->kernelY == 1 && \ + cpuMatlMul->elh[0] % 16 == 0 && \ + cpuIm2Col->strideX == 1 && cpuIm2Col->strideY == 1 && \ + cpuIm2Col->dilateX == 1 && cpuIm2Col->dilateY == 1 && \ + cpuIm2Col->padX == 0 && cpuIm2Col->padY == 0) { + + size_t maxCount = eAlign * lAlign * 8;//Align 2 + int block_num = runtime->blocks_num(maxCount); + int block_size = runtime->threads_num(); + if(bytes == 4) { + Im2Col1x1_OPT<<>>(gpuIm2Col, gpuMatMul, maxCount, + (const float*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD); + checkKernelErrors; + } else { + 
Im2Col1x1_half_OPT<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount, + (const half*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD); + checkKernelErrors; + } + } else if (cpuIm2Col->kernelX == 1 && cpuIm2Col->kernelY == 1) { + size_t maxCount = eAlign * lAlign * 2;//Align 8 + int block_num = runtime->blocks_num(maxCount); + int block_size = runtime->threads_num(); + if(bytes == 4) { + Im2Col1x1<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, (const float*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD); + checkKernelErrors; + } else { + Im2Col1x1_half<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, (const half*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD); + checkKernelErrors; + } + } else if(cpuIm2Col->kernelX == 3 && cpuIm2Col->kernelY == 3 && \ + cpuIm2Col->strideX == 1 && cpuIm2Col->strideY == 1 && \ + cpuIm2Col->dilateX == 1 && cpuIm2Col->dilateY == 1 && \ + cpuIm2Col->padX == 1 && cpuIm2Col->padY == 1 && \ + bytes == 2) { + + size_t maxCount = eAlign * (lAlign / 9) * 8; + size_t block_num = runtime->blocks_num(maxCount); + size_t block_size = runtime->threads_num(); + + //printf("%d-%d-%d-%d-%d, %d-%d\n", cpuIm2Col->icDiv4, cpuIm2Col->ih, cpuIm2Col->iw, cpuIm2Col->oh, cpuIm2Col->ow, eAlign, lAlign); + Im2Col_half_3x3S1D1P1_OPT2<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount, (const half*)input_addr, mIm2ColBuffer,\ + eAlignD, owD, ohD); + checkKernelErrors; + } else { + size_t maxCount = eAlign * lAlign * 2; + size_t block_num = runtime->blocks_num(maxCount); + size_t block_size = runtime->threads_num(); + if(bytes == 4) { + Im2Col<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, (const float*)input_addr, mIm2ColBuffer); + checkKernelErrors; + } else { + + DivModFast fxyD((cpuIm2Col->kernelX*cpuIm2Col->kernelY)); + DivModFast fxD(cpuIm2Col->kernelX); + maxCount = eAlign * lAlign * 4; + block_num = runtime->blocks_num(maxCount); + block_size = runtime->threads_num(); + + //Im2Col_half<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount, (const half*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD, fxyD, fxD); + + Im2Col_half_OPT<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount, (const half*)input_addr, mIm2ColBuffer, + eAlignD, owD, ohD, fxyD, fxD); + checkKernelErrors; + } + } +} + +} // namespace CUDA +} // namespace MNN \ No newline at end of file diff --git a/source/backend/cuda/execution/ImageColumn.cuh b/source/backend/cuda/execution/ImageColumn.cuh new file mode 100644 index 00000000..ec44a1b6 --- /dev/null +++ b/source/backend/cuda/execution/ImageColumn.cuh @@ -0,0 +1,24 @@ +// +// ImageColumn.cuh +// MNN +// +// Created by MNN on 2021/01/10. 
+// Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifndef IMAGE_COLUMN_CUH
+#define IMAGE_COLUMN_CUH
+
+#include "backend/cuda/core/runtime/CUDARuntime.hpp"
+#include "TensorCoreGemm.cuh"
+#include "backend/cuda/core/CUDABackend.hpp"
+
+namespace MNN {
+namespace CUDA {
+
+void Im2ColMain(CUDARuntime* runtime, const MatMulParam* cpuMatlMul, const MatMulParam* gpuMatMul, const ConvolutionCommon::Im2ColParameter* cpuIm2Col, const ConvolutionCommon::Im2ColParameter* gpuIm2Col, const void* input_addr, __half* mIm2ColBuffer, int bytes);
+
+} // namespace CUDA
+} // namespace MNN
+#endif
+
diff --git a/source/backend/cuda/execution/InterpExecution.cu b/source/backend/cuda/execution/InterpExecution.cu
index 11396309..7202b1f5 100644
--- a/source/backend/cuda/execution/InterpExecution.cu
+++ b/source/backend/cuda/execution/InterpExecution.cu
@@ -1,27 +1,51 @@
 #include "InterpExecution.hpp"
+#include "MNNCUDADefine.hpp"
+#include "MNNCUDAFunction.cuh"
+
 namespace MNN {
 namespace CUDA {
-
 #define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
 
 template <typename T>
-__global__ void INTERP(const int n, const int ih, const int iw, const int oh, const int ow,
+__global__ void INTERP_NERAEST(const int n, const int ih, const int iw, const int oh, const int ow,
     const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out) {
-    CUDA_KERNEL_LOOP(index, n) {
+    CUDA_KERNEL_LOOP(total, n) {
+        int index = total / PACK_NUMBER;
+        int remain = total % PACK_NUMBER;
         int x = index % ow;
         int tmp = index / ow;
         int y = tmp % oh;
         int z = tmp / oh;
         int ix = min(max(0, (int)floor((float)x*scalew+offsetw)), iw-1);
         int iy = min(max(0, (int)floor((float)y*scaleh+offseth)), ih-1);
-        out[z*oh*ow + y*ow + x] = in[z*ih*iw + iy*iw + ix];
+        out[(z*oh*ow + y*ow + x) * PACK_NUMBER + remain]
+            = in[(z*ih*iw + iy*iw + ix) * PACK_NUMBER + remain];
     }
 }
+
+template <typename T>
+__global__ void INTERP_NERAEST_ROUND(const int n, const int ih, const int iw, const int oh, const int ow,
+    const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out) {
+    CUDA_KERNEL_LOOP(total, n) {
+        int index = total / PACK_NUMBER;
+        int remain = total % PACK_NUMBER;
+        int x = index % ow;
+        int tmp = index / ow;
+        int y = tmp % oh;
+        int z = tmp / oh;
+        int ix = min(max(0, (int)floor((float)x*scalew+offsetw + 0.499f)), iw-1);
+        int iy = min(max(0, (int)floor((float)y*scaleh+offseth + 0.499f)), ih-1);
+        out[(z*oh*ow + y*ow + x) * PACK_NUMBER + remain]
+            = in[(z*ih*iw + iy*iw + ix) * PACK_NUMBER + remain];
+    }
+}
 
 template <typename T>
 __global__ void INTERP_BILINEAR(const int n, const int ih, const int iw, const int oh, const int ow,
     const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out) {
-    CUDA_KERNEL_LOOP(index, n) {
+    CUDA_KERNEL_LOOP(total, n) {
+        int index = total / PACK_NUMBER;
+        int remain = total % PACK_NUMBER;
         int x = index % ow;
         int tmp = index / ow;
         int y = tmp % oh;
@@ -37,11 +61,97 @@ __global__ void INTERP_BILINEAR(const int n, const int ih, const int iw, const i
         int index_01 = z*ih*iw + iy_0*iw + ix_1;
         int index_10 = z*ih*iw + iy_1*iw + ix_0;
         int index_11 = z*ih*iw + iy_1*iw + ix_1;
+        index_00 = index_00 * PACK_NUMBER + remain;
+        index_01 = index_01 * PACK_NUMBER + remain;
+        index_10 = index_10 * PACK_NUMBER + remain;
+        index_11 = index_11 * PACK_NUMBER + remain;
         float factor_x = fx-ix_0;
         float factor_y = fy-iy_0;
-        out[z*oh*ow + y*ow + x] = (1.0-factor_x)*(1.0-factor_y)*in[index_00] + factor_x*(1.0-factor_y)*in[index_01] +
-            (1.0-factor_x)*factor_y*in[index_10] + factor_x*factor_y*in[index_11];
+        out[(z*oh*ow + y*ow + x) * PACK_NUMBER + remain] =
+            (1.0-factor_x)*(1.0-factor_y)*(float)in[index_00] + factor_x*(1.0-factor_y)*(float)in[index_01]
+            + (1.0-factor_x)*factor_y*(float)in[index_10] + factor_x*factor_y*(float)in[index_11];
+    }
+}
+
+template <typename T>
+__global__ void INTERP_BILINEAR_OPT(const int n, const int ih, const int iw, const int oh, const int ow,
+    const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out,
+    DivModFast d_ow, DivModFast d_oh) {
+    CUDA_KERNEL_LOOP(total, n) {
+        int index = total >> 4;
+        int remain = total & 15;
+
+        int tmp, x_idx, y, z;
+        d_ow.divmod(index, tmp, x_idx);
+        d_oh.divmod(tmp, z, y);
+
+        size_t x = x_idx << 1;
+        float fx = x*scalew+offsetw;
+        int ix_0 = min(max(0, (int)floor(fx)), iw-1);
+        int ix_1 = min((int)ceil(fx), iw-1);
+
+        float fx_1 = fx + scalew;
+        int ix_2 = min(max(0, (int)floor(fx_1)), iw-1);
+        int ix_3 = min((int)ceil(fx_1), iw-1);
+
+        float fy = y*scaleh+offseth;
+        int iy_0 = min(max(0, (int)floor(fy)), ih-1);
+        int iy_1 = min((int)ceil(fy), ih-1);
+
+        int index_00 = (z*ih+ iy_0)*iw + ix_0;
+        int index_01 = index_00 - ix_0 + ix_1;
+        int index_10 = (z*ih+ iy_1)*iw + ix_0;
+        int index_11 = index_10 - ix_0 + ix_1;
+        index_00 = (index_00 << 4) + remain;
+        index_01 = (index_01 << 4) + remain;
+        index_10 = (index_10 << 4) + remain;
+        index_11 = (index_11 << 4) + remain;
+
+        float factor_x = fx-ix_0;
+        float factor_y = fy-iy_0;
+        float in_00 = (float)in[index_00];
+        float in_01 = (float)in[index_01];
+        float in_10 = (float)in[index_10];
+        float in_11 = (float)in[index_11];
+
+        float factor_00 = (1.0-factor_x)*(1.0-factor_y);
+        float factor_01 = factor_x*(1.0-factor_y);
+        float factor_10 = (1.0-factor_x)*factor_y;
+        float factor_11 = factor_x*factor_y;
+
+        size_t dstOffset = (((z*oh+ y)*ow + x) << 4) + remain;
+        out[dstOffset] = \
+            factor_00* in_00 + factor_01*in_01 + \
+            factor_10* in_10 + factor_11*in_11;
+
+        if(x+1 >= ow) {
+            continue;
+        }
+
+        if(ix_2 != ix_0) {
+            index_00 = index_00 + ((ix_2-ix_0) << 4);
+            index_10 = index_10 + ((ix_2-ix_0) << 4);
+            in_00 = (float)in[index_00];
+            in_10 = (float)in[index_10];
+        }
+        if(ix_3 != ix_1) {
+            index_01 = index_01 + ((ix_3-ix_1) << 4);
+            index_11 = index_11 + ((ix_3-ix_1) << 4);
+            in_01 = (float)in[index_01];
+            in_11 = (float)in[index_11];
+        }
+
+        if(factor_x != fx_1-ix_2) {
+            factor_x = fx_1-ix_2;
+            factor_00 = (1.0-factor_x)*(1.0-factor_y);
+            factor_01 = factor_x*(1.0-factor_y);
+            factor_10 = (1.0-factor_x)*factor_y;
+            factor_11 = factor_x*factor_y;
+        }
+        out[dstOffset+ PACK_NUMBER] = \
+            factor_00* in_00 + factor_01*in_01 + \
+            factor_10* in_10 + factor_11*in_11;
     }
 }
 
@@ -70,7 +180,7 @@ ErrorCode InterpExecution::onResize(const std::vector<Tensor *> &inputs, const s
     mOutputHeight = output->height();
     mOutputWidth = output->width();
 
-    mCount = mBatch*mChannel*mOutputHeight*mOutputWidth;
+    mCount = mBatch*UP_DIV(mChannel, PACK_NUMBER)*mOutputHeight*mOutputWidth * PACK_NUMBER;
     //printf("mBatch:%d-mChannel:%d-mInputHeight:%d- mInputWidth:%d- mOutputHeight:%d- mOutputWidth:%d, mScaleHeight:%f- mScaleWidth:%f %f %f\n", mBatch, mChannel, mInputHeight,mInputWidth,mOutputHeight, mOutputWidth, mScaleHeight, mScaleWidth, mWidthOffset, mHeightOffset);
     return NO_ERROR;
 }
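Editor's illustration, not part of the patch: the bilinear kernels above reduce to the usual clamped two-axis interpolation. A host-side reference of the same coordinate math (function name is this sketch's; the PACK_NUMBER channel packing is omitted):

    // Host-side reference for the source-coordinate math in INTERP_BILINEAR.
    // "scale"/"offset" follow the same convention as the kernel arguments.
    #include <algorithm>
    #include <cmath>

    float bilinearSample(const float* in, int ih, int iw, int x, int y,
                         float scaleh, float scalew, float offseth, float offsetw) {
        float fx = x * scalew + offsetw;
        float fy = y * scaleh + offseth;
        int ix0 = std::min(std::max(0, (int)std::floor(fx)), iw - 1);
        int ix1 = std::min((int)std::ceil(fx), iw - 1);
        int iy0 = std::min(std::max(0, (int)std::floor(fy)), ih - 1);
        int iy1 = std::min((int)std::ceil(fy), ih - 1);
        float tx = fx - ix0, ty = fy - iy0; // interpolation weights
        return (1 - tx) * (1 - ty) * in[iy0 * iw + ix0] + tx * (1 - ty) * in[iy0 * iw + ix1]
             + (1 - tx) * ty * in[iy1 * iw + ix0] + tx * ty * in[iy1 * iw + ix1];
    }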
@@ -82,13 +192,39 @@ ErrorCode InterpExecution::onExecute(const std::vector<Tensor *> &inputs, const
     int threads_num = runtime->threads_num();
     auto input_addr = (void*)inputs[0]->deviceId();
     auto output_addr = (void*)outputs[0]->deviceId();
+    if (static_cast<CUDABackend*>(backend())->useFp16()) {
+        if(mResizeType == 1){
+            INTERP_NERAEST<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
+                mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr);
+        } else if(mResizeType == 2) {
+            //INTERP_BILINEAR<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,\
+                mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr);
+
+            mCount = mBatch*UP_DIV(mChannel, PACK_NUMBER)*mOutputHeight*((mOutputWidth+1)/ 2) * PACK_NUMBER;
+            block_num = runtime->blocks_num(mCount);
+            threads_num = runtime->threads_num();
+
+            DivModFast d_ow((mOutputWidth+1)/2);
+            DivModFast d_oh(mOutputHeight);
+            INTERP_BILINEAR_OPT<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,\
+                mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr, d_ow, d_oh);
+
+        } else if (mResizeType == 4) {
+            INTERP_NERAEST_ROUND<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
+                mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr);
+        }
+        return NO_ERROR;
+    }
     if(mResizeType == 1){
-        INTERP<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
+        INTERP_NERAEST<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
             mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const float *)input_addr, (float *)output_addr);
     } else if(mResizeType == 2) {
         INTERP_BILINEAR<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
             mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const float *)input_addr, (float *)output_addr);
+    } else if (mResizeType == 4) {
+        INTERP_NERAEST_ROUND<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
+            mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const float *)input_addr, (float *)output_addr);
     }
     return NO_ERROR;
 }
@@ -98,7 +234,7 @@ public:
     virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op, Backend* backend) const override {
         auto param = op->main_as_Interp();
-        if(param->resizeType() != 1 && param->resizeType() != 2) {
+        if(param->resizeType() == 3) {
             MNN_PRINT("CUDA interp resize type:%d not support, back to CPU\n", param->resizeType());
             return nullptr;
         }
diff --git a/source/backend/cuda/execution/LayerNormExecution.cu b/source/backend/cuda/execution/LayerNormExecution.cu
index 1d9d2e03..b1da15a7 100644
--- a/source/backend/cuda/execution/LayerNormExecution.cu
+++ b/source/backend/cuda/execution/LayerNormExecution.cu
@@ -38,7 +38,7 @@ T blockReduceSum(T val)
 template <typename T>
 __global__
-void input_layernorm(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon, int sumPerKnl)
+void input_layernorm(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon, int sumPerKnl)
 {
     int tid = threadIdx.x;
@@ -60,7 +60,7 @@ void input_layernorm(T* out, const T* input, const T* gamma, const T* beta, int
     float var_tmp = 0.0f;
     for(int idx=0; idx(var_tmp);
     if(threadIdx.x == 0)
@@ -69,14 +69,14 @@ void input_layernorm(T* out, const T* input, const T* gamma, const T* beta, int
     for(int idx=0; idx
 __global__
-void input_layernorm_2048(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
+void input_layernorm_2048(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon)
 {
     int tid = threadIdx.x;
@@ -128,7 +128,7 @@ void input_layernorm_2048(T* out, const T* input, const T* gamma, const T* beta,
 template <typename T>
 __global__
-void input_layernorm_1024(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
+void input_layernorm_1024(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon)
 {
     int tid = threadIdx.x;
@@ -176,7 +176,7 @@ void input_layernorm_1024(T* out, const T* input, const T* gamma, const T* beta,
 template <typename T>
 __global__
-void input_layernorm_512(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
+void input_layernorm_512(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon)
 {
     int tid = threadIdx.x;
@@ -217,25 +217,25 @@ void input_layernorm_512(T* out, const T* input, const T* gamma, const T* beta,
 template <typename T>
 __global__ void LAYERNORM(const int count, const int outside, const int inside, const float epsilon,
-    const T* in, T* out, const T* gamma_data, const T* beta_data) {
+    const T* in, T* out, const float* gamma_data, const float* beta_data) {
     CUDA_KERNEL_LOOP(i, count) {
         const int o = i / inside;
         const int index = i % inside;
         const T* inner_input = in + o * inside;
         T* inner_output = out + o * inside;
-        T sum = 0.f;
+        float sum = 0.f;
         for (int j = 0; j < inside; ++j) {
-            sum += inner_input[j];
+            sum += (float)inner_input[j];
         }
-        T mean = sum / inside;
-        T square_sum = 0.f;
+        float mean = sum / inside;
+        float square_sum = 0.f;
         for (int j = 0; j < inside; ++j) {
-            square_sum += (inner_input[j] - mean) * (inner_input[j] - mean);
+            square_sum += ((float)inner_input[j] - mean) * ((float)inner_input[j] - mean);
         }
-        T variable = square_sum / inside;
+        float variable = square_sum / inside;
         variable = 1.f / sqrt(variable + epsilon);
-        inner_output[index] = (inner_input[index] - mean) * variable * gamma_data[index] + beta_data[index];
+        inner_output[index] = ((float)inner_input[index] - mean) * variable * gamma_data[index] + beta_data[index];
     }
 }
@@ -249,7 +249,7 @@ LayerNormExecution::LayerNormExecution(const LayerNorm* layer_norm_param, Backen
     mEps = layer_norm_param->epsilon();
     int size = layer_norm_param->gamma()->size();
-    mGammaTensor.reset(Tensor::createDevice({size}));
+    mGammaTensor.reset(Tensor::createDevice({size}));
     auto status = backend->onAcquireBuffer(mGammaTensor.get(), Backend::STATIC);
     if (!status) {
         MNN_ERROR("Out of memory when gamma is acquired in CudaLayerNorm.\n");
@@ -262,7 +262,7 @@ LayerNormExecution::LayerNormExecution(const LayerNorm* layer_norm_param, Backen
     if (layer_norm_param->beta()->size() != size) {
         MNN_ERROR("Size of gamma and beta are not match in CudaLayerNorm.\n");
     }
-    mBetaTensor.reset(Tensor::createDevice({size}));
+    mBetaTensor.reset(Tensor::createDevice({size}));
     status = backend->onAcquireBuffer(mBetaTensor.get(), Backend::STATIC);
     if (!status) {
         MNN_ERROR("Out of memory when beta is acquired in CudaLayerNorm.\n");
@@ -274,12 +274,7 @@
 }
 LayerNormExecution::~LayerNormExecution() {
-    if (nullptr != mGammaTensor) {
-        backend()->onReleaseBuffer(mGammaTensor.get(), Backend::STATIC);
-    }
-    if (nullptr != mBetaTensor) {
-        backend()->onReleaseBuffer(mBetaTensor.get(), Backend::STATIC);
-    }
+    // Do nothing
 }
 ErrorCode LayerNormExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
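Editor's illustration, not part of the patch: the hunks above move gamma/beta and every accumulator to float even when the tensor type T is half. A host-side sketch of the per-row normalization LAYERNORM performs; with fp16 accumulators a row sum over a few thousand elements can exceed half's maximum (65504), which is why the patch accumulates in float:

    // Host reference for one LayerNorm row; accumulation stays in float.
    #include <cmath>

    template <typename T>
    void layerNormRow(const T* in, T* out, const float* gamma, const float* beta,
                      int inside, float eps) {
        float sum = 0.f;
        for (int j = 0; j < inside; ++j) sum += (float)in[j];
        float mean = sum / inside;
        float var = 0.f;
        for (int j = 0; j < inside; ++j) {
            float d = (float)in[j] - mean;
            var += d * d;
        }
        float inv = 1.f / std::sqrt(var / inside + eps);
        for (int j = 0; j < inside; ++j)
            out[j] = (T)(((float)in[j] - mean) * inv * gamma[j] + beta[j]);
    }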
@@ -314,6 +309,28 @@ ErrorCode LayerNormExecution::onExecute(const std::vector<Tensor *> &inputs, con
     int threads_num = runtime->threads_num();
     auto input_addr = (void*)inputs[0]->deviceId();
     auto output_addr = (void*)outputs[0]->deviceId();
+    if (static_cast<CUDABackend*>(backend())->useFp16()) {
+        if(mInside < 128) {
+            LAYERNORM<<<block_num, threads_num>>>(mOutside*mInside, mOutside, mInside, mEps, (const half *)input_addr, (half *)output_addr,
+                (const float *)mDeviceGamma, (const float *)mDeviceBeta);
+        } else {
+            if(mInside == 2048) {
+                input_layernorm_2048<<<block_num, threads_num>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
+                    (const float *)mDeviceBeta, mOutside, mInside, mEps);
+            } else if(mInside == 1024) {
+                input_layernorm_1024<<<block_num, threads_num>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
+                    (const float *)mDeviceBeta, mOutside, mInside, mEps);
+            } else if(mInside == 512) {
+                input_layernorm_512<<<block_num, threads_num>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
+                    (const float *)mDeviceBeta, mOutside, mInside, mEps);
+            } else {
+                int sumPerKnl = (mInside+255) / 256;
+                input_layernorm<<<block_num, threads_num>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
+                    (const float *)mDeviceBeta, mOutside, mInside, mEps, sumPerKnl);
+            }
+        }
+        return NO_ERROR;
+    }
     if(mInside < 128) {
         LAYERNORM<<<block_num, threads_num>>>(mOutside*mInside, mOutside, mInside, mEps, (const float *)input_addr, (float *)output_addr,
diff --git a/source/backend/cuda/execution/CUDALoop.cpp b/source/backend/cuda/execution/LoopExecution.cpp
similarity index 88%
rename from source/backend/cuda/execution/CUDALoop.cpp
rename to source/backend/cuda/execution/LoopExecution.cpp
index cedd936e..bb66be80 100644
--- a/source/backend/cuda/execution/CUDALoop.cpp
+++ b/source/backend/cuda/execution/LoopExecution.cpp
@@ -6,7 +6,6 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //
 #include
-#include "BatchMatMulExecution.hpp"
 #include "MatMulExecution.hpp"
 #include "backend/cuda/core/CUDABackend.hpp"
 #include "Raster.cuh"
@@ -34,18 +33,21 @@ public:
         auto cmd = mLoop->commands()->GetAs(0);
         auto op = cmd->op();
         if (OpType_MatMul == op->type() && mLoop->parallel() && mLoop->loopNumber() > 1) {
-            auto& unit = mExecutions[0];
-            unit.exe.reset(new BatchMatMulExecution(op->main_as_MatMul()->transposeA(), op->main_as_MatMul()->transposeB(), backend()));
-            if (nullptr == unit.exe) {
-                return OUT_OF_MEMORY;
-            }
-            unit.inputs = inputs;
-            unit.outputs = outputs;
-            auto code = unit.exe->onResize(unit.inputs, unit.outputs);
-            if (NO_ERROR != code) {
-                return code;
+            if (inputs.size() <= 3) {
+                auto& unit = mExecutions[0];
+                unit.exe.reset(new MatMulExecution(op->main_as_MatMul()->transposeA(), op->main_as_MatMul()->transposeB(), backend()));
+                if (nullptr == unit.exe) {
+                    return OUT_OF_MEMORY;
+                }
+                unit.inputs = inputs;
+                unit.outputs = outputs;
+                auto code = unit.exe->onResize(unit.inputs, unit.outputs);
+                if (NO_ERROR != code) {
+                    return code;
+                }
+                mSingleMatMul = true;
+                return NO_ERROR;
             }
-            return NO_ERROR;
         }
     }
@@ -134,21 +136,22 @@ public:
     virtual ErrorCode onExecute(const std::vector<Tensor *> &originInputs, const std::vector<Tensor *> &originOutputs) override {
         auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
+        if (mSingleMatMul) {
+            auto& unit = mExecutions[0];
+            unit.inputs = originInputs;
+            unit.outputs = originOutputs;
+
+            auto code = unit.exe->onExecute(unit.inputs, unit.outputs);
+            if (NO_ERROR != code) {
+                return code;
+            }
+            return NO_ERROR;
+        }
         if (1 == mLoop->commands()->size()) {
             auto cmd = mLoop->commands()->GetAs(0);
             auto op = cmd->op();
-            if (OpType_MatMul == op->type() && mLoop->parallel() && mLoop->loopNumber() > 1) {
-                auto& unit = mExecutions[0];
-                unit.inputs = originInputs;
-                unit.outputs = originOutputs;
-                auto code = unit.exe->onExecute(unit.inputs, unit.outputs);
-                if (NO_ERROR != code) {
-                    return code;
-                }
-                return NO_ERROR;
-            }
             if (OpType_UnaryOp == op->type() && nullptr == op->main()) {
                 Tensor::InsideDescribe::Region reg;
@@ -160,7 +163,7 @@
                 auto input = mStack[cmd->indexes()->data()[1]];
                 auto inputSize = input->elementSize();
                 auto output = mStack[cmd->indexes()->data()[0]];
-                auto bytes = input->getType().bytes();
+                auto bytes = static_cast<CUDABackend*>(backend())->getBytes(input);
                 auto step0 = cmd->steps()->data()[0];
                 auto step1 = cmd->steps()->data()[1];
                 auto loopNumber = mLoop->loopNumber();
@@ -189,7 +192,7 @@
         for (auto& iter : mIndiceCopy) {
             backend()->onCopyBuffer(iter.first, iter.second);
         }
-        auto bytes = sizeof(float);//TODO: Support Half
+        auto bytes = static_cast<CUDABackend*>(backend())->getBytes(originOutputs[0]);
         for (int iter=0; iter < mLoop->loopNumber(); ++iter) {
             for (int index=0; index < mLoop->commands()->size(); ++index) {
                 auto cmd = mLoop->commands()->GetAs(index);
@@ -205,7 +208,7 @@
                     }
                     auto view = cmd->view()->GetAs(v);
                     offset = offset * cmd->steps()->data()[v] + view->offset();
-                    mStackPtr[tensorIndex] = tensor->deviceId() + offset * bytes;
+                    mStackPtr[tensorIndex] = tensor->deviceId() + offset * static_cast<CUDABackend*>(backend())->getBytes(tensor);
                 }
                 if (OpType_UnaryOp == op->type()) {
                     auto src = (float*)mStackPtr[cmd->indexes()->data()[1]];
@@ -233,6 +236,10 @@
                     continue;
                 }
                 if (OpType_BinaryOp == op->type()) {
+                    auto type = halide_type_of<float>();
+                    if (static_cast<CUDABackend*>(backend())->useFp16()) {
+                        type.bits = 16;
+                    }
                     auto src0 = mStackPtr[cmd->indexes()->data()[1]];
                     auto src1 = mStackPtr[cmd->indexes()->data()[2]];
                     auto dst = mStackPtr[cmd->indexes()->data()[0]];
@@ -242,7 +249,7 @@
                     auto dstStride = cmd->view()->GetAs(0)->stride()->data();
                     BinaryBlit((uint8_t*)dst, (const uint8_t*)src0, (const uint8_t*)src1,
-                        cmd->size()->data(), srcStride0, srcStride1, dstStride, halide_type_of<float>(), runtime, opType);
+                        cmd->size()->data(), srcStride0, srcStride1, dstStride, type, runtime, opType);
                 }
             }
@@ -256,6 +263,7 @@ private:
     std::vector mExecutions;
     std::vector mStackPtr;
    std::map mIndiceCopy;
+    bool mSingleMatMul = false;
 };
 class LoopCreator : public CUDABackend::Creator {
diff --git a/source/backend/cuda/execution/MNNCUDADefine.hpp b/source/backend/cuda/execution/MNNCUDADefine.hpp
new file mode 100644
index 00000000..71992c39
--- /dev/null
+++ b/source/backend/cuda/execution/MNNCUDADefine.hpp
@@ -0,0 +1,18 @@
+#ifndef MNNCUDADEFINE_HPP
+#define MNNCUDADEFINE_HPP
+
+#define PACK_NUMBER 16
+
+#define MNN_CUDA_HALF2_MAX(a, b)                      \
+    do {                                              \
+        (a).x = __hgt((a).x, (b).x) ? (a).x : (b).x;  \
+        (a).y = __hgt((a).y, (b).y) ? (a).y : (b).y;  \
+    } while (0)
+
+#define MNN_CUDA_HALF2_MIN(a, b)                      \
+    do {                                              \
+        (a).x = __hlt((a).x, (b).x) ? (a).x : (b).x;  \
+        (a).y = __hlt((a).y, (b).y) ? (a).y : (b).y;  \
+    } while (0)
+
+#endif
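Editor's illustration, not part of the patch: the half2 macros above do a per-lane max/min with the __hgt/__hlt predicates rather than a branch. A minimal kernel showing the intended usage (kernel name and launch shape are this sketch's; assumes MNNCUDADefine.hpp is included):

    // Elementwise max over half2 pairs using MNN_CUDA_HALF2_MAX.
    #include <cuda_fp16.h>

    __global__ void elementwiseMaxHalf2(const half2* a, const half2* b, half2* out, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            half2 v = a[i];
            half2 w = b[i];
            MNN_CUDA_HALF2_MAX(v, w); // v.x/v.y updated in place to the per-lane max
            out[i] = v;
        }
    }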
diff --git a/source/backend/cuda/execution/MNNCUDAFunction.cuh b/source/backend/cuda/execution/MNNCUDAFunction.cuh
new file mode 100644
index 00000000..9585d60c
--- /dev/null
+++ b/source/backend/cuda/execution/MNNCUDAFunction.cuh
@@ -0,0 +1,38 @@
+#ifndef MNNCUDAFunction_cuh
+#define MNNCUDAFunction_cuh
+
+struct DivModFast {
+    DivModFast(int d = 1)
+    {
+        d_ = (d == 0) ? 1 : d;
+        for (l_ = 0;; ++l_) {
+            if ((1U << l_) >= d_)
+                break;
+        }
+        uint64_t one = 1;
+        uint64_t m = ((one << 32) * ((one << l_) - d_)) / d_ + 1;
+        m_ = static_cast<uint32_t>(m);
+    }
+
+    __device__ __inline__ int div(int idx) const
+    {
+        uint32_t tm = __umulhi(m_, idx); // get the high 32 bits of the product
+        return (tm + idx) >> l_;
+    }
+
+    __device__ __inline__ int mod(int idx) const
+    {
+        return idx - d_ * div(idx);
+    }
+
+    __device__ __inline__ void divmod(int idx, int &quo, int &rem)
+    {
+        quo = div(idx);
+        rem = idx - quo * d_;
+    }
+
+    uint32_t d_; // divisor
+    uint32_t l_; // ceil(log2(d_))
+    uint32_t m_; // m' in the paper
+};
+#endif
\ No newline at end of file
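Editor's illustration, not part of the patch: DivModFast precomputes a "magic" multiplier so kernels can replace integer / and % (slow on GPUs) with a multiply-high and a shift. A quick device-side self-check, verifying the fast path against the hardware operators (harness names are this sketch's):

    // Each thread checks DivModFast against / and % for its own index.
    __global__ void checkDivMod(DivModFast d, int divisor, int n, int* errors) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            int q, r;
            d.divmod(i, q, r);
            if (q != i / divisor || r != i % divisor) atomicAdd(errors, 1);
        }
    }
    // Usage: checkDivMod<<<256, 256>>>(DivModFast(7), 7, 65536, dErrors);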
diff --git a/source/backend/cuda/execution/MatMulExecution.cu b/source/backend/cuda/execution/MatMulExecution.cu
index f285af79..1bca5a98 100644
--- a/source/backend/cuda/execution/MatMulExecution.cu
+++ b/source/backend/cuda/execution/MatMulExecution.cu
@@ -15,12 +15,18 @@ MatMulExecution::~ MatMulExecution() {
 }
 ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
-    auto w0 = inputs[0]->length(1);
-    auto h0 = inputs[0]->length(0);
+    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
     auto C = outputs[0];
+    auto dimensions = C->dimensions();
+    int batch = 1;
+    for (int i = 0; i < dimensions - 2; ++i) {
+        batch *= C->length(i);
+    }
+    auto e = C->length(dimensions-2);
+    auto h = C->length(dimensions-1);
+    auto w0 = inputs[0]->length(dimensions-1);
+    auto h0 = inputs[0]->length(dimensions-2);
 
-    auto e = C->length(0);
-    auto h = C->length(1);
     auto l = w0;
     if (mTransposeA) {
         l = h0;
@@ -29,6 +35,7 @@
     param.elh[0] = e;
     param.elh[1] = l;
     param.elh[2] = h;
+    param.batch = batch;
     auto eU = UP_DIV(e, PACK_MATMUL);
     auto lU = UP_DIV(l, PACK_MATMUL);
     auto hU = UP_DIV(h, PACK_MATMUL);
@@ -58,15 +65,17 @@
     param.cStride[0] = h;
     param.cStride[1] = 0;
     param.cStride[2] = 1;
-    param.split[0] = 1;
-    param.split[1] = 1;
-    param.split[2] = 1;
-    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
+    param.aPStride[0] = 256 * lU;
+    param.aPStride[1] = 16;
+    param.aPStride[2] = 16 * lU;
+    param.bPStride[0] = 256 * lU;
+    param.bPStride[1] = 16;
+    param.bPStride[2] = 16 * lU;
     runtime->memcpy((uint8_t*)mParameters.first + mParameters.second, &param, sizeof(MatMulParam), MNNMemcpyHostToDevice);
     // Alloc for temp buffer
-    auto aPackSize = eU * lU * PACK_MATMUL * PACK_MATMUL;
-    auto bPackSize = lU * hU * PACK_MATMUL * PACK_MATMUL;
+    auto aPackSize = eU * lU * PACK_MATMUL * PACK_MATMUL * batch;
+    auto bPackSize = lU * hU * PACK_MATMUL * PACK_MATMUL * batch;
     auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
     mTempA = pool->alloc(aPackSize * sizeof(__half), false, 256);
@@ -85,6 +94,11 @@
     auto APtr = (const float*)A->deviceId();
     auto BPtr = (const float*)B->deviceId();
     auto CDestPtr = (float*)C->deviceId();
+    int e = mParam.elh[0];
+    int l = mParam.elh[1];
+    int h = mParam.elh[2];
+    int batch = mParam.batch;
+    auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);
     auto aP = (__half*)((uint8_t*)mTempA.first + mTempA.second);
     auto bP = (__half*)((uint8_t*)mTempB.first + mTempB.second);
@@ -93,53 +107,8 @@
         biasPtr = (const float*)inputs[2]->deviceId();
     }
     auto param = (MatMulParam*)((uint8_t*)mParameters.first + mParameters.second);
-    GemmPrepareRerange(runtime, &mParam, param, APtr, aP, BPtr, bP);
-    GemmPackedMain(runtime, &mParam, param, CDestPtr, aP, bP, biasPtr);
-    return NO_ERROR;
-
-    auto blasHandle = runtime->cublas_handle();
-    auto w0 = inputs[0]->length(1);
-    auto h0 = inputs[0]->length(0);
-
-    auto e = C->length(0);
-    auto h = C->length(1);
-    auto l = w0;
-    if (mTransposeA) {
-        l = h0;
-    }
-
-    float alpha = 1.0f;
-    float beta = 0.0f;
-
-    auto tranB = CUBLAS_OP_N;
-    auto ldB = h;
-    if (mTransposeB) {
-        ldB = l;
-        tranB = CUBLAS_OP_T;
-    }
-    auto tranA = CUBLAS_OP_N;
-    auto ldA = l;
-    if (mTransposeA) {
-        ldA = e;
-        tranA = CUBLAS_OP_T;
-    }
-    int block_num = runtime->blocks_num(e*h);
-    int threads_num = runtime->threads_num();
-
-    //[e, l] x [l, h] -> [e, h]
-    auto status = cublasSgemm(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, APtr, ldA, &beta, CDestPtr, h);
-    cublas_check(status);
-    //cudaThreadSynchronize();
-    // } else {
-    //     auto CPtr = (float*)mTempOutput->deviceId();
-    //     auto status = cublasSgemm(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, APtr, ldA, &beta, CPtr, h);
-    //     cublas_check(status);
-    //     //cudaThreadSynchronize();
-
-    //     //bias: [e, h] + [h] -> [e, h]
-    //     add_bias<<<block_num, threads_num>>>((float*)CPtr, (float*)CDestPtr, (const float*)inputs[2]->deviceId(), e, h);
-    // }
-
+    GemmPrepareRerange(runtime, &mParam, param, APtr, aP, BPtr, bP, bytes);
+    GemmPackedMain(runtime, &mParam, param, CDestPtr, aP, bP, biasPtr, bytes, false, false);
     return NO_ERROR;
 }
diff --git a/source/backend/cuda/execution/MatMulExecution.hpp b/source/backend/cuda/execution/MatMulExecution.hpp
index d1aa95b6..4c24b75f 100644
--- a/source/backend/cuda/execution/MatMulExecution.hpp
+++ b/source/backend/cuda/execution/MatMulExecution.hpp
@@ -28,6 +28,7 @@ private:
     std::pair mTempB;
     std::pair mParameters; // In GPU
     MatMulParam mParam; // In CPU
+    bool mUseBlas = false;
 };
 } // namespace CUDA
 } // namespace MNN
diff --git a/source/backend/cuda/execution/PReLUExecution.cu b/source/backend/cuda/execution/PReLUExecution.cu
index c0a80b49..8f3efb22 100644
--- a/source/backend/cuda/execution/PReLUExecution.cu
+++ b/source/backend/cuda/execution/PReLUExecution.cu
@@ -1,62 +1,71 @@
 #include "PReLUExecution.hpp"
+#include "MNNCUDADefine.hpp"
 namespace MNN {
 namespace CUDA {
-
 #define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
 
 template <typename T>
 __global__ void PRELU(const int n, const int channels, const int dim, const T* in, T* out,
-    const T* slopeData, int div_factor) {
-    CUDA_KERNEL_LOOP(index, n) {
+    const float* slopeData, int div_factor) {
+    CUDA_KERNEL_LOOP(t, n) {
+        int index = t / PACK_NUMBER;
+        int r = t % PACK_NUMBER;
         int c = (index / dim) % channels / div_factor;
-        out[index] = in[index] > 0 ? in[index] : in[index]*slopeData[c];
+        float iv = (float)in[t];
+        float ov = iv > 0.0 ? iv : iv * slopeData[c * PACK_NUMBER + r];
+        out[t] = (T)ov;
     }
 }
 PReLUExecution::PReLUExecution(const PRelu* prelu, Backend *backend) : Execution(backend) {
     int slopCount = prelu->slope()->size();
     auto alphaData = prelu->slope()->data();
-    preluTensor.reset(Tensor::createDevice({slopCount}));
-    backend->onAcquireBuffer(preluTensor.get(), Backend::STATIC);
-    mDeviceSlope = (void *)preluTensor.get()->buffer().device;
+    auto staticPool = static_cast<CUDABackend*>(backend)->getStaticBufferPool();
+    auto slopeSize = UP_DIV(slopCount, PACK_NUMBER) * PACK_NUMBER * sizeof(float);
+    mPreluStorage = staticPool->alloc(slopeSize);
+    mDeviceSlope = (uint8_t*)mPreluStorage.first + mPreluStorage.second;
     MNN_ASSERT(nullptr != mDeviceSlope);
+    cudaMemset(mDeviceSlope, 0, slopeSize);
     cudaMemcpy(mDeviceSlope, alphaData, slopCount * sizeof(float), cudaMemcpyHostToDevice);
     mIsChannelShared = slopCount == 1;
-
 }
 PReLUExecution::~PReLUExecution() {
-    if (nullptr != preluTensor) {
-        backend()->onReleaseBuffer(preluTensor.get(), Backend::STATIC);
-    }
+    auto staticPool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
+    staticPool->free(mPreluStorage);
 }
 ErrorCode PReLUExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
     MNN_ASSERT(inputs.size() == 1);
     MNN_ASSERT(outputs.size() == 1);
     auto input = inputs[0];
-    mBatch = input->length(0);
-    mChannel = input->length(1);
     MNN_ASSERT(input->dimensions() >= 2);
-    mArea = 1;
+    mArea = input->length(0);
     for (int i = 2; i < input->dimensions(); ++i) {
         mArea *= input->length(i);
     }
-    mCount = mBatch*mChannel*mArea;
+    mChannel = UP_DIV(input->length(1), PACK_NUMBER);
+    mCount = mChannel*mArea * PACK_NUMBER;
     //printf("mBatch:%d- mChannel:%d- mArea:%d- mCount:%d\n", mBatch,mChannel,mArea, mCount);
     return NO_ERROR;
 }
 ErrorCode PReLUExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
     auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
+    auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);
     int block_num = runtime->blocks_num(mCount);
     int threads_num = runtime->threads_num();
     auto input_addr = (void*)inputs[0]->deviceId();
     auto output_addr = (void*)outputs[0]->deviceId();
     int div_factor = mIsChannelShared ? mChannel : 1;
-    PRELU<<<block_num, threads_num>>>(mCount, mChannel, mArea, (const float *)input_addr, (float *)output_addr,
-        (const float *)mDeviceSlope, div_factor);
+    if (2 == bytes) {
+        PRELU<<<block_num, threads_num>>>(mCount, mChannel, mArea, (const half *)input_addr, (half *)output_addr,
+            (const float *)mDeviceSlope, div_factor);
+    } else {
+        PRELU<<<block_num, threads_num>>>(mCount, mChannel, mArea, (const float *)input_addr, (float *)output_addr,
+            (const float *)mDeviceSlope, div_factor);
+    }
     return NO_ERROR;
 }
diff --git a/source/backend/cuda/execution/PReLUExecution.hpp b/source/backend/cuda/execution/PReLUExecution.hpp
index 785db589..8f121187 100644
--- a/source/backend/cuda/execution/PReLUExecution.hpp
+++ b/source/backend/cuda/execution/PReLUExecution.hpp
@@ -29,11 +29,9 @@ private:
     CUDARuntime *mRuntime;
     void *mDeviceSlope = nullptr;
     int mCount;
-    int mBatch;
     int mChannel;
     int mArea;
-
-    std::shared_ptr<Tensor> preluTensor;
+    std::pair mPreluStorage;
     bool mIsChannelShared = false;
 };
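Editor's illustration, not part of the patch: the fp16 executions in this sync store tensors channel-packed, with groups of PACK_NUMBER (16) channels interleaved as the innermost dimension, which is why the kernels above split a flat thread index into index/PACK_NUMBER and a lane remainder. A sketch of the offset arithmetic (helper name is this sketch's):

    // NC16HW16-style offset: channel c splits into a block c/16 and a lane c%16;
    // the lane is the fastest-moving dimension.
    inline int packedOffset(int b, int c, int y, int x, int channel, int h, int w) {
        const int pack = 16; // PACK_NUMBER
        int cBlock = c / pack, cLane = c % pack;
        int blocks = (channel + pack - 1) / pack; // UP_DIV(channel, pack)
        return (((b * blocks + cBlock) * h + y) * w + x) * pack + cLane;
    }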
diff --git a/source/backend/cuda/execution/PoolExecution.cu b/source/backend/cuda/execution/PoolExecution.cu
index 2ea3bd1c..483c399c 100755
--- a/source/backend/cuda/execution/PoolExecution.cu
+++ b/source/backend/cuda/execution/PoolExecution.cu
@@ -1,90 +1,209 @@
+#include
 #include "PoolExecution.hpp"
+#include
+#include "MNNCUDADefine.hpp"
 namespace MNN {
 namespace CUDA {
-template <typename T>
-__global__ void avgpool(const T* uInput, T* uOutput,
-    int bc,
-    int ih, int iw,
-    int oh, int ow,
-    int padX, int padY,
-    int kernelX, int kernelY,
-    int strideX, int strideY
-    ) {
-    int total = bc * oh * ow;
+#define HALF_MIN half(-65504)
+#define HALF2_MIN half2(-65504, -65504)
+#define MNN_CUDA_HALF2_MAX(a, b)                      \
+    do {                                              \
+        (a).x = __hgt((a).x, (b).x) ? (a).x : (b).x;  \
+        (a).y = __hgt((a).y, (b).y) ? (a).y : (b).y;  \
+    } while (0)
+
+__global__ void maxpool_halfC16(const half* uInput, half* uOutput,
+    int bc,
+    int ih, int iw,
+    int oh, int ow,
+    int padX, int padY,
+    int kernelX, int kernelY,
+    int strideX, int strideY
+    ) {
+    int total = bc * oh * ow * 8;
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
         int x = i % ow;
         int tmp = i / ow;
         int y = tmp % oh;
         int z = tmp / oh;
+        int zC = z / 8;
+        int zR = z % 8;
         int ix = x * strideX - padX;
         int iy = y * strideY - padY;
         int sx = max(0, -ix);
         int sy = max(0, -iy);
         int ex = min(kernelX, iw - ix);
         int ey = min(kernelY, ih - iy);
-        T sumValue = (T)0;
+        float div = (float)(ey-sy)* (float)(ex-sx);
+        half2 sumValue = HALF2_MIN;
        for (int fy=sy; fy
-__global__ void maxpool(const T* uInput, T* uOutput,
-    int bc,
-    int ih, int iw,
-    int oh, int ow,
-    int padX, int padY,
-    int kernelX, int kernelY,
-    int strideX, int strideY
-    ) {
-    int total = bc * oh * ow;
+
+__global__ void avgpool_halfC16(const half* uInput, half* uOutput,
+    int bc,
+    int ih, int iw,
+    int oh, int ow,
+    int padX, int padY,
+    int kernelX, int kernelY,
+    int strideX, int strideY
+    ) {
+    int total = bc * oh * ow * 8;
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
         int x = i % ow;
         int tmp = i / ow;
         int y = tmp % oh;
         int z = tmp / oh;
+        int zC = z / 8;
+        int zR = z % 8;
         int ix = x * strideX - padX;
         int iy = y * strideY - padY;
         int sx = max(0, -ix);
         int sy = max(0, -iy);
         int ex = min(kernelX, iw - ix);
         int ey = min(kernelY, ih - iy);
-        T maxValue = (T)(-1000000);
+        float div = (float)(ey-sy)* (float)(ex-sx);
+        half2 sumValue = half2(0.0f, 0.0f);
+        half2 mulValue = half2(1.0f / div, 1.0f/div);
        for (int fy=sy; fy
 &inputs, const std::vector<Tensor *> &outputs) {
     auto layer = mParameter;
     int strideWidth = layer->strideX();
@@ -128,34 +247,62 @@ ErrorCode PoolExecution::onResize(const std::vector<Tensor *> &inputs, const std
 ErrorCode PoolExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
     auto iw = inputs[0]->width();
     auto ih = inputs[0]->height();
-    auto bc = inputs[0]->batch() * inputs[0]->channel();
+    auto bc = inputs[0]->batch() * UP_DIV(inputs[0]->channel(), PACK_NUMBER);
     auto ow = outputs[0]->width();
     auto oh = outputs[0]->height();
     auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
-    int block_num = runtime->blocks_num(bc * ow * oh);
-    int threads_num = runtime->threads_num();
+    auto& prop = runtime->prop();
+    int threads_num = prop.maxThreadsPerBlock;
+    int block_num = prop.multiProcessorCount;
+    if (static_cast<CUDABackend*>(backend())->useFp16()) {
+        auto inputPtr = (const half*)inputs[0]->deviceId();
+        auto outputPtr = (half*)outputs[0]->deviceId();
+        switch (mPoolType) {
+            case PoolType_AVEPOOL:
+                avgpool_halfC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
+                    bc,
+                    ih, iw,
+                    oh, ow,
+                    mPaddings[0], mPaddings[1],
+                    mKernels[0], mKernels[1],
+                    mStrides[0], mStrides[1]
+                    );
+                return NO_ERROR;
+            case PoolType_MAXPOOL:
+                maxpool_halfC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
+                    bc,
+                    ih, iw,
+                    oh, ow,
+                    mPaddings[0], mPaddings[1],
+                    mKernels[0], mKernels[1],
+                    mStrides[0], mStrides[1]
+                    );
+                return NO_ERROR;
+        }
+        return NO_ERROR;
+    }
     auto inputPtr = (const float*)inputs[0]->deviceId();
     auto outputPtr = (float*)outputs[0]->deviceId();
     switch (mPoolType) {
         case PoolType_AVEPOOL:
-            avgpool<<<block_num, threads_num>>>(inputPtr, outputPtr,
+            avgpool_floatC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
                 bc,
                 ih, iw,
                 oh, ow,
                 mPaddings[0], mPaddings[1],
                 mKernels[0], mKernels[1],
                 mStrides[0], mStrides[1]
-            );
+                );
             return NO_ERROR;
         case PoolType_MAXPOOL:
-            maxpool<<<block_num, threads_num>>>(inputPtr, outputPtr,
+            maxpool_floatC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
                 bc,
                 ih, iw,
                 oh, ow,
                 mPaddings[0], mPaddings[1],
                 mKernels[0], mKernels[1],
                 mStrides[0], mStrides[1]
-            );
+                );
             return NO_ERROR;
     }
     return NOT_SUPPORT;
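Editor's illustration, not part of the patch: the pooling launch above switches from a size-derived block count to a fixed grid (one block per SM, maxThreadsPerBlock threads) and relies on the grid-stride loop inside the kernels to cover the whole problem. The generic pattern, as a sketch:

    // A fixed-size grid walks an arbitrary-sized problem via a grid-stride loop,
    // so the launch shape can come from device properties, not the tensor size.
    __global__ void gridStrideScale(float* data, float s, int total) {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < total;
             i += blockDim.x * gridDim.x) {
            data[i] *= s;
        }
    }
    // e.g. gridStrideScale<<<prop.multiProcessorCount, prop.maxThreadsPerBlock>>>(d, 2.f, n);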
diff --git a/source/backend/cuda/execution/Raster.cu b/source/backend/cuda/execution/Raster.cu
index ec0b4d07..2fcd479c 100644
--- a/source/backend/cuda/execution/Raster.cu
+++ b/source/backend/cuda/execution/Raster.cu
@@ -1,89 +1,22 @@
 #include "Raster.cuh"
 #include "TensorflowOp_generated.h"
+#include
+#include "MNNCUDAFunction.cuh"
+
 namespace MNN {
 namespace CUDA {
-template <typename T>
-__global__ void pack_c4(const T *input, T *output, int inside, int axis, int outside, int axisC4) {
-    int total = inside * axis * outside;
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
-        int x = i % inside;
-        int tmp = i / inside;
-        int y = tmp % axis;
-        int z = tmp / axis;
-        int y4 = y / 4;
-        int yR = y % 4;
-        int dstOffset = 4 * (z * axisC4 * inside + y4 * inside + x) + yR;
-        output[dstOffset] = input[i];
-    }
-}
-
-void PackC4(uint8_t* output, const uint8_t* input, int inside, int axis, int outside, int bytes, CUDARuntime* runtime) {
-    auto packAxis = (axis + 3) / 4;
-    if (axis % 4 != 0) {
-        runtime->memset(output, 0, inside * packAxis * 4 * outside * bytes);
-    }
-    int block_num = runtime->blocks_num(inside * axis * outside);
-    int threads_num = runtime->threads_num();
-    switch (bytes) {
-        case 4:
-            pack_c4<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside, packAxis);
-            break;
-        case 2:
-            pack_c4<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output, inside, axis, outside, packAxis);
-            break;
-        case 1:
-            pack_c4<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output, inside, axis, outside, packAxis);
-            break;
-        default:
-            break;
-    }
-}
-
-template <typename T>
-__global__ void unpack_c4(const T *input, T *output, int inside, int axis, int outside, int axisC4) {
-    int total = inside * axis * outside;
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
-        int x = i % inside;
-        int tmp = i / inside;
-        int y = tmp % axis;
-        int z = tmp / axis;
-        int y4 = y / 4;
-        int yR = y % 4;
-        int srcOffset = 4 * (z * axisC4 * inside + y4 * inside + x) + yR;
-        output[i] = input[srcOffset];
-    }
-}
-void UnpackC4(uint8_t* output, const uint8_t* input, int inside, int axis, int outside, int bytes, CUDARuntime* runtime) {
-    auto packAxis = (axis + 3) / 4;
-    int block_num = runtime->blocks_num(inside * axis * outside);
-    int threads_num = runtime->threads_num();
-    switch (bytes) {
-        case 4:
-            unpack_c4<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside, packAxis);
-            break;
-        case 2:
-            unpack_c4<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output, inside, axis, outside, packAxis);
-            break;
-        case 1:
-            unpack_c4<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output, inside, axis, outside, packAxis);
-            break;
-        default:
-            break;
-    }
-}
-
 // Blit don't care offset
 template <typename T>
 __global__ void blitRegion(const T *inputO, T *outputO,
-    int loopCount,
-    const int32_t* dstIndice, const int32_t* srcIndice,
-    int dstUseIndice, int srcUseIndice,
-    int dstStep, int srcStep,int srcLimit,
-    int sizeZ, int sizeY, int sizeX,
-    int strideZ, int strideY, int strideX,
-    int dstStrideZ, int dstStrideY, int dstStrideX
-    ) {
+        int loopCount,
+        const int32_t* dstIndice, const int32_t* srcIndice,
+        int dstUseIndice, int srcUseIndice,
+        int dstStep, int srcStep,int srcLimit,
+        int sizeZ, int sizeY, int sizeX,
+        int strideZ, int strideY, int strideX,
+        int dstStrideZ, int dstStrideY, int dstStrideX
+        ) {
     int total = loopCount;
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
         int srcOffsetO = i * srcStep;
@@ -162,29 +95,66 @@ void BlitWithIndice(uint8_t* output, const uint8_t* input, const int32_t* dstInd
 #define UNARY_FUNC(Name, Func)\
 template<typename T>\
 __global__ void Name(const T *input, T *output,\
-    int sizeZ, int sizeY, int sizeX,\
+    int count,\
+    DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,\
     int strideZ, int strideY, int strideX,\
     int dstStrideZ, int dstStrideY, int dstStrideX\
     ) { \
-  int count = sizeZ * sizeY * sizeX;\
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
-    int total = sizeZ * sizeY * sizeX;\
-    int ix = i % sizeX;\
-    int tmp = i / sizeX;\
-    int iy = tmp % sizeY;\
-    int iz = tmp / sizeY;\
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {\
+    int ix, tmp, iy, iz;\
+    sizeX.divmod(i, tmp, ix);\
+    sizeY.divmod(tmp, iz, iy);\
    int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
    int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
    T x = input[srcOffset];\
    output[dstOffset] = Func;\
  }\
}\
+template<typename T>\
+__global__ void FLOAT##Name(const T *input, T *output,\
+    int count,\
+    DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,\
+    int strideZ, int strideY, int strideX,\
+    int dstStrideZ, int dstStrideY, int dstStrideX\
+    ) { \
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {\
+    int ix, tmp, iy, iz;\
+    sizeX.divmod(i, tmp, ix);\
+    sizeY.divmod(tmp, iz, iy);\
+    int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
+    int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
+    float x = (float)input[srcOffset];\
+    output[dstOffset] = (float)(Func);\
+  }\
+}\
+
+template <typename T>
+__global__ void blit_2(const T *input, T *output,
+    int count,
+    DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
+    int strideZ, int strideY,
+    int dstStrideZ, int dstStrideY
+    ) {
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
        int ix, tmp, iy, iz;
        sizeX.divmod(i, tmp, ix);
        sizeY.divmod(tmp, iz, iy);
        int srcOffset = iz * strideZ + iy * strideY + (ix << 1);
        int dstOffset = iz * dstStrideZ + iy * dstStrideY + (ix << 1);
        int2 * dstF = (int2 *)(output+dstOffset);
        dstF[0] = ((int2 *)(input+srcOffset))[0];
    }
}

+struct Bytes512 {
+    int4 x[4];
+};
 UNARY_FUNC(blit, x);
 UNARY_FUNC(ABS, abs(x));
 UNARY_FUNC(EXP, exp(x));
 UNARY_FUNC(NEG, -x);
-UNARY_FUNC(RECIPROCAL, (T)(1.0)/x);
+UNARY_FUNC(RECIPROCAL, (1.0)/x);
 UNARY_FUNC(FLOOR, floor(x));
 UNARY_FUNC(CEIL, ceil(x));
 UNARY_FUNC(SQUARE, x*x);
@@ -212,27 +182,68 @@
 UNARY_FUNC(HARDSWISH, 1.0/6.0 * x * min(max(x+3.0, 0.0), 6.0));
 UNARY_FUNC(ERF, erf(x));
 UNARY_FUNC(ERFC, erfc(x));
 UNARY_FUNC(ERFINV, erfinv(x));
+UNARY_FUNC(GELU, (1.0f + tanh(0.79788458f * (0.044715f * x * x * x + x))) * x * 0.5f);
+UNARY_FUNC(GELU_STANDARD, (erf(x*0.7071067932881648f)+1.f)*x*0.5);
 
 void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime) {
     int count = size[0] * size[1] * size[2];
+
+    DivModFast sz(size[0]);
+    DivModFast sy(size[1]);
+    DivModFast sx(size[2]);
+
+    //printf("%d-%d-%d, %d-%d-%d,-%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
+    if(bytes == 4 && count > 16384 && size[2] % 2 == 0 && srcStride[2] == 1 && dstStride[2] == 1) {
+        //printf("%d-%d-%d, %d-%d-%d,-%d-%d-%d\n\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
+        count /= 2;
+        int block_num = runtime->blocks_num(count);
+        int threads_num = runtime->threads_num();
+        DivModFast sx_2((size[2]/2));
+
+        blit_2<<<block_num, threads_num>>>((const float*)input, (float*)output,
+            count,
+            sz, sy, sx_2,
+            srcStride[0], srcStride[1],
+            dstStride[0], dstStride[1]);
+        return;
+    }
+
     int block_num = runtime->blocks_num(count);
     int threads_num = runtime->threads_num();
+
     switch (bytes) {
+        case 64:
+            blit<<<block_num, threads_num>>>((const Bytes512*)input, (Bytes512*)output,
+                count,
+                sz, sy, sx,
+                srcStride[0], srcStride[1], srcStride[2],
+                dstStride[0], dstStride[1], dstStride[2]);
+            break;
+        case 32:
+            blit<<<block_num, threads_num>>>((const double4*)input, (double4*)output,
+                count,
+                sz, sy, sx,
+                srcStride[0], srcStride[1], srcStride[2],
+                dstStride[0], dstStride[1], dstStride[2]);
+            break;
         case 4:
             blit<<<block_num, threads_num>>>((const float*)input, (float*)output,
-                size[0], size[1], size[2],
+                count,
+                sz, sy, sx,
                 srcStride[0], srcStride[1], srcStride[2],
                 dstStride[0], dstStride[1], dstStride[2]);
             break;
         case 2:
            blit<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output,
-                size[0], size[1], size[2],
+                count,
+                sz, sy, sx,
                 srcStride[0], srcStride[1], srcStride[2],
                 dstStride[0], dstStride[1], dstStride[2]);
             break;
         case 1:
             blit<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output,
-                size[0], size[1], size[2],
+                count,
+                sz, sy, sx,
                 srcStride[0], srcStride[1], srcStride[2],
                 dstStride[0], dstStride[1], dstStride[2]);
             break;
@@ -241,59 +252,131 @@
     }
 }
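Editor's illustration, not part of the patch: blit_2, the double4 case, and Bytes512 above all widen the element type so each memory instruction moves more bytes. The trick in isolation, with its alignment requirement spelled out (kernel name is this sketch's):

    // Copying through int4 moves 16 bytes per transaction, but both pointers
    // must be 16-byte aligned and the float count divisible by 4.
    __global__ void copyFloat4(const float* src, float* dst, int count4) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < count4) {
            ((int4*)dst)[i] = ((const int4*)src)[i]; // 4 floats at once
        }
    }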
-template <typename T>
-__global__ void fuseblit(const T *input, T *output,
-    int fuseNum, const int32_t* sliceOffset,
-    int sizeZ, int sizeY, int sizeX,
-    int strideZ, int strideY, int strideX,
-    int dstStrideZ, int dstStrideY, int dstStrideX
-    ) {
-    int count = fuseNum*sizeZ * sizeY * sizeX;
-
-    for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < (count); c += blockDim.x * gridDim.x) {
-        int j = c / (sizeZ * sizeY * sizeX);
-        int i = c % (sizeZ * sizeY * sizeX);
-        int ix = i % sizeX;
-        int tmp = i / sizeX;
-        int iy = tmp % sizeY;
-        int iz = tmp / sizeY;
+template <typename T0, typename T1>
+__global__ void fuseblit(const T0 *input, T1 *output,
+    int fuseNum, int count, const int32_t* sliceOffset,
+    DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
+    int strideZ, int strideY, int strideX,
+    int dstStrideZ, int dstStrideY, int dstStrideX
+    ) {
+    for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < count; c += blockDim.x * gridDim.x) {
+        int ix, tmp, iy, tmp2, iz, j;
+        sizeX.divmod(c, tmp, ix);
+        sizeY.divmod(tmp, tmp2, iy);
+        sizeZ.divmod(tmp2, j, iz);
         int src_offset = sliceOffset[j] + iz * strideZ + iy * strideY + ix * strideX;
         int dst_offset = sliceOffset[fuseNum+j] + iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;
         output[dst_offset] = input[src_offset];
     }
+}
+template <typename T0, typename T1>
+__global__ void fuseblit_4(const T0 *input, T1 *output,
+    int fuseNum, int count, const int32_t* sliceOffset,
+    DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
+    int strideZ, int strideY,
+    int dstStrideZ, int dstStrideY
+    ) {
+    for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < count; c += blockDim.x * gridDim.x) {
+        int ix, tmp, iy, tmp2, iz, j;
+        sizeX.divmod(c, tmp, ix);
+        sizeY.divmod(tmp, tmp2, iy);
+        sizeZ.divmod(tmp2, j, iz);
+        int src_offset = sliceOffset[j] + iz * strideZ + iy * strideY + (ix << 2);
+        int dst_offset = sliceOffset[fuseNum+j] + iz * dstStrideZ + iy * dstStrideY + (ix << 2);
+        int4* srcF = (int4 *)(input + src_offset);
+        int4* dstF = (int4 *)(output + dst_offset);
+        dstF[0] = srcF[0];
+    }
+}
+template <typename T0, typename T1>
+__global__ void fuseblit_half_4(const T0 *input, T1 *output,
+    int fuseNum, int count, const int32_t* sliceOffset,
+    DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
+    int strideZ, int strideY,
+    int dstStrideZ, int dstStrideY
+    ) {
+    for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < count; c += blockDim.x * gridDim.x) {
+        int ix, tmp, iy, tmp2, iz, j;
+        sizeX.divmod(c, tmp, ix);
+        sizeY.divmod(tmp, tmp2, iy);
+        sizeZ.divmod(tmp2, j, iz);
+        int src_offset = sliceOffset[j] + iz * strideZ + iy * strideY + (ix << 2);
+        int dst_offset = sliceOffset[fuseNum+j] + iz * dstStrideZ + iy * dstStrideY + (ix << 2);
+        int2* srcF = (int2 *)(input + src_offset);
+        int2* dstF = (int2 *)(output + dst_offset);
+        dstF[0] = srcF[0];
+    }
 }
 void FuseRasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int fuseNum, void* sliceOffset, int bytes, CUDARuntime* runtime) {
-    int count = size[0] * size[1] * size[2];
+    DivModFast sz(size[0]);
+    DivModFast sy(size[1]);
+    DivModFast sx(size[2]);
+
+    int count = fuseNum * size[0] * size[1] * size[2];
+    if(size[2] % 4 == 0 && count > 16384 && srcStride[2] == 1 && dstStride[2] == 1) {
+        //printf("%d-%d-%d, %d-%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], dstStride[0], dstStride[1]);
+        int count = fuseNum * size[0] * size[1] * size[2] / 4;
+        int numBlocks = runtime->blocks_num(count);
+        int threadsPerBlock = runtime->threads_num();
+        DivModFast sx_4((size[2]/4));
+
+        if(bytes == 4) {
+            fuseblit_4<<<numBlocks, threadsPerBlock>>>((const float*)input, (float*)output,
+                fuseNum, count, (const int32_t*)sliceOffset,
+                sz, sy, sx_4,
+                srcStride[0], srcStride[1],
+                dstStride[0], dstStride[1]);
+            return;
+        } else if(bytes == 2){
+            fuseblit_half_4<<<numBlocks, threadsPerBlock>>>((const half*)input, (half*)output,
+                fuseNum, count, (const int32_t*)sliceOffset,
+                sz, sy, sx_4,
+                srcStride[0], srcStride[1],
+                dstStride[0], dstStride[1]);
+            return;
+        }
+    }
+
     int block_num = runtime->blocks_num(count);
     int threads_num = runtime->threads_num();
-    int numBlocks = block_num;
-    int threadsPerBlock = threads_num;
-    // dim3 numBlocks(block_num, fuseNum);
-    // dim3 threadsPerBlock(threads_num, 1);
-
     switch (bytes) {
+        case 64:
+            fuseblit<<<block_num, threads_num>>>((const Bytes512*)input, (Bytes512*)output,
+                fuseNum, count, (const int32_t*)sliceOffset,
+                sz, sy, sx,
+                srcStride[0], srcStride[1], srcStride[2],
+                dstStride[0], dstStride[1], dstStride[2]);
+            break;
+        case 16:
+            fuseblit<<<block_num, threads_num>>>((const int4*)input, (int4*)output,
+                fuseNum, count, (const int32_t*)sliceOffset,
+                sz, sy, sx,
+                srcStride[0], srcStride[1], srcStride[2],
+                dstStride[0], dstStride[1], dstStride[2]);
+            break;
         case 4:
-            fuseblit<<<numBlocks, threadsPerBlock>>>((const float*)input, (float*)output,
-                fuseNum, (const int32_t*)sliceOffset,
-                size[0], size[1], size[2],
+            fuseblit<<<block_num, threads_num>>>((const float*)input, (float*)output,
+                fuseNum, count, (const int32_t*)sliceOffset,
+                sz, sy, sx,
                 srcStride[0], srcStride[1], srcStride[2],
                 dstStride[0], dstStride[1], dstStride[2]);
             break;
         case 2:
-            fuseblit<<<numBlocks, threadsPerBlock>>>((const int16_t*)input, (int16_t*)output,
-                fuseNum, (const int32_t*)sliceOffset,
-                size[0], size[1], size[2],
+            fuseblit<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output,
+                fuseNum, count, (const int32_t*)sliceOffset,
+                sz, sy, sx,
                 srcStride[0], srcStride[1], srcStride[2],
                 dstStride[0], dstStride[1], dstStride[2]);
             break;
         case 1:
-            fuseblit<<<numBlocks, threadsPerBlock>>>((const int8_t*)input, (int8_t*)output,
-                fuseNum, (const int32_t*)sliceOffset,
-                size[0], size[1], size[2],
+            fuseblit<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output,
+                fuseNum, count, (const int32_t*)sliceOffset,
+                sz, sy, sx,
                 srcStride[0], srcStride[1], srcStride[2],
                 dstStride[0], dstStride[1], dstStride[2]);
            break;
@@ -303,18 +386,112 @@ void FuseRasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size,
     //printf("%s, %d-%d-%d-%d\n", cudaGetErrorString(cudaGetLastError()), numBlocks.x, numBlocks.y, threadsPerBlock.x, threadsPerBlock.y);
 }
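Editor's illustration, not part of the patch: FuseRasterBlit batches many raster regions into a single launch by indexing a per-region offset table. Per the fuseblit indexing above, entry j is region j's source offset and entry fuseNum + j its destination offset. A host-side sketch of building that table (variable names are this sketch's):

    #include <cstdint>
    #include <vector>

    std::vector<int32_t> buildSliceOffsets(const std::vector<int32_t>& srcOffsets,
                                           const std::vector<int32_t>& dstOffsets) {
        int fuseNum = (int)srcOffsets.size();
        std::vector<int32_t> table(2 * fuseNum);
        for (int j = 0; j < fuseNum; ++j) {
            table[j] = srcOffsets[j];           // read side for region j
            table[fuseNum + j] = dstOffsets[j]; // write side for region j
        }
        return table; // upload with cudaMemcpy before launching fuseblit
    }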
+template <typename T0, typename T1>
+__global__ void fuseblitLimit(const T0 *input, T1 *output,
+    const FuseRegion* info, const int32_t* sliceOffset
+    ) {
+    int sizeZ = info->size[0];
+    int sizeY = info->size[1];
+    int sizeX = info->size[2];
+    int strideZ = info->srcStride[0];
+    int strideY = info->srcStride[1];
+    int strideX = info->srcStride[2];
+    int dstStrideZ = info->dstStride[0];
+    int dstStrideY = info->dstStride[1];
+    int dstStrideX = info->dstStride[2];
+    int fuseNum = info->fuseNumber;
+
+    int count = fuseNum*sizeZ * sizeY * sizeX;
+
+    for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < (count); c += blockDim.x * gridDim.x) {
+        int j = c / (sizeZ * sizeY * sizeX);
+        int i = c % (sizeZ * sizeY * sizeX);
+        int ix = i % sizeX;
+        int tmp = i / sizeX;
+        int iy = tmp % sizeY;
+        int iz = tmp / sizeY;
+        const int* srcOffsetPtr = sliceOffset + 8 * j;
+        const int* dstOffsetPtr = sliceOffset + 8 * j + 4;
+        T0 srcValue = (T0)0;
+        int src_offset = srcOffsetPtr[3] + iz * strideZ + iy * strideY + ix * strideX;
+        if (srcOffsetPtr[0] > iz && srcOffsetPtr[1] > iy && srcOffsetPtr[2] > ix) {
+            srcValue = input[src_offset];
+        }
+        int dst_offset = dstOffsetPtr[3] + iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;
+        //printf("%d -> %d - %f\n", src_offset, dst_offset, srcValue);
+        if (dstOffsetPtr[0] > iz && dstOffsetPtr[1] > iy && dstOffsetPtr[2] > ix) {
+            output[dst_offset] = srcValue;
+        }
+    }
+}
+void FuseRasterBlitFloatToHalf(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime) {
+    auto& prop = runtime->prop();
+    int threads_num = prop.maxThreadsPerBlock;
+    int block_num = prop.multiProcessorCount;
+    fuseblitLimit<<<block_num, threads_num>>>((const float*)input, (half*)output,
+        info, (const int32_t*)sliceOffset);
+}
+void FuseRasterBlitHalfToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime) {
+    auto& prop = runtime->prop();
+    int threads_num = prop.maxThreadsPerBlock;
+    int block_num = prop.multiProcessorCount;
+    fuseblitLimit<<<block_num, threads_num>>>((const half*)input, (float*)output,
+        info, (const int32_t*)sliceOffset);
+}
+void FuseRasterBlitFloatToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime) {
+    auto& prop = runtime->prop();
+    int threads_num = prop.maxThreadsPerBlock;
+    int block_num = prop.multiProcessorCount;
+    fuseblitLimit<<<block_num, threads_num>>>((const float*)input, (float*)output,
+        info, (const int32_t*)sliceOffset);
+}
+
+void FuseRasterBlitCommon(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime, int bytes) {
+    auto& prop = runtime->prop();
+    int threads_num = prop.maxThreadsPerBlock;
+    int block_num = prop.multiProcessorCount;
+    switch (bytes) {
+        case 4:
+            fuseblitLimit<<<block_num, threads_num>>>((const float*)input, (float*)output,
+                info, (const int32_t*)sliceOffset);
+            break;
+        case 2:
+            fuseblitLimit<<<block_num, threads_num>>>((const half*)input, (half*)output,
+                info, (const int32_t*)sliceOffset);
+            break;
+        case 1:
+            fuseblitLimit<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output,
+                info, (const int32_t*)sliceOffset);
+            break;
+        default:
+            break;
+    }
+}
+
 void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
     int count = size[0] * size[1] * size[2];
     int block_num = runtime->blocks_num(count);
     int threads_num = runtime->threads_num();
+    DivModFast sz(size[0]);
+    DivModFast sy(size[1]);
+    DivModFast sx(size[2]);
     // TODO: Support FP16
-    MNN_ASSERT(bytes==4);
 #define COMPUTE(TYPE)\
     if (opType == MNN::UnaryOpOperation_##TYPE ) {\
+        if(bytes==2) {\
+            FLOAT##TYPE<<<block_num, threads_num>>>((const half*)input, (half*)output,\
+                count, \
+                sz, sy, sx,\
                srcStride[0], srcStride[1], srcStride[2],\
                dstStride[0], dstStride[1], dstStride[2]);\
+        } else {\
+            TYPE<<<block_num, threads_num>>>((const float*)input, (float*)output,\
+                count, \
+                sz, sy, sx,\
+                srcStride[0], srcStride[1], srcStride[2],\
+                dstStride[0], dstStride[1], dstStride[2]);\
+        }\
         return;\
     }\
 
@@ -330,6 +507,8 @@ void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const
     COMPUTE(SIN);
     COMPUTE(COS);
     COMPUTE(TAN);
+    COMPUTE(GELU);
+    COMPUTE(GELU_STANDARD);
     COMPUTE(ASIN);
     COMPUTE(ACOS);
     COMPUTE(ATAN);
@@ -356,26 +535,126 @@ void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const
 #define BINARY_FUNC(Name, Func)\
 template<typename TIn, typename TOut>\
 __global__ void Binary##Name(\
-    const TIn *input0, const TIn* input1, TOut *output,\
-    int sizeZ, int sizeY, int sizeX,\
-    int strideZ, int strideY, int strideX,\
-    int strideZ1, int strideY1, int strideX1,\
-    int dstStrideZ, int dstStrideY, int dstStrideX\
-    ) { \
-  int count = sizeZ * sizeY * sizeX;\
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
-    int total = sizeZ * sizeY * sizeX;\
-    int ix = i % sizeX;\
-    int tmp = i / sizeX;\
-    int iy = tmp % sizeY;\
-    int iz = tmp / sizeY;\
-    int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
-    int srcOffset1 = iz * strideZ1 + iy * strideY1 + ix * strideX1;\
-    int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
-    TIn x = input0[srcOffset];\
-    TIn y = input1[srcOffset1];\
-    output[dstOffset] = (TOut)Func;\
-  }\
+    const TIn *input0, const TIn* input1, TOut *output,\
+    int sizeZ, int sizeY, int sizeX,\
+    int strideZ, int strideY, int strideX,\
+    int strideZ1, int strideY1, int strideX1,\
+    int dstStrideZ, int dstStrideY, int dstStrideX\
+    ) { \
+    int count = sizeZ * sizeY * sizeX;\
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
+        int total = sizeZ * sizeY * sizeX;\
+        int ix = i % sizeX;\
+        int tmp = i / sizeX;\
+        int iy = tmp % sizeY;\
+        int iz = tmp / sizeY;\
+        int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
+        int srcOffset1 = iz * strideZ1 + iy * strideY1 + ix * strideX1;\
+        int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
+        TIn x = input0[srcOffset];\
+        TIn y = input1[srcOffset1];\
+        output[dstOffset] = (TOut)Func;\
+    }\
+}\
+
+#define BINARY_FUNC_FLOATMID(Name, Func)\
+template<typename TIn, typename TOut>\
+__global__ void BinaryMid##Name(\
+    const TIn *input0, const TIn* input1, TOut *output,\
+    int sizeZ, int sizeY, int sizeX,\
+    int strideZ, int strideY, int strideX,\
+    int strideZ1, int strideY1, int strideX1,\
+    int dstStrideZ, int dstStrideY, int dstStrideX\
+    ) { \
+    int count = sizeZ * sizeY * sizeX;\
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
+        int total = sizeZ * sizeY * sizeX;\
+        int ix = i % sizeX;\
+        int tmp = i / sizeX;\
+        int iy = tmp % sizeY;\
+        int iz = tmp / sizeY;\
+        int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
+        int srcOffset1 = iz * strideZ1 + iy * strideY1 + ix * strideX1;\
+        int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
+        float x = input0[srcOffset];\
+        float y = input1[srcOffset1];\
+        output[dstOffset] = (TOut)(Func);\
+    }\
+}\
+template<typename TIn, typename TOut>\
+__global__ void BinaryMidLinear##Name(\
+    const TIn *input0, const TIn* input1, TOut *output,\
+    int sizeZ,\
+    int strideZ,\
+    int strideZ1,\
+    int dstStrideZ\
+    ) { \
+    int count = sizeZ;\
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
+        int iz = i;\
+        int srcOffset = iz * strideZ;\
+        int srcOffset1 = iz * strideZ1;\
+        int dstOffset = iz * dstStrideZ;\
+        float x = input0[srcOffset];\
+        float y = input1[srcOffset1];\
+        output[dstOffset] = (TOut)(Func);\
+    }\
+}\
+
+#define BINARY_FUNC_FLOATMID4(Name, Func)\
+template<typename TIn, typename TOut>\
+__global__ void BinaryMidLinear4_##Name(\
+    const TIn *input0, const TIn* input1, TOut *output,\
+    int count_4\
+    ) { \
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count_4); i += blockDim.x * gridDim.x) {\
+        int iz = i;\
+        int srcOffset = iz << 2;\
+        int srcOffset1 = iz << 2;\
+        int dstOffset = iz << 2;\
+        float4 xx = ((float4 *)(input0+srcOffset))[0];\
+        float4 yy = ((float4 *)(input1+srcOffset1))[0];\
+        float x = xx.x;\
+        float y = yy.x;\
+        output[dstOffset] = (TOut)(Func);\
+        x = xx.y;\
+        y = yy.y;\
+        output[dstOffset+1] = (TOut)(Func);\
+        x = xx.z;\
+        y = yy.z;\
+        output[dstOffset+2] = (TOut)(Func);\
+        x = xx.w;\
+        y = yy.w;\
+        output[dstOffset+3] = (TOut)(Func);\
+    }\
+}\
+template<typename TIn, typename TOut>\
+__global__ void BinaryMidLinearHalf4_##Name(\
+    const TIn *input0, const TIn* input1, TOut *output,\
+    int count_4\
+    ) { \
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count_4); i += blockDim.x * gridDim.x) {\
+        int iz = i;\
+        int srcOffset = iz << 2;\
+        int srcOffset1 = iz << 2;\
+        int dstOffset = iz << 2;\
+        half2 xx = ((half2 *)(input0+srcOffset))[0];\
+        half2 yy = ((half2 *)(input1+srcOffset1))[0];\
+        float x = (float)xx.x;\
+        float y = (float)yy.x;\
+        output[dstOffset] = (TOut)(Func);\
+        x = (float)xx.y;\
+        y = (float)yy.y;\
+        output[dstOffset+1] = (TOut)(Func);\
+        xx = ((half2 *)(input0+srcOffset))[1];\
+        yy = ((half2 *)(input1+srcOffset1))[1];\
+        x = (float)xx.x;\
+        y = (float)yy.x;\
+        output[dstOffset+2] = (TOut)(Func);\
+        x = (float)xx.y;\
+        y = (float)yy.y;\
+        output[dstOffset+3] = (TOut)(Func);\
+    }\
}\
 
 #define sign(y) ((y) > 0 ? 1 : ((y) < 0 ? -1 : 0))
@@ -398,44 +677,107 @@
 BINARY_FUNC(FLOORMOD, x - floor(x / y) * y);
 BINARY_FUNC(SquaredDifference, (x-y)*(x-y));
 BINARY_FUNC(POW, pow(x, y));
 BINARY_FUNC(ATAN2, atan2(x, y));
-BINARY_FUNC(MOD, x - x / y);
+BINARY_FUNC(MOD, (x % y));
 BINARY_FUNC(LOGICALOR, (x || y) ? 1 : 0);
 
-void BinaryBlitTemplateFloat(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
+BINARY_FUNC_FLOATMID(ADD, x+y);
+BINARY_FUNC_FLOATMID(SUB, x-y);
+BINARY_FUNC_FLOATMID(MUL, x*y);
+BINARY_FUNC_FLOATMID(DIV, x/y);
+BINARY_FUNC_FLOATMID(REALDIV, (float)sign(y) * x / max(abs(y), 0.0000001));
+BINARY_FUNC_FLOATMID(MINIMUM, min(x, y));
+BINARY_FUNC_FLOATMID(MAXIMUM, max(x, y));
+BINARY_FUNC_FLOATMID(GREATER, x > y ? 1 : 0);
+BINARY_FUNC_FLOATMID(LESS, x < y ? 1 : 0);
+BINARY_FUNC_FLOATMID(LESS_EQUAL, x <= y ? 1 : 0);
+BINARY_FUNC_FLOATMID(GREATER_EQUAL, x >= y ? 1 : 0);
+BINARY_FUNC_FLOATMID(EQUAL, x == y ? 1 : 0);
+BINARY_FUNC_FLOATMID(NOTEQUAL, x != y ? 1 : 0);
+BINARY_FUNC_FLOATMID(FLOORDIV, floor(x / y));
+BINARY_FUNC_FLOATMID(FLOORMOD, x - floor(x / y) * y);
+BINARY_FUNC_FLOATMID(SquaredDifference, (x-y)*(x-y));
+BINARY_FUNC_FLOATMID(POW, pow(x, y));
+BINARY_FUNC_FLOATMID(ATAN2, atan2(x, y));
+BINARY_FUNC_FLOATMID(MOD, fmod(x, y));
+BINARY_FUNC_FLOATMID(LOGICALOR, (x || y) ? 1 : 0);
+
+BINARY_FUNC_FLOATMID4(ADD, x+y);
+BINARY_FUNC_FLOATMID4(SUB, x-y);
+BINARY_FUNC_FLOATMID4(MUL, x*y);
+BINARY_FUNC_FLOATMID4(DIV, x/y);
+BINARY_FUNC_FLOATMID4(REALDIV, (float)sign(y) * x / max(abs(y), 0.0000001));
+BINARY_FUNC_FLOATMID4(MINIMUM, min(x, y));
+BINARY_FUNC_FLOATMID4(MAXIMUM, max(x, y));
+BINARY_FUNC_FLOATMID4(GREATER, x > y ? 1 : 0);
+BINARY_FUNC_FLOATMID4(LESS, x < y ? 1 : 0);
+BINARY_FUNC_FLOATMID4(LESS_EQUAL, x <= y ? 1 : 0);
+BINARY_FUNC_FLOATMID4(GREATER_EQUAL, x >= y ? 1 : 0);
+BINARY_FUNC_FLOATMID4(EQUAL, x == y ? 1 : 0);
+BINARY_FUNC_FLOATMID4(NOTEQUAL, x != y ? 1 : 0);
+BINARY_FUNC_FLOATMID4(FLOORDIV, floor(x / y));
+BINARY_FUNC_FLOATMID4(FLOORMOD, x - floor(x / y) * y);
+BINARY_FUNC_FLOATMID4(SquaredDifference, (x-y)*(x-y));
+BINARY_FUNC_FLOATMID4(POW, pow(x, y));
+BINARY_FUNC_FLOATMID4(ATAN2, atan2(x, y));
+BINARY_FUNC_FLOATMID4(MOD, fmod(x, y));
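Editor's illustration, not part of the patch: the *_FLOATMID variants above load half operands, compute in float, and cast back on store. A scalar reference for the REALDIV expression, and why the float intermediate matters (function name is this sketch's): the 1e-7 guard is below half's smallest normal (about 6.1e-5), so the same guard evaluated in fp16 would round differently.

    #include <algorithm>
    #include <cmath>

    // Scalar reference for REALDIV: sign(y) * x / max(|y|, 1e-7), in float.
    inline float realDiv(float x, float y) {
        float s = (float)((y > 0.f) - (y < 0.f)); // sign(y)
        return s * x / std::max(std::fabs(y), 0.0000001f);
    }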
1 : 0); + +template +void BinaryBlitTemplateFloat(T* output, const T* input, const T* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) { int count = size[0] * size[1] * size[2]; int block_num = runtime->blocks_num(count); int threads_num = runtime->threads_num(); - // TODO: Support FP16 - MNN_ASSERT(bytes==4); #define COMPUTE_FLOAT(TYPE, TOut)\ - if (opType == MNN::BinaryOpOperation_##TYPE ) {\ - Binary##TYPE<<>>((const float*)input, (const float*)(input1), (TOut*)output,\ - size[0], size[1], size[2],\ - srcStride[0], srcStride[1], srcStride[2],\ - srcStride1[0], srcStride1[1], srcStride1[2],\ - dstStride[0], dstStride[1], dstStride[2]);\ - return;\ - }\ + if (opType == MNN::BinaryOpOperation_##TYPE ) {\ + if (size[2] == count) {\ + if(count % 4 == 0 && count > 16384 && srcStride[2] == 1 && srcStride1[2] == 1 && dstStride[2] == 1) {\ + block_num = runtime->blocks_num(count/4);\ + threads_num = runtime->threads_num();\ + if(bytes == 4) {\ + BinaryMidLinear4_##TYPE<<>>((const T*)input, (const T*)(input1), (TOut*)output,\ + count/4);\ + } else {\ + BinaryMidLinearHalf4_##TYPE<<>>((const T*)input, (const T*)(input1), (TOut*)output,\ + count/4);\ + }\ + } else {\ + BinaryMidLinear##TYPE<<>>((const T*)input, (const T*)(input1), (TOut*)output,\ + size[2],\ + srcStride[2],\ + srcStride1[2],\ + dstStride[2]);\ + }\ + } else {\ + BinaryMid##TYPE<<>>((const T*)input, (const T*)(input1), (TOut*)output,\ + size[0], size[1], size[2],\ + srcStride[0], srcStride[1], srcStride[2],\ + srcStride1[0], srcStride1[1], srcStride1[2],\ + dstStride[0], dstStride[1], dstStride[2]);\ + }\ + return;\ + }\ - COMPUTE_FLOAT(ADD, float); - COMPUTE_FLOAT(SUB, float); - COMPUTE_FLOAT(MUL, float); - COMPUTE_FLOAT(DIV, float); - COMPUTE_FLOAT(REALDIV, float); - COMPUTE_FLOAT(MINIMUM, float); - COMPUTE_FLOAT(MAXIMUM, float); + COMPUTE_FLOAT(ADD, T); + COMPUTE_FLOAT(SUB, T); + COMPUTE_FLOAT(MUL, T); + COMPUTE_FLOAT(DIV, T); + COMPUTE_FLOAT(REALDIV, T); + COMPUTE_FLOAT(MINIMUM, T); + COMPUTE_FLOAT(MAXIMUM, T); COMPUTE_FLOAT(GREATER, int); COMPUTE_FLOAT(LESS, int); COMPUTE_FLOAT(LESS_EQUAL, int); COMPUTE_FLOAT(GREATER_EQUAL, int); COMPUTE_FLOAT(EQUAL, int); COMPUTE_FLOAT(NOTEQUAL, int); - COMPUTE_FLOAT(FLOORDIV, float); - COMPUTE_FLOAT(FLOORMOD, float); - COMPUTE_FLOAT(POW, float); - COMPUTE_FLOAT(SquaredDifference, float); - COMPUTE_FLOAT(ATAN2, float); - COMPUTE_FLOAT(MOD, float); + COMPUTE_FLOAT(FLOORDIV, T); + COMPUTE_FLOAT(FLOORMOD, T); + COMPUTE_FLOAT(POW, T); + COMPUTE_FLOAT(SquaredDifference, T); + COMPUTE_FLOAT(ATAN2, T); + COMPUTE_FLOAT(MOD, T); + + #undef COMPUTE_FLOAT } void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) { @@ -472,12 +814,15 @@ void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_ void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType) { if (type.code == halide_type_float) { - BinaryBlitTemplateFloat(output, input, input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType); + if (type.bits == 32) { + BinaryBlitTemplateFloat((float*)output, (float*)input, (float*)input1, size, srcStride, srcStride1, 
dstStride, type.bytes(), runtime, opType);
+        } else if (type.bits == 16) {
+            BinaryBlitTemplateFloat((half*)output, (half*)input, (half*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
+        }
     } else if (type.code == halide_type_int) {
         BinaryBlitTemplateInt32(output, input, input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
     }
 }
-
 }// namespace CUDA
 }// namespace MNN
diff --git a/source/backend/cuda/execution/Raster.cuh b/source/backend/cuda/execution/Raster.cuh
index 701aee72..b03be095 100644
--- a/source/backend/cuda/execution/Raster.cuh
+++ b/source/backend/cuda/execution/Raster.cuh
@@ -6,11 +6,22 @@ namespace MNN {
 namespace CUDA {
     void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime);
     void FuseRasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int fuseNum, void* sliceOffset, int bytes, CUDARuntime* runtime);
-    void PackC4(uint8_t* dest, const uint8_t* src, int inside, int axis, int outside, int bytes, CUDARuntime* runtime);
-    void UnpackC4(uint8_t* dest, const uint8_t* src, int inside, int axis, int outside, int bytes, CUDARuntime* runtime);
     void BlitWithIndice(uint8_t* dest, const uint8_t* src, const int32_t* dstIndices, const int32_t* srcIndices, int dstUseIndice, int srcUseIndice, int loopCount, int dstStep, int srcStep, int srcLimit, const Tensor::InsideDescribe::Region& reg, int bytes, CUDARuntime* runtime);
     void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType);
     void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType);
+
+    // Offset: 8 * fuseNum, first 4 for src: limitX, limitY, limitZ, offset, second 4 for dst
+    struct FuseRegion {
+        int32_t size[3] = {1, 1, 1};
+        int32_t srcStride[3] = {0, 0, 0};
+        int32_t dstStride[3] = {0, 0, 0};
+        int fuseNumber = 0;
+    };
+    void FuseRasterBlitFloatToHalf(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime);
+    void FuseRasterBlitHalfToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime);
+    void FuseRasterBlitFloatToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime);
+    void FuseRasterBlitCommon(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime, int bytes);
+
 }
 }
diff --git a/source/backend/cuda/execution/RasterExecution.cpp b/source/backend/cuda/execution/RasterExecution.cpp
index 92fba702..91946914 100644
--- a/source/backend/cuda/execution/RasterExecution.cpp
+++ b/source/backend/cuda/execution/RasterExecution.cpp
@@ -2,35 +2,305 @@
 //  RasterExecution.cpp
 //  MNN
 //
-//  Created by MNN on 2020/07/30.
+//  Created by MNN on 2020/04/02.
// Copyright © 2018, Alibaba Group Holding Limited // #include "RasterExecution.hpp" -#include "Raster.cuh" -#include "core/Concurrency.h" #include "core/OpCommonUtils.hpp" +#include "core/BufferAllocator.hpp" +#include "Raster.cuh" +#include "Transpose.cuh" +#include "MNNCUDADefine.hpp" namespace MNN { namespace CUDA { -ErrorCode RasterExecution::onResize(const std::vector& inputs, const std::vector& outputs) { +static void getBatchChannelArea(const Tensor* t, int& batch, int& channel, int& area) { + batch = t->batch(); + if (t->dimensions() == 4) { + channel = t->channel(); + area = t->width() * t->height(); + } else if (t->dimensions() == 3) { + auto format = TensorUtils::getDescribe(t)->dimensionFormat; + if (format == MNN_DATA_FORMAT_NHWC) { + channel = t->length(2); + area = t->length(1); + } else { + channel = t->length(1); + area = t->length(2); + } + } else { + auto format = TensorUtils::getDescribe(t)->dimensionFormat; + if (format == MNN_DATA_FORMAT_NHWC) { + for (int i = t->dimensions() - 1; i > 0; i--) { + int len = t->length(i); + if (len > 1) { + if (channel == 1) { + channel = len; + } else { + area *= len; + } + } + } + } else { + for (int i = 1; i < t->dimensions(); i++) { + int len = t->length(i); + if (len > 1) { + if (channel == 1) { + channel = len; + } else { + area *= len; + } + } + } + } + } +} +// Detect if the region is a transpose +static bool _transpose(const Tensor::InsideDescribe::Region& region) { + int srcOne = -1, dstOne = -1; + for (int i = 0; i < 3; i++) { + if (region.src.stride[i] == 1 && region.size[i] != 1) { + if (srcOne >= 0 || region.size[i] < 4) { + return false; + } + srcOne = i; + } + if (region.dst.stride[i] == 1 && region.size[i] != 1) { + if (dstOne >= 0 || region.size[i] < 4) { + return false; + } + dstOne = i; + } + } + return srcOne >= 0 && dstOne >= 0 && srcOne != dstOne; +} + +static int _singleConvert(const Tensor::InsideDescribe::Region& region, const Tensor* dest) { + auto origin = region.origin; + auto srcFormat = TensorUtils::getDescribe(origin)->dimensionFormat; + auto dstFormat = TensorUtils::getDescribe(dest)->dimensionFormat; + if (srcFormat == dstFormat) { + return 0; + } + if (0 != region.src.offset || 0 != region.dst.offset) { + return 0; + } + int dstBatch = 1, dstChannel = 1, dstArea = 1, + srcBatch = 1, srcChannel = 1, srcArea = 1; + getBatchChannelArea(origin, srcBatch, srcChannel, srcArea); + getBatchChannelArea(dest, dstBatch, dstChannel, dstArea); + if (dstBatch != srcBatch) { + return 0; + } + if (dstChannel != srcChannel) { + return 0; + } + if (dstArea != srcArea) { + return 0; + } + auto totalSize = dstBatch * dstChannel * dstArea; + int srcSize = 1; + int dstSize = 1; + int res = 1; + for (int i=0; i<3; ++i) { + if (region.size[i] == 1) { + continue; + } + if (region.src.stride[i] != region.dst.stride[i]) { + if (dstArea == 1) { + // Batch / Channel transpose + return 0; + } + res = 2; + } + srcSize += (region.size[i] - 1) * region.src.stride[i]; + dstSize += (region.size[i] - 1) * region.dst.stride[i]; + } + if (srcSize != totalSize || dstSize != totalSize ) { + return 0; + } + // Check If it can be described as NHWC <-> NC4HW4 transpose + if (2 == res) { + int srcChannelStride; + int dstChannelStride; + int srcAreaStride; + int dstAreaStride; + if (MNN_DATA_FORMAT_NC4HW4 == srcFormat) { + srcChannelStride = srcArea; + srcAreaStride = 1; + dstChannelStride = 1; + dstAreaStride = srcChannel; + } else { + srcChannelStride = 1; + srcAreaStride = srcChannel; + dstAreaStride = 1; + dstChannelStride = srcArea; + } + 
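+        // Illustrative example (values are assumptions, not from the patch):
+        // converting NC4HW4 -> NHWC with batch=1, channel=8, area=4 gives
+        // srcChannelStride=4, srcAreaStride=1, dstChannelStride=1, dstAreaStride=8.
+        // The loop below checks that each region extent (batch, channel, area)
+        // carries exactly that stride pair; any mismatch means the region is not a
+        // pure layout convert, so we return 0 and fall back to the generic blit.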
for (int i=0; i<3; ++i) { + if (region.size[i] == 1) { + continue; + } + if (region.size[i] == dstBatch) { + if (region.src.stride[i] != region.dst.stride[i]) { + return 0; + } + continue; + } + if (region.size[i] == srcChannel) { + if (region.src.stride[i] != srcChannelStride || region.dst.stride[i] != dstChannelStride) { + return 0; + } + } + if (region.size[i] == srcArea) { + if (region.src.stride[i] != srcAreaStride || region.dst.stride[i] != dstAreaStride) { + return 0; + } + } + } + return 2; + } + return 1; +} + +ErrorCode RasterExecution::onResize(const std::vector &inputs, const std::vector &outputs) { MNN_ASSERT(inputs.size() == 1); MNN_ASSERT(outputs.size() == 1); - auto input = inputs[0]; - auto output = outputs[0]; - auto des = TensorUtils::getDescribe(input); + auto input = inputs[0]; + auto output = outputs[0]; + auto des = TensorUtils::getDescribe(input); auto outputDes = TensorUtils::getDescribe(output); - mNeedZero = !TensorUtils::regionIsFull(input); - mTempInputCopy.clear(); - + mNeedZero = !TensorUtils::regionIsFull(input); + mZeroPoint = 0; + mTempInput.clear(); + mFastBlit.clear(); mFuseRaster.first = false; - if(des->regions.size() > 1) { - mFuseRaster.first = true; - mFuseRaster.second = des->regions.size(); - auto& slice0 = des->regions[0]; - for (int i = 1; i < des->regions.size(); ++i) { + mTempOutput = nullptr; + auto midFormat = MNN_DATA_FORMAT_NCHW; + mTempInputCopy.clear(); + mOutputPtr = output; + mFast = false; + int pack = PACK_NUMBER; + // all_srcFormat == dstFormat == NC4HW4 : Fast Exe + if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + mFast = true; + for (int i=0; i< des->regions.size(); ++i) { auto& slice = des->regions[i]; - if (slice0.origin->deviceId() != slice.origin->deviceId()) { + if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) { + mFast = false; + break; + } + if (!OpCommonUtils::canBlitFast(slice, output, pack, true)) { + mFast = false; + break; + } + } + if (mFast) { + for (int i=0; i< des->regions.size(); ++i) { + auto& slice = des->regions[i]; + if (slice.origin == nullptr) { + continue; + } + Tensor::InsideDescribe::Region newRegion; + OpCommonUtils::turnToPackRegion(slice, newRegion, output, pack, true); + mFastBlit.emplace_back(std::make_pair(slice.origin, std::move(newRegion))); + } + return NO_ERROR; + } + } + mSingleConvert = 0; + // srcNum == 1 && srcFormat != dstFormat : Single Convert + if (des->regions.size() == 1) { + mSingleConvert = _singleConvert(des->regions[0], output); + if (mSingleConvert > 0) { + return NO_ERROR; + } + } + // Acquire Buffer for temp output + // TODO: optimize it + if (MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat) { + mTempOutput.reset(new Tensor); + TensorUtils::setupTensorInfo(output, mTempOutput.get(), midFormat); + } + if (nullptr != mTempOutput) { + auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC); + if (!res) { + return OUT_OF_MEMORY; + } + mOutputPtr = mTempOutput.get(); + } + // input is NC4HW4 add Convert + std::vector forRelease; + for (int i=0; i< des->regions.size(); ++i) { + auto& slice = des->regions[i]; + auto origin = slice.origin; + if (slice.mask != 0) { + mTempInputCopy.emplace_back(std::make_pair(origin, &slice)); + continue; + } + // if tensor is not NC4HW4 or has been merged, don't need deal + if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) { + mTempInputCopy.emplace_back(std::make_pair(origin, &slice)); + continue; + } + // if NC4HW4's C%4 == 0, change convert to 
transpose and fuse it + if (origin->batch() == 1 && origin->channel() % pack == 0) { + int channel = origin->channel(); + int area = 1; + // conv3d/pool3d will has 5 dims, area = depth * width * height, otherwise area = width * height + for (int d = 2; d < origin->dimensions(); d++) { + area *= origin->length(d); + } + Tensor::InsideDescribe::Region regionTmp; + regionTmp.src.offset = 0; + regionTmp.src.stride[0] = area * pack; + regionTmp.src.stride[1] = 1; + regionTmp.src.stride[2] = pack; + regionTmp.dst.offset = 0; + regionTmp.dst.stride[0] = area * pack; + regionTmp.dst.stride[1] = area; + regionTmp.dst.stride[2] = 1; + regionTmp.size[0] = channel / pack; + regionTmp.size[1] = pack; + regionTmp.size[2] = area; + regionTmp.origin = slice.origin; + bool merge = TensorUtils::fuseRegion(regionTmp, slice); + if (merge) { + // cache the merged tensor + slice.mask = 1; + mTempInputCopy.emplace_back(std::make_pair(origin, &slice)); + continue; + } + } + auto cache = static_cast(backend())->getCache(); + auto tempTensor = cache->findCacheTensor(origin, midFormat); + if (nullptr == tempTensor) { + std::shared_ptr newTensor(new Tensor); + TensorUtils::copyShape(origin, newTensor.get()); + TensorUtils::getDescribe(newTensor.get())->dimensionFormat = midFormat; + newTensor->buffer().type = origin->getType(); + TensorUtils::setLinearLayout(newTensor.get()); + mTempInput.insert(std::make_pair(origin, newTensor.get())); + auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC); + if (!res) { + return OUT_OF_MEMORY; + } + tempTensor = newTensor.get(); + TensorUtils::getDescribe(tempTensor)->useCount = TensorUtils::getDescribe(origin)->useCount; + cache->pushCacheTensor(newTensor, origin, midFormat); + } + if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) { + forRelease.emplace_back(tempTensor); + } + mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice)); + } + if(mTempInputCopy.size() > 1) { + mFuseRaster.first = true; + mFuseRaster.second = mTempInputCopy.size(); + auto& slice0 = *mTempInputCopy[0].second; + for (int i = 1; i < mTempInputCopy.size(); ++i) { + auto& slice = *mTempInputCopy[i].second; + if (mTempInputCopy[i].first != mTempInputCopy[0].first) { mFuseRaster.first = false; break; } @@ -52,81 +322,141 @@ ErrorCode RasterExecution::onResize(const std::vector& inputs, const st } } } - //mFuseRaster.first = false; - if(!mFuseRaster.first) { - for (int i = 0; i < des->regions.size(); ++i) { - auto& slice = des->regions[i]; - if (nullptr == slice.origin) { - continue; - } - mTempInputCopy.emplace_back(std::make_pair((void*)slice.origin->deviceId(), &slice)); - } - } else { - auto& slice0 = des->regions[0]; - if (nullptr != slice0.origin) { - mTempInputCopy.emplace_back(std::make_pair((void*)slice0.origin->deviceId(), &slice0)); - } - - int regionSize = des->regions.size(); + if(mFuseRaster.first) { + auto& slice0 = *mTempInputCopy[0].second; + auto tensor = mTempInputCopy[0].first; + int regionSize = mTempInputCopy.size(); std::vector temp(2*regionSize, 0); for (int i = 0; i < regionSize; ++i) { - auto& slice = des->regions[i]; + auto& slice = *mTempInputCopy[i].second; temp[i] = slice.src.offset; temp[regionSize+i] = slice.dst.offset; - //printf("%d-", tmpSrc[i]); + //printf("%d-%d-%d\n", regionSize, temp[i], temp[regionSize+i]); } //save srcOffset/dstOffset to Device offsetTensor.reset(Tensor::createDevice({2*regionSize})); backend()->onAcquireBuffer(offsetTensor.get(), Backend::STATIC); mOffset = (void *)offsetTensor.get()->buffer().device; 
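    // Layout of the staged offsets built in `temp` above: entries [0, regionSize)
    // hold each region's src.offset and entries [regionSize, 2*regionSize) the
    // matching dst.offset, so the single host-to-device copy below publishes every
    // fused region's offsets at once for the fused blit to look up per region.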
cuda_check(cudaMemcpy(mOffset, temp.data(), 2*regionSize*sizeof(int32_t), cudaMemcpyHostToDevice)); + mTempInputCopy.clear(); + mTempInputCopy.emplace_back(std::make_pair(tensor, &slice0)); + } + + for (auto t : forRelease) { + backend()->onReleaseBuffer(t, Backend::DYNAMIC); + } + if (nullptr != mTempOutput) { + backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC); } return NO_ERROR; } -ErrorCode RasterExecution::onExecute(const std::vector& inputs, const std::vector& outputs) { +void RasterExecution::executeFaster(const std::vector &inputs, const std::vector &outputs) const { + auto bn = static_cast(backend()); + auto input = inputs[0]; + auto output = outputs[0]; + auto bytes = bn->getBytes(output); auto runtime = static_cast(backend())->getCUDARuntime(); - auto input = inputs[0]; - auto output = outputs[0]; - auto bytes = input->getType().bytes(); if (mNeedZero) { - runtime->memset((void*)output->deviceId(), 0, output->size()); + auto size = static_cast(backend())->realSize(output) * bytes; + cudaMemset((uint8_t*)output->deviceId(), 0, size); + } + // Use mFastBlit + for (auto& iter : mFastBlit) { + auto srcPtr = (uint8_t*)iter.first->deviceId() + iter.second.src.offset * bytes; + auto dstPtr = (uint8_t*)output->deviceId() + iter.second.dst.offset * bytes; + RasterBlit(dstPtr, srcPtr, iter.second.size, iter.second.src.stride, iter.second.dst.stride, bytes * PACK_NUMBER, runtime); + } +} + + +ErrorCode RasterExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { + if (mFast) { + executeFaster(inputs, outputs); + return NO_ERROR; + } + auto bn = static_cast(backend()); + auto input = inputs[0]; + auto output = outputs[0]; + auto bytes = bn->getBytes(output); + auto runtime = static_cast(backend())->getCUDARuntime(); + if (mSingleConvert > 0) { + auto realInput = TensorUtils::getDescribe(input)->regions[0].origin; + int srcBatch = 1, srcChannel = 1, srcArea = 1; + getBatchChannelArea(realInput, srcBatch, srcChannel, srcArea); + auto sourceFormat = TensorUtils::getDescribe(realInput)->dimensionFormat; + auto destFormat = TensorUtils::getDescribe(output)->dimensionFormat; + int batchStride = srcChannel * srcArea * bytes; + int inputBatchStride = batchStride; + int outputBatchStride = batchStride; + PackInfo pack; + pack.inside = srcArea; + pack.axis = srcChannel; + pack.unit = PACK_NUMBER; + pack.outside = srcBatch; + if (mSingleConvert == 1) { + pack.axisStride = srcArea; + pack.insideStride = 1; + } else if (mSingleConvert == 2) { + pack.axisStride = 1; + pack.insideStride = srcChannel; + } + auto srcPtr = (void*)realInput->deviceId(); + auto dstPtr = (void*)output->deviceId(); + if (MNN_DATA_FORMAT_NC4HW4 == sourceFormat) { + if (realInput->dimensions() <= 1) { + cudaMemcpy(dstPtr, srcPtr, bn->realSize(realInput) * bytes, cudaMemcpyDeviceToDevice); + return NO_ERROR; + } + UnpackBuffer(dstPtr, srcPtr, &pack, bytes, runtime); + } else { + if (output->dimensions() <= 1) { + cudaMemcpy(dstPtr, srcPtr, bn->realSize(realInput) * bytes, cudaMemcpyDeviceToDevice); + return NO_ERROR; + } + PackBuffer(dstPtr, srcPtr, &pack, bytes, runtime); + } + return NO_ERROR; + } + if (mNeedZero) { + auto size = static_cast(backend())->realSize(mOutputPtr) * bytes; + cudaMemset((uint8_t*)mOutputPtr->deviceId(), 0, size); + } + for (auto& iter : mTempInput) { + backend()->onCopyBuffer(iter.first, iter.second); } if(mFuseRaster.first) { MNN_ASSERT(mTempInputCopy.size() == 1); auto& iter = mTempInputCopy[0]; auto& slice = *(iter.second); - auto srcPtr = (uint8_t*)iter.first; - 
auto dstPtr = (uint8_t*)output->deviceId();
+        auto srcPtr = (uint8_t*)iter.first->deviceId();
+        auto dstPtr = (uint8_t*)mOutputPtr->deviceId();
         //printf("fuseRaster:%p-%p\n", mSrcOffset, mDstOffset);
         FuseRasterBlit(dstPtr, srcPtr, slice.size, slice.src.stride, slice.dst.stride, mFuseRaster.second, mOffset, bytes, runtime);
-        return NO_ERROR;
+    } else {
+        for (auto& iter : mTempInputCopy) {
+            auto srcPtr = (uint8_t*)iter.first->deviceId() + iter.second->src.offset * bytes;
+            auto dstPtr = (uint8_t*)mOutputPtr->deviceId() + iter.second->dst.offset * bytes;
+            RasterBlit(dstPtr, srcPtr, iter.second->size, iter.second->src.stride, iter.second->dst.stride, bytes, runtime);
+        }
     }
-    for (int u = 0; u < mTempInputCopy.size(); ++u) {
-        auto& iter = mTempInputCopy[u];
-        auto& slice = *(iter.second);
-        auto srcPtr = (uint8_t*)iter.first + slice.src.offset * bytes;
-        auto dstPtr = (uint8_t*)output->deviceId() + slice.dst.offset * bytes;
-        RasterBlit(dstPtr, srcPtr, slice.size, slice.src.stride, slice.dst.stride, bytes, runtime);
+
+    if (nullptr != mTempOutput) {
+        backend()->onCopyBuffer(mTempOutput.get(), output);
     }
     return NO_ERROR;
 }
-RasterExecution::RasterExecution(Backend* backend) : Execution(backend) {
-    // Do nothing
-}
-RasterExecution::~RasterExecution() {
-    // Do nothing
-}
-class RasterCreator : public CUDABackend::Creator {
+class RasterExecutionFactory : public CUDABackend::Creator {
 public:
     virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                 const MNN::Op* op, Backend* backend) const override {
         return new RasterExecution(backend);
     }
 };
-static CUDACreatorRegister<RasterCreator> __init(OpType_Raster);
-} // namespace CUDA
-} // namespace MNN
\ No newline at end of file
+static CUDACreatorRegister<RasterExecutionFactory> __init(OpType_Raster);
+
+}
+}
\ No newline at end of file
diff --git a/source/backend/cuda/execution/RasterExecution.hpp b/source/backend/cuda/execution/RasterExecution.hpp
index 5ef27c49..ed464b40 100644
--- a/source/backend/cuda/execution/RasterExecution.hpp
+++ b/source/backend/cuda/execution/RasterExecution.hpp
@@ -2,37 +2,43 @@
 //  RasterExecution.hpp
 //  MNN
 //
-//  Created by MNN on 2020/07/30.
+//  Created by MNN on 2020/04/02.
// Copyright © 2018, Alibaba Group Holding Limited // - #ifndef RasterExecution_hpp #define RasterExecution_hpp -#include -#include -#include #include "backend/cuda/core/CUDABackend.hpp" -#include "core/Execution.hpp" +#include +#include #include "core/TensorUtils.hpp" - namespace MNN { namespace CUDA { class RasterExecution : public Execution { public: - RasterExecution(Backend *backend); - virtual ~RasterExecution(); + RasterExecution(Backend* bn) : Execution(bn) { + // Do nothing + } + virtual ~ RasterExecution() { + // Do nothing + } + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - + void executeFaster(const std::vector &inputs, const std::vector &outputs) const; private: - std::vector> mTempInputCopy; + std::map mTempInput; + std::vector> mTempInputCopy; + std::vector> mFastBlit; + std::shared_ptr mTempOutput; + Tensor* mOutputPtr; bool mNeedZero = false; + bool mFast = false; + int mSingleConvert = 0; + int32_t mZeroPoint = 0; std::pair mFuseRaster; - void *mOffset; std::shared_ptr offsetTensor; }; -} // namespace CUDA -} // namespace MNN - +} +} #endif diff --git a/source/backend/cuda/execution/ReductionExecution.cu b/source/backend/cuda/execution/ReductionExecution.cu index 6d895a70..75ffb0fa 100755 --- a/source/backend/cuda/execution/ReductionExecution.cu +++ b/source/backend/cuda/execution/ReductionExecution.cu @@ -1,99 +1,19 @@ #include "ReductionExecution.hpp" - namespace MNN { namespace CUDA { ReductionExecution::ReductionExecution(ReductionType opType, int axis, Backend *backend) : Execution(backend) { mType = opType; mAxis = axis; + auto staticPool = static_cast(backend)->getStaticBufferPool(); + mParam = staticPool->alloc(sizeof(ReduceParam)); } ReductionExecution::~ ReductionExecution() { - // Do nothing + auto staticPool = static_cast(backend())->getStaticBufferPool(); + staticPool->free(mParam); } -template -__global__ void SUM(const T *input, T *output, int inside, int axis, int outside) { - int count = inside * outside; - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - int y = i / inside; - int x = i % inside; - T sumValue = (T)0; - const T* basicInput = input + y * axis * inside + x; - for (int v=0; v -__global__ void MEAN(const T *input, T *output, int inside, int axis, int outside) { - int count = inside * outside; - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - int y = i / inside; - int x = i % inside; - T sumValue = (T)0; - const T* basicInput = input + y * axis * inside + x; - for (int v=0; v -__global__ void MINIMUM(const T *input, T *output, int inside, int axis, int outside) { - int count = inside * outside; - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - int y = i / inside; - int x = i % inside; - const T* basicInput = input + y * axis * inside + x; - T res = basicInput[0]; - for (int v=1; v -__global__ void MAXIMUM(const T *input, T *output, int inside, int axis, int outside) { - int count = inside * outside; - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - int y = i / inside; - int x = i % inside; - const T* basicInput = input + y * axis * inside + x; - T res = basicInput[0]; - for (int v=1; v -__global__ void PROD(const T *input, T *output, int inside, int axis, int outside) { - int count = inside * 
outside;
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-        int y = i / inside;
-        int x = i % inside;
-        const T* basicInput = input + y * axis * inside + x;
-        T res = basicInput[0];
-        for (int v=1; v<axis; ++v) {
-            res = res * basicInput[v * inside];
-        }
-        output[y * inside + x] = res;
-    }
-}
-
-ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
-    auto input = (void*)inputs[0]->deviceId();
-    auto output = (void*)outputs[0]->deviceId();
+ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
     auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
     int inside = 1;
     int outside = 1;
@@ -104,52 +24,88 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
     for (int i=mAxis+1; i<inputs[0]->dimensions(); ++i) {
         inside *= inputs[0]->length(i);
     }
+    mCpuParam.inside = inside;
+    mCpuParam.outside = outside;
+    mCpuParam.axis = axis;
+    cuda_check(cudaMemcpy((uint8_t*)mParam.first + mParam.second, &mCpuParam, sizeof(ReduceParam), cudaMemcpyHostToDevice));
+
+    return NO_ERROR;
+}
+
+ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
+    auto input = (void*)inputs[0]->deviceId();
+    auto output = (void*)outputs[0]->deviceId();
+    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
+    int inside = mCpuParam.inside;
+    int outside = mCpuParam.outside;
     int count = inside * outside;
     int block_num = runtime->blocks_num(count);
     int threads_num = runtime->threads_num();
+    auto param = (ReduceParam*)((uint8_t*)mParam.first + mParam.second);
     if (inputs[0]->getType() == halide_type_of<float>()) {
-        switch (mType) {
-            case ReductionType_MEAN:
-                MEAN<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
-                return NO_ERROR;
-            case ReductionType_SUM:
-                SUM<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
-                return NO_ERROR;
-            case ReductionType_MINIMUM:
-                MINIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
-                return NO_ERROR;
-            case ReductionType_MAXIMUM:
-                MAXIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
-                return NO_ERROR;
-            case ReductionType_PROD:
-                PROD<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
-                return NO_ERROR;
+        if (static_cast<CUDABackend*>(backend())->useFp16()) {
+            switch (mType) {
+                case ReductionType_MEAN:
+                    MEAN<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
+                    return NO_ERROR;
+                case ReductionType_SUM:
+                    SUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
+                    return NO_ERROR;
+                case ReductionType_MINIMUM:
+                    MINIMUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
+                    return NO_ERROR;
+                case ReductionType_MAXIMUM:
+                    MAXIMUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
+                    return NO_ERROR;
+                case ReductionType_PROD:
+                    PROD<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
+                    return NO_ERROR;
+            }
+        } else {
+            switch (mType) {
+                case ReductionType_MEAN:
+                    MEAN<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
+                    return NO_ERROR;
+                case ReductionType_SUM:
+                    SUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
+                    return NO_ERROR;
+                case ReductionType_MINIMUM:
+                    MINIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
+                    return NO_ERROR;
+                case ReductionType_MAXIMUM:
+                    MAXIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
+                    return NO_ERROR;
+                case ReductionType_PROD:
+                    PROD<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
+                    return NO_ERROR;
+            }
         }
         MNN_ASSERT(false);
         return NOT_SUPPORT;
     }
+    MNN_ASSERT(inputs[0]->getType() == halide_type_of<int32_t>());
     switch (mType) {
         case ReductionType_MEAN:
-            MEAN<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
+            MEAN<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
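+            // Kernel launches here are asynchronous; `param` points at device
+            // memory from the static buffer pool, filled once in onResize, so it
+            // stays valid for the lifetime of these launches.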
return NO_ERROR; case ReductionType_SUM: - SUM<<>>((const int32_t*)input, (int32_t*)output, inside, axis, outside); + SUM<<>>((const int32_t*)input, (int32_t*)output, param); return NO_ERROR; case ReductionType_MINIMUM: - MINIMUM<<>>((const int32_t*)input, (int32_t*)output, inside, axis, outside); + MINIMUM<<>>((const int32_t*)input, (int32_t*)output, param); return NO_ERROR; case ReductionType_MAXIMUM: - MAXIMUM<<>>((const int32_t*)input, (int32_t*)output, inside, axis, outside); + MAXIMUM<<>>((const int32_t*)input, (int32_t*)output, param); return NO_ERROR; case ReductionType_PROD: - PROD<<>>((const int32_t*)input, (int32_t*)output, inside, axis, outside); + PROD<<>>((const int32_t*)input, (int32_t*)output, param); return NO_ERROR; case ReductionType_ANY: - MAXIMUM<<>>((const int32_t*)input, (int32_t*)output, inside, axis, outside); + MAXIMUM<<>>((const int32_t*)input, (int32_t*)output, param); return NO_ERROR; case ReductionType_ALL: - MINIMUM<<>>((const int32_t*)input, (int32_t*)output, inside, axis, outside); + MINIMUM<<>>((const int32_t*)input, (int32_t*)output, param); return NO_ERROR; } MNN_ASSERT(false); diff --git a/source/backend/cuda/execution/ReductionExecution.hpp b/source/backend/cuda/execution/ReductionExecution.hpp index a9699de7..a281e9ee 100644 --- a/source/backend/cuda/execution/ReductionExecution.hpp +++ b/source/backend/cuda/execution/ReductionExecution.hpp @@ -11,6 +11,7 @@ #include #include "backend/cuda/core/CUDABackend.hpp" #include "core/Execution.hpp" +#include "ReductionTemplate.cuh" namespace MNN { namespace CUDA { class ReductionExecution : public Execution { @@ -18,10 +19,13 @@ public: ReductionExecution(ReductionType opType, int axis, Backend *backend); virtual ~ReductionExecution(); virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; private: ReductionType mType; int mAxis; + ReduceParam mCpuParam; + std::pair mParam; }; } // namespace CUDA } // namespace MNN diff --git a/source/backend/cuda/execution/ReductionTemplate.cuh b/source/backend/cuda/execution/ReductionTemplate.cuh new file mode 100644 index 00000000..3586e83c --- /dev/null +++ b/source/backend/cuda/execution/ReductionTemplate.cuh @@ -0,0 +1,93 @@ +#ifndef ReductionTemplate_cuh +#define ReductionTemplate_cuh +struct ReduceParam { + int inside; + int axis; + int outside; +}; +template +__global__ void SUM(const T *input, T *output, const ReduceParam* param) { + int count = param->inside * param->outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int y = i / param->inside; + int x = i % param->inside; + float sumValue = 0.0; + int axis = param->axis; + const T* basicInput = input + y * param->axis * param->inside + x; + for (int v=0; vinside]; + } + output[y * param->inside + x] = (T)sumValue; + } + return; +} + +template +__global__ void MEAN(const T *input, T *output, const ReduceParam* param) { + int count = param->inside * param->outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int y = i / param->inside; + int x = i % param->inside; + float sumValue = 0.0; + int axis = param->axis; + const T* basicInput = input + y * param->axis * param->inside + x; + for (int v=0; vinside]; + } + output[y * param->inside + x] = (T)(sumValue / (float)param->axis); + } + return; +} + +template +__global__ void MINIMUM(const T *input, T *output, const 
ReduceParam* param) { + int count = param->inside * param->outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int y = i / param->inside; + int x = i % param->inside; + int axis = param->axis; + const T* basicInput = input + y * param->axis * param->inside + x; + float res = (float)basicInput[0]; + for (int v=1; vinside], res); + } + output[y * param->inside + x] = (T)res; + } + return; +} + +template +__global__ void MAXIMUM(const T *input, T *output, const ReduceParam* param) { + int count = param->inside * param->outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int y = i / param->inside; + int x = i % param->inside; + const T* basicInput = input + y * param->axis * param->inside + x; + int axis = param->axis; + float res = (float)basicInput[0]; + for (int v=1; vinside], res); + } + output[y * param->inside + x] = (T)res; + } + return; +} + +template +__global__ void PROD(const T *input, T *output, const ReduceParam* param) { + int count = param->inside * param->outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int y = i / param->inside; + int x = i % param->inside; + int axis = param->axis; + float sumValue = 1.0; + const T* basicInput = input + y * param->axis * param->inside + x; + for (int v=0; vinside]; + } + output[y * param->inside + x] = (T)sumValue; + } + return; +} + +#endif \ No newline at end of file diff --git a/source/backend/cuda/execution/ScaleExecution.cu b/source/backend/cuda/execution/ScaleExecution.cu index e90ba55d..a0eb25a0 100644 --- a/source/backend/cuda/execution/ScaleExecution.cu +++ b/source/backend/cuda/execution/ScaleExecution.cu @@ -1,4 +1,5 @@ #include "ScaleExecution.hpp" +#include "MNNCUDADefine.hpp" namespace MNN { namespace CUDA { @@ -6,61 +7,50 @@ namespace CUDA { template __global__ void SCALE(const int n, const int channels, const int dim, const T* in, T* out, - const T* scaleData, const T* biasData) { - CUDA_KERNEL_LOOP(index, n) { - int c = (index / dim) % channels; - out[index] = in[index] * scaleData[c] + biasData[c]; + const float* scaleData, const float* biasData) { + CUDA_KERNEL_LOOP(count, n) { + int index = count / PACK_NUMBER; + int r = count % PACK_NUMBER; + int c = (index / dim) * PACK_NUMBER + r; + out[count] = (T)((float)in[count] * scaleData[c] + biasData[c]); } } ScaleExecution::ScaleExecution(const Scale* scale, Backend *backend) : Execution(backend) { - mChannel = scale->scaleData()->size(); - - scaleTensor.reset(Tensor::createDevice({mChannel})); - backend->onAcquireBuffer(scaleTensor.get(), Backend::STATIC); - mDeviceScale = (void *)scaleTensor.get()->buffer().device; - - biasTensor.reset(Tensor::createDevice({mChannel})); - backend->onAcquireBuffer(biasTensor.get(), Backend::STATIC); - mDeviceBias = (void *)biasTensor.get()->buffer().device; - - MNN_ASSERT(nullptr != mDeviceScale); - MNN_ASSERT(nullptr != mDeviceBias); + int channel = scale->scaleData()->size(); + mChannel = UP_DIV(channel, PACK_NUMBER); + auto scaleBiasStorageSize = 2 * mChannel * PACK_NUMBER * sizeof(float); + auto staticPool = static_cast(backend)->getStaticBufferPool(); + mScaleBiasStorage = staticPool->alloc(scaleBiasStorageSize); + mDeviceScale = (uint8_t*)mScaleBiasStorage.first + mScaleBiasStorage.second; + mDeviceBias = (uint8_t*)mDeviceScale + scaleBiasStorageSize / 2; + cudaMemset(mDeviceScale, 0, scaleBiasStorageSize); { auto alphaData = scale->scaleData()->data(); - 
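+    // Packed layout (inferred from the sizes above, not stated in the patch):
+    // one static-pool block of 2 * mChannel * PACK_NUMBER floats holds scale
+    // first, then bias. E.g. channel = 3 with PACK_NUMBER = 16 gives mChannel = 1,
+    // i.e. 16 floats per half; the cudaMemset above zero-fills the whole block so
+    // padded channels multiply by zero and add a zero bias.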
cudaMemcpy(mDeviceScale, alphaData, mChannel * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(mDeviceScale, alphaData, channel * sizeof(float), cudaMemcpyHostToDevice); } { auto biasData = scale->biasData()->data(); if (nullptr != biasData) { - MNN_ASSERT(mChannel == scale->biasData()->size()); - cudaMemcpy(mDeviceBias, biasData, mChannel * sizeof(float), cudaMemcpyHostToDevice); - } else { - cudaMemset(mDeviceBias, 0, mChannel * sizeof(float)); + cudaMemcpy(mDeviceBias, biasData, channel * sizeof(float), cudaMemcpyHostToDevice); } } } ScaleExecution::~ScaleExecution() { - if (nullptr != scaleTensor) { - backend()->onReleaseBuffer(scaleTensor.get(), Backend::STATIC); - } - if (nullptr != biasTensor) { - backend()->onReleaseBuffer(biasTensor.get(), Backend::STATIC); - } + auto staticPool = static_cast(backend())->getStaticBufferPool(); + staticPool->free(mScaleBiasStorage); } ErrorCode ScaleExecution::onResize(const std::vector &inputs, const std::vector &outputs) { MNN_ASSERT(inputs.size() == 1); MNN_ASSERT(outputs.size() == 1); auto input = inputs[0]; - mBatch = input->length(0); - MNN_ASSERT(mChannel == input->length(1)); MNN_ASSERT(input->dimensions() >= 2); - mArea = 1; + mArea = input->length(0); for (int i = 2; i < input->dimensions(); ++i) { mArea *= input->length(i); } - mCount = mBatch*mChannel*mArea; + mCount = mChannel*mArea*PACK_NUMBER; //printf("mBatch:%d- mChannel:%d- mArea:%d- mCount:%d\n", mBatch,mChannel,mArea, mCount); return NO_ERROR; } @@ -72,9 +62,13 @@ ErrorCode ScaleExecution::onExecute(const std::vector &inputs, const s int threads_num = runtime->threads_num(); auto input_addr = (void*)inputs[0]->deviceId(); auto output_addr = (void*)outputs[0]->deviceId(); - + if (static_cast(backend())->useFp16()) { + SCALE<<>>(mCount, mChannel, mArea, (const half *)input_addr, (half *)output_addr, + (const float *)mDeviceScale, (const float *)mDeviceBias); + return NO_ERROR; + } SCALE<<>>(mCount, mChannel, mArea, (const float *)input_addr, (float *)output_addr, - (const float *)mDeviceScale, (const float *)mDeviceBias); + (const float *)mDeviceScale, (const float *)mDeviceBias); return NO_ERROR; } diff --git a/source/backend/cuda/execution/ScaleExecution.hpp b/source/backend/cuda/execution/ScaleExecution.hpp index 6b0168b2..f9bd829a 100644 --- a/source/backend/cuda/execution/ScaleExecution.hpp +++ b/source/backend/cuda/execution/ScaleExecution.hpp @@ -30,13 +30,9 @@ private: void *mDeviceBias = nullptr; void *mDeviceScale = nullptr; int mCount; - int mBatch; int mChannel; int mArea; - - std::shared_ptr scaleTensor; - std::shared_ptr biasTensor; - + std::pair mScaleBiasStorage; }; } // namespace CUDA diff --git a/source/backend/cuda/execution/SelectExecution.cu b/source/backend/cuda/execution/SelectExecution.cu index daa03687..6e5e47e5 100644 --- a/source/backend/cuda/execution/SelectExecution.cu +++ b/source/backend/cuda/execution/SelectExecution.cu @@ -41,8 +41,11 @@ ErrorCode SelectExecution::onExecute(const std::vector& inputs, const s auto count = CUDABackend::realSize(inputs[0]); int block_num = runtime->blocks_num(count); int threads_num = runtime->threads_num(); - SELECT<<>>(count, (const int*)(inputs[0]->deviceId()), (const int*)(inputs[1]->deviceId()), (const int*)(inputs[2]->deviceId()), (int*)outputs[0]->deviceId()); - + if (static_cast(backend())->useFp16()) { + SELECT<<>>(count, (const int*)(inputs[0]->deviceId()), (const half*)(inputs[1]->deviceId()), (const half*)(inputs[2]->deviceId()), (half*)outputs[0]->deviceId()); + } else { + SELECT<<>>(count, (const 
int*)(inputs[0]->deviceId()), (const float*)(inputs[1]->deviceId()), (const float*)(inputs[2]->deviceId()), (float*)outputs[0]->deviceId()); + } #ifdef LOG_VERBOSE MNN_PRINT("end SelectExecution onExecute..."); #endif diff --git a/source/backend/cuda/execution/SoftmaxExecution.cu b/source/backend/cuda/execution/SoftmaxExecution.cu index b57957ac..e55149ef 100644 --- a/source/backend/cuda/execution/SoftmaxExecution.cu +++ b/source/backend/cuda/execution/SoftmaxExecution.cu @@ -1,44 +1,120 @@ #include "SoftmaxExecution.hpp" - +#include "core/TensorUtils.hpp" namespace MNN { namespace CUDA { +template +__global__ void SOFTMAX(const T *input, T *output, const ReduceParam* param) { + int inside = param->inside; + int axis = param->axis; + int outside = param->outside; + int count = inside * outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int y = i / inside; + int x = i % inside; + const T* src = input + y * axis * inside + x; + T* dst = output + y * axis * inside + x; + float maxValue = (float)src[0]; + for (int z=1; z +__global__ void EXPSUB(const T *input, const T* maxV, T *output, const ReduceParam* param) { + int inside = param->inside; + int axis = param->axis; + int outside = param->outside; + int count = inside * axis * outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int tmp = i / inside; + int x = i % inside; + int y = tmp / axis; + int c = tmp % axis; + float sumValue = 0.0; + const float basicInput = input[i]; + const float maxValue = maxV[x + y * inside]; + output[i] = (T)(exp(basicInput - maxValue)); + } + return; +} + +template +__global__ void DIVSUM(const T *input, const T* maxV, T *output, const ReduceParam* param) { + int inside = param->inside; + int axis = param->axis; + int outside = param->outside; + int count = inside * axis * outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int tmp = i / inside; + int x = i % inside; + int y = tmp / axis; + int c = tmp % axis; + float sumValue = 0.0; + const float basicInput = input[i]; + const float value = maxV[x + y * inside]; + output[i] = (T)(basicInput / value); + } + return; +} SoftmaxExecution::SoftmaxExecution(int axis, Backend *backend) : Execution(backend) { - auto runtime = static_cast(backend)->getCUDARuntime(); - cudnn_handle_ = runtime->cudnn_handle(); - - cudnn_check(cudnnCreateTensorDescriptor(&input_desc_)); - cudnn_check(cudnnCreateTensorDescriptor(&output_desc_)); - - cudnn_data_type_ = CUDNN_DATA_FLOAT; mAxis = axis; + auto staticPool = static_cast(backend)->getStaticBufferPool(); + mParam = staticPool->alloc(sizeof(ReduceParam)); } SoftmaxExecution::~SoftmaxExecution() { - cudnnDestroyTensorDescriptor(input_desc_); - cudnnDestroyTensorDescriptor(output_desc_); + auto staticPool = static_cast(backend())->getStaticBufferPool(); + staticPool->free(mParam); } ErrorCode SoftmaxExecution::onResize(const std::vector &inputs, const std::vector &outputs) { - inside = 1; - outside = 1; - if(mAxis < 0) { - mAxis += inputs[0]->dimensions(); - } - axis = inputs[0]->length(mAxis); - for (int i=0; ilength(i); - } - for (int i=mAxis+1; idimensions(); ++i) { - inside *= inputs[0]->length(i); + auto input = inputs[0]; + const int dimensions = input->buffer().dimensions; + int axis = mAxis; + if (axis < 0) { + axis += dimensions; } - std::vector tensor_shape = {outside, axis, inside, 1}; - cudnn_check(cudnnSetTensor4dDescriptor(input_desc_, 
CUDNN_TENSOR_NCHW, cudnn_data_type_, tensor_shape[0], - tensor_shape[1], tensor_shape[2], tensor_shape[3])); + const auto layout = TensorUtils::getDescribe(input)->dimensionFormat; + mNeedUnpackC4 = layout == MNN_DATA_FORMAT_NC4HW4; + if (mNeedUnpackC4) { + for (int i=0; i < dimensions; ++i) { + mStorage.buffer().dim[i].extent = input->length(i); + } + TensorUtils::getDescribe(&mStorage)->dimensionFormat = MNN_DATA_FORMAT_NCHW; + mStorage.buffer().dimensions = dimensions; + mStorage.buffer().type = input->getType(); + backend()->onAcquireBuffer(&mStorage, Backend::DYNAMIC); + } - cudnn_check(cudnnSetTensor4dDescriptor(output_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, tensor_shape[0], - tensor_shape[1], tensor_shape[2], tensor_shape[3])); + int inside = 1; + int outside = 1; + int dims = input->buffer().dimensions; + for (int i = 0; i < axis; ++i) { + outside *= input->length(i); + } + for (int i = axis + 1; i < dims; ++i) { + inside *= input->length(i); + } + + if (mNeedUnpackC4) { + backend()->onReleaseBuffer(&mStorage, Backend::DYNAMIC); + } + + mCpuParam.inside = inside; + mCpuParam.outside = outside; + mCpuParam.axis = input->length(axis); + cuda_check(cudaMemcpy((uint8_t*)mParam.first + mParam.second, &mCpuParam, sizeof(ReduceParam), cudaMemcpyHostToDevice)); return NO_ERROR; } @@ -46,15 +122,28 @@ ErrorCode SoftmaxExecution::onResize(const std::vector &inputs, const ErrorCode SoftmaxExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { auto input = (void*)inputs[0]->deviceId(); auto output = (void*)outputs[0]->deviceId(); - - const float alpha = 1; - const float beta = 0; - cudnn_check(cudnnSoftmaxForward(cudnn_handle_, CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - input_desc_, input, - &beta, - output_desc_, output)); + auto dst = output; + auto param = (ReduceParam*)((uint8_t*)mParam.first + mParam.second); + if (mNeedUnpackC4) { + backend()->onCopyBuffer(inputs[0], &mStorage); + input = (void*)mStorage.deviceId(); + dst = (void*)mStorage.deviceId(); + } + auto runtime = static_cast(backend())->getCUDARuntime(); + int inside = mCpuParam.inside; + int outside = mCpuParam.outside; + int axis = mCpuParam.axis; + int count = inside * outside; + int block_num = runtime->blocks_num(count); + int threads_num = runtime->threads_num(); + if (static_cast(backend())->useFp16()) { + SOFTMAX<<>>((const half*)input, (half*)dst, param); + } else { + SOFTMAX<<>>((const float*)input, (float*)dst, param); + } + if (mNeedUnpackC4) { + backend()->onCopyBuffer(&mStorage, outputs[0]); + } return NO_ERROR; } diff --git a/source/backend/cuda/execution/SoftmaxExecution.hpp b/source/backend/cuda/execution/SoftmaxExecution.hpp index df0661d7..40876d44 100644 --- a/source/backend/cuda/execution/SoftmaxExecution.hpp +++ b/source/backend/cuda/execution/SoftmaxExecution.hpp @@ -9,11 +9,9 @@ #ifndef SoftmaxExecution_hpp #define SoftmaxExecution_hpp -#include "core/Execution.hpp" - #include +#include "ReductionTemplate.cuh" #include "backend/cuda/core/CUDABackend.hpp" - namespace MNN { namespace CUDA { @@ -26,15 +24,11 @@ public: virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; private: - cudnnHandle_t cudnn_handle_; - cudnnTensorDescriptor_t input_desc_; - cudnnTensorDescriptor_t output_desc_; - cudnnDataType_t cudnn_data_type_; - int mAxis; - int axis; - int inside; - int outside; + Tensor mStorage; + bool mNeedUnpackC4; + ReduceParam mCpuParam; + std::pair mParam; }; } // namespace CUDA diff --git 
a/source/backend/cuda/execution/TensorCoreGemm.cu b/source/backend/cuda/execution/TensorCoreGemm.cu index 4974e3a4..4b167670 100644 --- a/source/backend/cuda/execution/TensorCoreGemm.cu +++ b/source/backend/cuda/execution/TensorCoreGemm.cu @@ -3,6 +3,7 @@ #include #include #include "TensorCoreGemm.cuh" +#include "MNNCUDAFunction.cuh" #define BLOCK_ROW_WARPS 2 #define BLOCK_COL_WARPS 4 @@ -12,127 +13,237 @@ #define BLOCK_ROW_TILES (WARP_ROW_TILES * BLOCK_ROW_WARPS) #define BLOCK_COL_TILES (WARP_COL_TILES * BLOCK_COL_WARPS) -#define CHUNK_K 4 +#define CHUNK_L 4 +#define CHUNK_E 4 +#define CHUNK_H 4 +#define PACK_NUMBER 16 +#define PACK_NUMBER_C2 (PACK_NUMBER/2) using namespace nvcuda; namespace MNN { namespace CUDA { -__global__ void GemmPrearrange(const MatMulParam* param, - const float* A, - __half* AP, - const float* B, - __half* BP +template +__global__ void GemmPrearrange(MatMulParam paramV, + const T* OA, + __half* OAP, + const T* OB, + __half* OBP, + DivModFast lA ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int b = blockIdx.x; + auto param = ¶mV; + int lAlign = param->elhPack[1] * 16; + int eAlign = param->elhPack[0] * 16; + int hAlign = param->elhPack[2] * 16; + __half* BP = OBP + b * param->elhPack[1] * param->elhPack[2] * 16 * 16; + __half* AP = OAP + b * param->elhPack[1] * param->elhPack[0] * 16 * 16; + const T* A = OA + b * param->elh[0] * param->elh[1]; + const T* B = OB + b * param->elh[2] * param->elh[1]; + int mc = param->elhPack[0] * param->elhPack[1] * 256; int e = param->elh[0]; int l = param->elh[1]; int h = param->elh[2]; - int lIndex = i % l; - int oIndex = i / l; - int lU = lIndex / 16; - int lR = lIndex % 16; - int eU = oIndex / 16; - int eR = oIndex % 16; + for (size_t index = threadIdx.x; index < mc && OA != nullptr; index += blockDim.x) { + int lIndex, oIndex; + lA.divmod(index, oIndex, lIndex); - if (i < e * l) { - float value = A[oIndex * param->aStride[0] + lIndex * param->aStride[1]]; - __half* dst = AP + eU * param->elhPack[1] * 16 * 16 + lU * 16 * 16 + lR + eR * 16; - dst[0] = value; + half value = 0.0; + if (oIndex < e && lIndex < l) { + value = A[oIndex * param->aStride[0] + lIndex * param->aStride[1]]; + } + AP[index] = value; } - if (i < h * l) { - float value = B[oIndex * param->bStride[2] + lIndex * param->bStride[1]]; - int hU = eU; - int hR = eR; - __half* dst = BP + hU * param->elhPack[1] * 16 * 16 + lU * 16 * 16 + lR + hR * 16; - dst[0] = value; + mc = param->elhPack[2] * param->elhPack[1] * 256; + for (size_t index = threadIdx.x; index < mc && OB != nullptr; index += blockDim.x) { + int lIndex, oIndex; + lA.divmod(index, oIndex, lIndex); + half value = 0.0; + if (oIndex < h && lIndex < l) { + value = B[oIndex * param->bStride[2] + lIndex * param->bStride[1]]; + } + BP[index] = value; } } -void GemmPrepareRerange(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, const float* A, __half* AP, const float* B, __half* BP) { - int maxCount = std::max(cpuParam->elh[0] * cpuParam->elh[1], cpuParam->elh[1] * cpuParam->elh[2]); +template +__global__ void GemmPrearrange_OPT(MatMulParam paramV, const int maxCount, + const int AreaPackA, const int AreaPackB, const int AreaA, const int AreaB, + const T* OA, + __half* OAP, + const T* OB, + __half* OBP, + DivModFast lA, + DivModFast pM + ) { + int index, b; + size_t indexT = blockIdx.x*blockDim.x+threadIdx.x; + pM.divmod(indexT, b, index); + int indexCopy = index; + + auto param = ¶mV; + int e = param->elh[0]; + int l = param->elh[1]; + int h = param->elh[2]; + for (; index < 
AreaPackA && OA != nullptr; index += blockDim.x*gridDim.x) { + int lIndex, oIndex; + lA.divmod(index, oIndex, lIndex); + + __half* AP = OAP + b * AreaPackA; + const T* A = OA + b * AreaA; + half value = 0.0; + if (oIndex < e && lIndex < l) { + value = A[oIndex * param->aStride[0] + lIndex * param->aStride[1]]; + } + AP[index] = value; + } + + index = indexCopy; + for (; index < AreaPackB && OB != nullptr; index += blockDim.x*gridDim.x) { + int lIndex, oIndex; + lA.divmod(index, oIndex, lIndex); + + __half* BP = OBP + b * AreaPackB; + const T* B = OB + b * AreaB; + half value = 0.0; + if (oIndex < h && lIndex < l) { + value = B[oIndex * param->bStride[2] + lIndex * param->bStride[1]]; + } + BP[index] = value; + } +} + +void GemmPrepareRerange(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, const void* A, __half* AP, const void* B, __half* BP, int bytes) { + auto& prop = runtime->prop(); + int threads_num = prop.maxThreadsPerBlock; + int unit_threads_num = ALIMAX(cpuParam->elhPack[0], cpuParam->elhPack[2]) * cpuParam->elhPack[1] * 256; + threads_num = ALIMIN(threads_num, unit_threads_num); + + const int AreaPackA = cpuParam->elhPack[0] * cpuParam->elhPack[1] * 256; + const int AreaPackB = cpuParam->elhPack[1] * cpuParam->elhPack[2] * 256; + const int AreaA = cpuParam->elh[0] * cpuParam->elh[1]; + const int AreaB = cpuParam->elh[1] * cpuParam->elh[2]; + + const int maxPack = ALIMAX(AreaPackA, AreaPackB); + const int maxCount = cpuParam->batch * maxPack; + DivModFast pM(maxPack); int block_num = runtime->blocks_num(maxCount); - int threads_num = runtime->threads_num(); - if (nullptr != AP) { - runtime->memset(AP, 0, cpuParam->elhPack[0] * cpuParam->elhPack[1] * 256 * sizeof(__half)); + int block_size = runtime->threads_num(); + DivModFast lA(cpuParam->elhPack[1] * 16); + if (bytes == 4) { + //GemmPrearrange<<batch, threads_num>>>(*cpuParam, (float*)A, AP, (float*)B, BP, lA); + GemmPrearrange_OPT<<>>(*cpuParam, maxCount, AreaPackA, AreaPackB, AreaA, AreaB, (float*)A, AP, (float*)B, BP, lA, pM); + checkKernelErrors; + } else { + MNN_ASSERT(bytes == 2); + //GemmPrearrange<<batch, threads_num>>>(*cpuParam, (half*)A, AP, (half*)B, BP, lA); + GemmPrearrange_OPT<<>>(*cpuParam, maxCount, AreaPackA, AreaPackB, AreaA, AreaB, (half*)A, AP, (half*)B, BP, lA, pM); + checkKernelErrors; } - if (nullptr != BP) { - runtime->memset(BP, 0, cpuParam->elhPack[2] * cpuParam->elhPack[1] * 256 * sizeof(__half)); - } - GemmPrearrange<<>>(param, A, AP, B, BP); } -__global__ void GemmPacked(const MatMulParam* param, float *c, const half *a, const half *b, const float* biasPtr) { +template +__global__ void GemmPacked(const MatMulParam* param, T *bc, const half *ba, const half *bb, const T* biasPtr) { int eU = param->elhPack[0]; int lU = param->elhPack[1]; int hU = param->elhPack[2]; - int maxCount = eU * hU * warpSize; + int maxCount = eU * hU * warpSize * param->batch; extern __shared__ float sharedMemory[]; for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { - int subIndex = index / warpSize; + int oIndex = index / warpSize; + int subIndex = oIndex % (eU * hU); + int bIndex = oIndex / (eU * hU); int wrapId = threadIdx.x / warpSize; int laneId = threadIdx.x % warpSize; int warpM = subIndex % eU; int warpN = subIndex / eU; + T* c = bc + bIndex * param->elh[0] * param->elh[2]; + const half* a = ba + bIndex * param->elhPack[1] * param->elhPack[0] * 16 * 16; + const half* b = bb + bIndex * param->elhPack[1] * param->elhPack[2] * 16 * 
16; float* cache = sharedMemory + wrapId * 16 * 16; // Declare the fragments - wmma::fragment + wmma::fragment a_frag; - wmma::fragment + wmma::fragment b_frag; wmma::fragment acc_frag; wmma::fill_fragment(acc_frag, 0.0f); - const half* aStart = a + warpM * lU * 16 * 16; - const half* bStart = b + warpN * lU * 16 * 16; + const half* aStart = a + warpM * param->aPStride[0]; + const half* bStart = b + warpN * param->bPStride[0]; //printf("GemmPacked: %d - %d - %d, numele: %d, %d\n", eU, lU, hU, a_frag.num_elements, b_frag.num_elements); // MLA for (int i = 0; i < lU; ++i) { // Load the inputs - wmma::load_matrix_sync(a_frag, aStart + i * 256, 16); - wmma::load_matrix_sync(b_frag, bStart + i * 256, 16); + wmma::load_matrix_sync(a_frag, aStart + i * param->aPStride[1], param->aPStride[2]); + wmma::load_matrix_sync(b_frag, bStart + i * param->bPStride[1], param->bPStride[2]); // Perform the matrix multiplication wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); } wmma::store_matrix_sync(cache, acc_frag, 16, wmma::mem_row_major); - //wmma::store_matrix_sync(c + warpM * 16 * param->elh[2] + 16 * warpN, acc_frag, param->elh[2], wmma::mem_row_major); int eSta = warpM * 16; int eEnd = min(eSta + 16, param->elh[0]); int hSta = warpN * 16; int hEnd = min(hSta + 16, param->elh[2]); int eC = eEnd - eSta; int hC = hEnd - hSta; - float* dstStart = c + hSta * param->cStride[2]; + T* dstStart = c + hSta * param->cStride[2]; if (nullptr != biasPtr) { for (int tId = laneId; tId < eC * hC; tId += warpSize) { int y = tId % eC; int x = tId / eC; int ye = y + eSta; - int yi = ye % param->split[2]; - int yc = ye / param->split[2]; - dstStart[yc * param->cStride[0] + yi * param->cStride[1] + x * param->cStride[2]] = min(max(cache[16 * y + x] + biasPtr[hSta + x], param->minValue), param->maxValue); + float value = cache[16 * y + x]; + float biasValue = biasPtr[hSta + x]; + dstStart[ye * param->cStride[0] + x * param->cStride[2]] = value + biasValue; } } else { for (int tId = laneId; tId < eC * hC; tId += warpSize) { int y = tId % eC; int x = tId / eC; int ye = y + eSta; - int yi = ye % param->split[2]; - int yc = ye / param->split[2]; - dstStart[yc * param->cStride[0] + yi * param->cStride[1] + x * param->cStride[2]] = min(max(cache[16 * y + x], param->minValue), param->maxValue); + float value = cache[16 * y + x]; + dstStart[ye * param->cStride[0] + x * param->cStride[2]] = value; } } } } -void GemmPackedMain(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, float *c, const half *a, const half *b, const float* biasPtr) { +void GemmPackedMain(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, void *c, const half *a, const half *b, const void* biasPtr, int bytes, bool transposeA, bool transposeB) { auto& prop = runtime->prop(); int threads_num = prop.maxThreadsPerBlock; int cores = prop.multiProcessorCount; int sharedMemorySize = 16 * 16 * sizeof(float) * threads_num / prop.warpSize; - cudaFuncSetAttribute(GemmPacked, cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemorySize); - GemmPacked<<>>(param, c, a, b, biasPtr); -} - + if (bytes == 4) { + if (transposeA) { + if (transposeB) { + GemmPacked<<>>(param, (float*)c, a, b, (float*)biasPtr); + } else { + GemmPacked<<>>(param, (float*)c, a, b, (float*)biasPtr); + } + } else { + if (transposeB) { + GemmPacked<<>>(param, (float*)c, a, b, (float*)biasPtr); + } else { + GemmPacked<<>>(param, (float*)c, a, b, (float*)biasPtr); + } + } + } else { + if (transposeA) { + if (transposeB) { + GemmPacked<<>>(param, 
(half*)c, a, b, (half*)biasPtr); + } else { + GemmPacked<<>>(param, (half*)c, a, b, (half*)biasPtr); + } + } else { + if (transposeB) { + GemmPacked<<>>(param, (half*)c, a, b, (half*)biasPtr); + } else { + GemmPacked<<>>(param, (half*)c, a, b, (half*)biasPtr); + } + } + } + checkKernelErrors; +} } } diff --git a/source/backend/cuda/execution/TensorCoreGemm.cuh b/source/backend/cuda/execution/TensorCoreGemm.cuh index bd690739..fe196b58 100644 --- a/source/backend/cuda/execution/TensorCoreGemm.cuh +++ b/source/backend/cuda/execution/TensorCoreGemm.cuh @@ -7,6 +7,7 @@ #include "backend/cuda/core/runtime/CUDARuntime.hpp" #include #define MATMULPACK 16 +#define MATMULPACK2 (MATMULPACK * MATMULPACK) namespace MNN { namespace CUDA { @@ -16,12 +17,20 @@ struct MatMulParam { int aStride[3]; int bStride[3]; int cStride[3]; - int split[3];// a, b, c can split e / h in l + + // Outside E, Outside L, Inside + int aPStride[3]; + + // Outside H, Outside L, Inside + int bPStride[3]; + + int batch = 1; float minValue = -FLT_MAX; float maxValue = FLT_MAX; }; -void GemmPrepareRerange(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, const float* A, __half* AP, const float* B, __half* BP); -void GemmPackedMain(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, float *c, const half *a, const half *b, const float* biasPtr); +void GemmPrepareRerange(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, const void* A, __half* AP, const void* B, __half* BP, int bytes); +void GemmPackedMain(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, void *c, const half *a, const half *b, const void* biasPtr, int bytes, bool transposeA, bool transposeB); + } } #endif \ No newline at end of file diff --git a/source/backend/cuda/execution/TensorCoreGemmPacked.cu b/source/backend/cuda/execution/TensorCoreGemmPacked.cu new file mode 100644 index 00000000..c0342507 --- /dev/null +++ b/source/backend/cuda/execution/TensorCoreGemmPacked.cu @@ -0,0 +1,184 @@ + +#include +#include +#include +#include +#include "TensorCoreGemm.cuh" + +using namespace nvcuda; +namespace MNN { +namespace CUDA { + +template +__global__ void GemmPackedFull(const MatMulParam* param, T *c, const half *a, const half *b, const T* biasPtr) { + int eU = param->elhPack[0]; + int lU = param->elhPack[1]; + int hU = param->elhPack[2]; + int maxCount = eU * hU * warpSize; + int wrapId = threadIdx.x / warpSize; + int laneId = threadIdx.x % warpSize; + extern __shared__ float sharedMemory[]; + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { + size_t subIndex = index / warpSize; + size_t warpM = subIndex % eU; + size_t warpN = subIndex / eU; + T* cache = (T*)(sharedMemory + wrapId * 16 * 16); + // Declare the fragments + wmma::fragment + a_frag; + wmma::fragment + b_frag; + wmma::fragment acc_frag; + + wmma::load_matrix_sync(acc_frag, biasPtr + 16 * warpN, 0, wmma::mem_row_major); + const half* aStart = a + warpM * lU * 16 * 16; + const half* bStart = b + warpN * lU * 16 * 16; + //printf("GemmPacked: %d - %d - %d, numele: %d, %d\n", eU, lU, hU, a_frag.num_elements, b_frag.num_elements); + // MLA + for (int i = 0; i < lU; ++i) { + wmma::load_matrix_sync(a_frag, aStart + i * 256, 16); + wmma::load_matrix_sync(b_frag, bStart + i * 256, 16); + wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); + } + for(int t=0; tminValue); + acc_frag.x[t] = min(acc_frag.x[t], param->maxValue); + } + int eSta = warpM * 16; + 
int eEnd = min(eSta + 16, param->elh[0]); + int eC = eEnd - eSta; + T* dstStart = (T*)(c + warpN * 16 * param->elh[0] + eSta * 16); + wmma::store_matrix_sync(cache, acc_frag, 16, wmma::mem_row_major); + if (warpSize % 16 == 0) { + int r = warpSize / 16; + int x = laneId / r; + int ysta = laneId % r; + for (int y = ysta; y < eC; y+=r) { + float value = *((T*)(cache + 16 * y + x)); + dstStart[y * 16 + x] = value; + } + } else { + for (int tId = laneId; tId < eC * 16; tId += warpSize) { + int y = tId % eC; + int x = tId / eC; + float value = *((T*)(cache + 16 * y + x)); + dstStart[y * 16 + x] = value; + } + } + } +} + +template +__global__ void GemmPackedFull16x32(const MatMulParam* param, T *c, const half *a, const half *b, const T* biasPtr) { + size_t eU = param->elhPack[0]; + size_t lU = param->elhPack[1]; + size_t hU = param->elhPack[2]; + size_t threadCount = blockDim.x / warpSize; + size_t maxCount = eU * (hU / 2); + size_t wrapId = threadIdx.x / warpSize; + size_t laneId = threadIdx.x % warpSize; + extern __shared__ float sharedMemory[]; + T* cache = (T*)(sharedMemory + wrapId * 16 * 32); + for (size_t index = blockIdx.x * threadCount + wrapId; index < maxCount; index += gridDim.x * threadCount) { + size_t warpM = index % eU; + size_t warpN = index / eU; + // Declare the fragments + wmma::fragment + MA0; + wmma::fragment + MB0; + wmma::fragment + MB1; + wmma::fragment MC0; + wmma::fragment MC1; + + wmma::load_matrix_sync(MC0, biasPtr + 32 * warpN + 0, 0, wmma::mem_row_major); + wmma::load_matrix_sync(MC1, biasPtr + 32 * warpN + 16, 0, wmma::mem_row_major); + const half* aStart = a + warpM * lU * 16 * 16; + const half* bStart = b + warpN * lU * 16 * 32; + //printf("GemmPacked: %d - %d - %d, numele: %d, %d\n", eU, lU, hU, a_frag.num_elements, b_frag.num_elements); + // MLA + for (int i = 0; i < lU; ++i) { + wmma::load_matrix_sync(MA0, aStart + i * 256 + 0, 16); + wmma::load_matrix_sync(MB0, bStart + i * 512, 16); + wmma::load_matrix_sync(MB1, bStart + i * 512 + 256, 16); + wmma::mma_sync(MC0, MA0, MB0, MC0); + wmma::mma_sync(MC1, MA0, MB1, MC1); + } + for(int t=0; tminValue); + MC0.x[t] = min(MC0.x[t], param->maxValue); + } + for(int t=0; tminValue); + MC1.x[t] = min(MC1.x[t], param->maxValue); + } + size_t eSta = warpM * 16; + size_t eEnd = ((eSta + (size_t)16) > (size_t)param->elh[0]) ? 
(size_t)param->elh[0] : (eSta + (size_t)16); + size_t eC = eEnd - eSta; + T* dst0 = (T*)(c + warpN * 32 * param->elh[0] + eSta * 16); + T* dst1 = (T*)(c + (warpN * 32 + 16) * param->elh[0] + eSta * 16); + // First 8x32 + wmma::store_matrix_sync(cache, MC0, 16, wmma::mem_row_major); + // Second 8x32 + wmma::store_matrix_sync(cache + 256, MC1, 16, wmma::mem_row_major); + auto dst = dst0; + auto src = cache; + if (laneId >= 16) { + dst = dst1; + src = cache + 256; + } + int x = laneId % 16; + for (size_t y = 0; y < eC; ++y) { + dst[y * 16 + x] = src[y * 16 + x]; + } + } +} + +void GemmPackedFullMain(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, void *c, const half *a, const half *b, const half* biasPtr, int bytes) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + //MNN_PRINT("%d - %d - %d - %d\n", cpuParam->elhPack[0], cpuParam->elhPack[1], cpuParam->elhPack[2], cpuParam->elh[2]); + { + int maxThreadInWarp = UP_DIV(cpuParam->elhPack[0] * cpuParam->elhPack[2], cores); + int threads_num = std::min(prop.maxThreadsPerBlock, maxThreadInWarp * prop.warpSize); + int basicMemory = 16 * 16 * sizeof(float) * prop.maxThreadsPerBlock / prop.warpSize; + if (4 == bytes) { + cudaFuncSetAttribute(GemmPackedFull, cudaFuncAttributeMaxDynamicSharedMemorySize, prop.sharedMemPerMultiprocessor); + GemmPackedFull<<>>(param, (float*)c, a, b, (float*)biasPtr); + checkKernelErrors; + } else { + //MNN_PRINT("%d - %d, %d- %d\n", cpuParam->elhPack[0], cpuParam->elhPack[2], cpuParam->elh[0], cpuParam->elh[2]); + cudaFuncSetAttribute(GemmPackedFull, cudaFuncAttributeMaxDynamicSharedMemorySize, prop.sharedMemPerMultiprocessor); + GemmPackedFull<<>>(param, (half*)c, a, b, (half*)biasPtr); + checkKernelErrors; + } + } +} + + +void GemmPacked16x32(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, void *c, const half *a, const half *b, const half* biasPtr, int bytes) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + // MNN_PRINT("%d - %d - %d\n", cpuParam->elhPack[0], cpuParam->elhPack[1], cpuParam->elhPack[2]); + { + int hUP = cpuParam->elhPack[2] / 2; + int maxThreadInWarp = UP_DIV(cpuParam->elhPack[0] * hUP, cores); + int threads_num = ALIMIN(512, maxThreadInWarp * prop.warpSize); + //MNN_PRINT("GemmPacked16x32:%d-%d-%d-%d-%d\n\n", hUP, cpuParam->elhPack[0], cpuParam->elhPack[2], cpuParam->elhPack[0]*cpuParam->elhPack[2], threads_num); + threads_num = ALIMIN(prop.maxThreadsPerBlock, threads_num); + int basicMemory = 32 * 16 * sizeof(float) * (threads_num / prop.warpSize); + if (4 == bytes) { + cudaFuncSetAttribute(GemmPackedFull16x32, cudaFuncAttributeMaxDynamicSharedMemorySize, basicMemory); + GemmPackedFull16x32<<>>(param, (float*)c, a, b, (float*)biasPtr); + checkKernelErrors; + } else { + cudaFuncSetAttribute(GemmPackedFull16x32, cudaFuncAttributeMaxDynamicSharedMemorySize, basicMemory); + GemmPackedFull16x32<<>>(param, (half*)c, a, b, (half*)biasPtr); + checkKernelErrors; + } + } +} + +} +} \ No newline at end of file diff --git a/source/backend/cuda/execution/TensorCoreGemmPacked.cuh b/source/backend/cuda/execution/TensorCoreGemmPacked.cuh new file mode 100644 index 00000000..637c3715 --- /dev/null +++ b/source/backend/cuda/execution/TensorCoreGemmPacked.cuh @@ -0,0 +1,8 @@ +#include "TensorCoreGemm.cuh" +namespace MNN { +namespace CUDA { + +void GemmPackedFullMain(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, void *c, const half *a, const half *b, const half* biasPtr, int bytes); 
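// Editorial sketch (not part of the original patch): both launchers declared
// here give every warp one packed output tile and derive the block size from
// how many tiles each SM has to cover. A compilable model of that sizing rule;
// eU, hU, smCount, warpSize and maxThreadsPerBlock are hypothetical stand-ins
// for elhPack[0], elhPack[2] and the CUDA device properties.
static inline int upDiv(int a, int b) { return (a + b - 1) / b; }
static inline int pickThreadsPerBlock(int eU, int hU, int smCount,
                                      int warpSize, int maxThreadsPerBlock) {
    int warpsPerSm = upDiv(eU * hU, smCount); // one warp per 16x16 output tile
    int threads = warpsPerSm * warpSize;      // warps -> threads per block
    return threads < maxThreadsPerBlock ? threads : maxThreadsPerBlock;
}
// Dynamic shared memory then provides a 16x16 float cache per warp
// (16x32 per warp for the GemmPacked16x32 variant below).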
+void GemmPacked16x32(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, void *c, const half *a, const half *b, const half* biasPtr, int bytes); +} +} \ No newline at end of file diff --git a/source/backend/cuda/execution/Transpose.cu b/source/backend/cuda/execution/Transpose.cu new file mode 100644 index 00000000..84930b40 --- /dev/null +++ b/source/backend/cuda/execution/Transpose.cu @@ -0,0 +1,291 @@ +// +// Transpose.cu +// MNN +// +// Created by MNN on b'2021/12/09'. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "Transpose.cuh" +#include "core/Macro.h" +#include "MNNCUDADefine.hpp" +#include "MNNCUDAFunction.cuh" +namespace MNN { +namespace CUDA { + +template +__global__ void UNPACKCOMMON_4(const T0 *input, T1 *output, + const int total, int inside, int axis, int outside, + int insideStride, int axisStride, + DivModFast is, DivModFast os + ) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) { + int tmpI = i >> 2; + int yR = i & 3; + int x, tmp, yC, z; + is.divmod(tmpI, tmp, x); + os.divmod(tmp, yC, z); + int y = (yC << 2) + yR; + int srcOffset = ((z * inside + yC * inside * outside + x) << 2) + yR; + int dstOffset = x * insideStride + y * axisStride + z * inside * axis; + if (y < axis) { + output[dstOffset] = input[srcOffset]; + } + } +} + +template +__global__ void UNPACKCOMMON(const T0 *input, T1 *output, + int inside, int axis, int outside, + int insideStride, int axisStride + ) { + int axisAlign = UP_DIV(axis, PACK_NUMBER) * PACK_NUMBER;; + int total = axisAlign * inside * outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) { + int tmpI = i >> 4; + int yR = i & 15; + int x = tmpI % inside; + int tmp = tmpI / inside; + int yC = tmp / outside; + int z = tmp % outside; + int y = yC * PACK_NUMBER + yR; + int srcOffset = PACK_NUMBER * (z * inside + yC * inside * outside + x) + yR; + int dstOffset = x * insideStride + y * axisStride + z * inside * axis; + if (y < axis) { + output[dstOffset] = input[srcOffset]; + } + } +} + +template +__global__ void PACKCOMMON_4(const T0 *input, T1 *output, + int inside, int axis, int outside, + int insideStride, int axisStride + ) { + int axisAlign = UP_DIV(axis, 4) * 4;; + int total = axisAlign * inside * outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) { + int tmpI = i >> 2; + int yR = i & 3; + int x = tmpI % inside; + int tmp = tmpI / inside; + int yC = tmp / outside; + int z = tmp % outside; + int y = yC * 4 + yR; + int dstOffset = 4 * (z * inside + yC * inside * outside + x) + yR; + int srcOffset = x * insideStride + y * axisStride + z * inside * axis; + if (y < axis) { + output[dstOffset] = input[srcOffset]; + } else { + output[dstOffset] = {0, 0, 0, 0}; + } + } +} +template +__global__ void PACKCOMMON(const T0 *input, T1 *output, + int inside, int axis, int outside, + int insideStride, int axisStride + ) { + int axisAlign = UP_DIV(axis, PACK_NUMBER) * PACK_NUMBER;; + int total = axisAlign * inside * outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) { + int tmpI = i >> 4; + int yR = i & 15; + int x = tmpI % inside; + int tmp = tmpI / inside; + int yC = tmp / outside; + int z = tmp % outside; + int y = yC * PACK_NUMBER + yR; + int dstOffset = PACK_NUMBER * (z * inside + yC * inside * outside + x) + yR; + int srcOffset = x * insideStride + y * axisStride + z * inside * axis; + if (y < 
axis) { + output[dstOffset] = input[srcOffset]; + } else { + output[dstOffset] = 0.0; + } + } +} + +void PackBuffer(void* output, const void* input, const PackInfo* info, int bytes, CUDARuntime* runtime) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + int threadNumbers = prop.maxThreadsPerBlock; + if (info->axis % 4 == 0 && info->axisStride == 1 && \ + bytes == 4 && info->insideStride == info->axis) { + PACKCOMMON_4<<>>((const int4*)input, (int4*)output, + info->inside, info->axis / 4, info->outside, + info->insideStride / 4, info->axisStride); + return; + } + switch (bytes) { + case 4: + PACKCOMMON<<>>((const float*)input, (float*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); + break; + case 2: + PACKCOMMON<<>>((const half*)input, (half*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); + break; + case 1: + PACKCOMMON<<>>((const int8_t*)input, (int8_t*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); + break; + default: + break; + } +} +void UnpackBuffer(void* output, const void* input, const PackInfo* info, int bytes, CUDARuntime* runtime) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + int threadNumbers = prop.maxThreadsPerBlock; + + if (info->axis % 4 == 0 && info->axisStride == 1 && bytes == 4 && info->insideStride == info->axis) { + DivModFast is(info->inside); + DivModFast os(info->outside); + const int maxCount = info->inside * UP_DIV(info->axis / 4, 4) * 4 * info->outside; + int block_num = runtime->blocks_num(maxCount); + int block_size = runtime->threads_num(); + UNPACKCOMMON_4<<>>((const int4*)input, (int4*)output, + maxCount, info->inside, info->axis / 4, info->outside, + info->insideStride / 4, info->axisStride, is, os); + return; + } + switch (bytes) { + case 4: + UNPACKCOMMON<<>>((const float*)input, (float*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); + break; + case 2: + UNPACKCOMMON<<>>((const half*)input, (half*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); + break; + case 1: + UNPACKCOMMON<<>>((const int8_t*)input, (int8_t*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); + break; + default: + break; + } +} + +void PackFP32ToFP16(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + int threadNumbers = prop.maxThreadsPerBlock; + PACKCOMMON<<>>((const float*)input, (half*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); +} +void PackFP16ToFP32(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + int threadNumbers = prop.maxThreadsPerBlock; + PACKCOMMON<<>>((const half*)input, (float*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); +} + +void UnpackFP16ToFP32(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + int threadNumbers = prop.maxThreadsPerBlock; + UNPACKCOMMON<<>>((const half*)input, (float*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); +} +void UnpackFP32ToFP16(void* output, const void* input, const PackInfo* info, 
CUDARuntime* runtime) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + int threadNumbers = prop.maxThreadsPerBlock; + UNPACKCOMMON<<>>((const float*)input, (half*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); +} + + + +template +__global__ void TRANSPOSE(const T *input, T *output, const TransposeParam* param) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < param->total) { + int x = i % param->dims[0]; + int tmp = i / param->dims[0]; + int y = tmp % param->dims[1]; + int z = tmp / param->dims[1]; + int srcOffset = param->srcStride * z + y + x * param->dims[2]; + int dstOffset = param->dstStride * z + x + y * param->dims[3]; + output[dstOffset] = input[srcOffset]; + } +} +#define LOCAL_DIM 8 + +template +__global__ void TRANSPOSE_LOCAL(const T* input, T *output, const TransposeParam* param) { + __shared__ T localM[LOCAL_DIM][LOCAL_DIM + 1]; + int num = blockIdx.z; + for (int n = num; n < param->size; n += gridDim.z) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x < param->dims[0] && y < param->dims[1]) { + int offset = n * param->srcStride + x * param->dims[2] + y; + localM[threadIdx.y][threadIdx.x] = input[offset]; + } + __syncthreads(); + x = blockIdx.y * blockDim.y + threadIdx.x; + y = blockIdx.x * blockDim.x + threadIdx.y; + if (x < param->dims[1] && y < param->dims[0]) { + int offset = n * param->dstStride + x * param->dims[3] + y; + output[offset] = localM[threadIdx.x][threadIdx.y]; + } + } +} + +void Transpose(uint8_t* output, const uint8_t* input, const TransposeParam* cpuParam, const TransposeParam* gpuRegion, int bytes, CUDARuntime* runtime) { + int count = cpuParam->total; + int block_num = runtime->blocks_num(count); + int threads_num = runtime->threads_num(); + auto out = output + bytes * cpuParam->dstOffset; + auto inp = input + bytes * cpuParam->srcOffset; + if (runtime->prop().maxThreadsPerBlock >= LOCAL_DIM * LOCAL_DIM && (cpuParam->dims[0] >= LOCAL_DIM || cpuParam->dims[1] >= LOCAL_DIM)) { + dim3 localSize(LOCAL_DIM, LOCAL_DIM, 1); + //printf("%d, %d - %d, %d - %d\n", cpuParam->size, cpuParam->dims[0], cpuParam->dims[1], cpuParam->dims[2], cpuParam->dims[3]); + int globalZ = ALIMIN(runtime->prop().multiProcessorCount, cpuParam->size); + dim3 globalSize(UP_DIV(cpuParam->dims[0], LOCAL_DIM), UP_DIV(cpuParam->dims[1], LOCAL_DIM), globalZ); + switch (bytes) { + case 4: + TRANSPOSE_LOCAL<<>>((const float *)inp, (float *)out, gpuRegion); + break; + case 2: + TRANSPOSE_LOCAL<<>>((const half *)inp, (half *)out, gpuRegion); + break; + case 1: + TRANSPOSE_LOCAL<<>>((const int8_t *)inp, (int8_t *)out, gpuRegion); + break; + default: + break; + } + return; + } + switch (bytes) { + case 4: + TRANSPOSE<<>>((int*)inp, (int*)out, gpuRegion); + break; + case 2: + TRANSPOSE<<>>((int16_t*)inp, (int16_t*)out, gpuRegion); + break; + case 1: + TRANSPOSE<<>>((int8_t*)inp, (int8_t*)out, gpuRegion); + break; + default: + break; + } +} + +}; +}; \ No newline at end of file diff --git a/source/backend/cuda/execution/Transpose.cuh b/source/backend/cuda/execution/Transpose.cuh new file mode 100644 index 00000000..480369e6 --- /dev/null +++ b/source/backend/cuda/execution/Transpose.cuh @@ -0,0 +1,44 @@ +// +// Transpose.cuh +// MNN +// +// Created by MNN on b'2021/12/09'. 
+// Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifndef Transpose_cuh
+#define Transpose_cuh
+#include "backend/cuda/core/runtime/CUDARuntime.hpp"
+namespace MNN {
+namespace CUDA {
+
+struct PackInfo {
+    int outside;
+    int inside;
+    int axis;
+    int unit;
+    int insideStride;
+    int axisStride;
+};
+void UnpackBuffer(void* output, const void* input, const PackInfo* info, int bytes, CUDARuntime* runtime);
+void PackBuffer(void* output, const void* input, const PackInfo* info, int bytes, CUDARuntime* runtime);
+void PackFP16ToFP32(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime);
+void PackFP32ToFP16(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime);
+void UnpackFP16ToFP32(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime);
+void UnpackFP32ToFP16(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime);
+
+struct TransposeParam {
+    int dims[4];
+    int srcOffset;
+    int srcStride;
+    int dstOffset;
+    int dstStride;
+    int size;
+    int total;
+};
+void Transpose(uint8_t* output, const uint8_t* input, const TransposeParam* cpuParam, const TransposeParam* gpuRegion, int bytes, CUDARuntime* runtime);
+
+}
+}
+
+#endif
diff --git a/source/backend/cuda/execution/UnaryExecution.cu b/source/backend/cuda/execution/UnaryExecution.cu
index 42e0ea5a..6d071d8b 100644
--- a/source/backend/cuda/execution/UnaryExecution.cu
+++ b/source/backend/cuda/execution/UnaryExecution.cu
@@ -21,7 +21,7 @@ void callUnary(void *input, void *output, size_t count, MNN::CUDARuntime* runtim
 {
     Tensor::InsideDescribe::Region reg;
     reg.size[2] = count;
-    UnaryBlit((uint8_t*)output, (const uint8_t*)input, reg.size, reg.src.stride, reg.dst.stride, 4, runtime, op_type);
+    UnaryBlit((uint8_t*)output, (const uint8_t*)input, reg.size, reg.src.stride, reg.dst.stride, data_type.bytes(), runtime, op_type);
     return;
 }
@@ -41,6 +41,9 @@ ErrorCode UnaryExecution::onExecute(const std::vector& inputs, const st
     MNN_PRINT("start UnaryExecution onExecute...");
 #endif
     auto type = inputs[0]->getType();
+    if (static_cast<CUDABackend*>(backend())->useFp16()) {
+        type.bits = 16;
+    }
     callUnary((void*)inputs[0]->deviceId(), (void*)outputs[0]->deviceId(), mCount, mRuntime, type, mOpType);
 #ifdef LOG_VERBOSE
     MNN_PRINT("end UnaryExecution onExecute...");
@@ -58,6 +61,15 @@ __global__ void RELU(const float *input, float *output, size_t count, float slop
     return;
 }
+__global__ void RELU_Half(const half *input, half *output, size_t count, float slope) {
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
+        float x = input[i];
+        float y = x > 0 ?
x : x * slope; + output[i] = (half)y; + } + return; +} + class ReluExecution : public Execution { public: ReluExecution(Backend* bn, float slope) : Execution(bn) { @@ -71,7 +83,11 @@ public: int threads_num = runtime->threads_num(); auto input = inputs[0]->deviceId(); auto output = outputs[0]->deviceId(); - RELU<<>>((float*)input, (float*)output, count, mSlope); + if (static_cast(backend())->useFp16()) { + RELU_Half<<>>((half*)input, (half*)output, count, mSlope); + } else { + RELU<<>>((float*)input, (float*)output, count, mSlope); + } return NO_ERROR; } private: @@ -79,7 +95,8 @@ private: }; -__global__ void CLAMP(const float *input, float *output, size_t count, float minV, float maxV) { +template +__global__ void CLAMP(const T *input, T *output, size_t count, float minV, float maxV) { for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { float x = input[i]; float y = min(max(x, minV), maxV); @@ -101,7 +118,11 @@ public: int threads_num = runtime->threads_num(); auto input = inputs[0]->deviceId(); auto output = outputs[0]->deviceId(); - CLAMP<<>>((float*)input, (float*)output, count, mMinV, mMaxV); + if (static_cast(backend())->useFp16()) { + CLAMP<<>>((half*)input, (half*)output, count, mMinV, mMaxV); + } else { + CLAMP<<>>((float*)input, (float*)output, count, mMinV, mMaxV); + } return NO_ERROR; } private: @@ -117,6 +138,14 @@ __global__ void CAST(T1 *input, T2 *output, size_t count) { return; } +template +__global__ void CASTMIDFLOAT(T1 *input, T2 *output, size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = (T2)((float)input[i]); + } + return; +} + __global__ void CASTBOOL(int32_t *input, int32_t *output, size_t count) { for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { output[i] = input[i] > 0 ? 
1 : 0; @@ -152,29 +181,52 @@ public: auto dstT = _mapDataType(mDst); const auto &inputDataType = inputs[0]->getType(); - + if (inputs[0]->buffer().type == outputs[0]->buffer().type) { + runtime->memcpy((void*)output, (void*)input, count * static_cast(backend())->getBytes(inputs[0]), MNNMemcpyDeviceToDevice, true); + return NO_ERROR; + } if (inputDataType.bytes() == 4 && mDst == MNN::DataType_DT_BOOL) { CASTBOOL<<>>((int32_t*)input, (int32_t*)output, count); - } else if (inputs[0]->buffer().type == outputs[0]->buffer().type) { - runtime->memcpy((void*)output, (void*)input, count * inputDataType.bytes(), MNNMemcpyDeviceToDevice, true); - } else if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { - CAST<<>>((float*)input, (int*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CAST<<>>((int*)input, (float*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CAST<<>>((uint8_t*)input, (float*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CAST<<>>((int8_t*)input, (float*)output, count); - } else if (dstT == MNN::DataType_DT_INT8 && halide_type_of() == inputDataType) { - CAST<<>>((float*)input, (int8_t*)output, count); - } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { - CAST<<>>((float*)input, (uint8_t*)output, count); + return NO_ERROR; + } + if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { + CAST<<>>((int8_t*)input, (int32_t*)output, count); + return NO_ERROR; } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { CAST<<>>((int32_t*)input, (uint8_t*)output, count); + return NO_ERROR; } else if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { CAST<<>>((uint8_t*)input, (int32_t*)output, count); - } else if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { - CAST<<>>((int8_t*)input, (int32_t*)output, count); + return NO_ERROR; + } + if (static_cast(backend())->useFp16()) { + if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((half*)input, (int*)output, count); + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int*)input, (half*)output, count); + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((uint8_t*)input, (half*)output, count); + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int8_t*)input, (half*)output, count); + } else if (dstT == MNN::DataType_DT_INT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((half*)input, (int8_t*)output, count); + } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((half*)input, (uint8_t*)output, count); + } + } else { + if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((float*)input, (int*)output, count); + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int*)input, (float*)output, count); + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((uint8_t*)input, (float*)output, count); + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int8_t*)input, (float*)output, count); + } else if (dstT == 
MNN::DataType_DT_INT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((float*)input, (int8_t*)output, count); + } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((float*)input, (uint8_t*)output, count); + } } return NO_ERROR; } diff --git a/source/backend/metal/MetalBackend.hpp b/source/backend/metal/MetalBackend.hpp index 8fc6495e..d2cca918 100644 --- a/source/backend/metal/MetalBackend.hpp +++ b/source/backend/metal/MetalBackend.hpp @@ -95,8 +95,8 @@ public: // Do nothing } virtual ~ MetalRuntimeAllocator() = default; - virtual std::pair onAlloc(int size, int align) override; - virtual void onRelease(std::pair ptr) override; + virtual std::pair onAlloc(size_t size, size_t align) override; + virtual void onRelease(std::pair ptr) override; private: id mDevice; diff --git a/source/backend/metal/MetalBackend.mm b/source/backend/metal/MetalBackend.mm index 96a49870..ba697a8d 100644 --- a/source/backend/metal/MetalBackend.mm +++ b/source/backend/metal/MetalBackend.mm @@ -841,12 +841,12 @@ bool MetalRuntime::onSetCache(const void* buffer, size_t size) {//set Cache return setCache(std::make_pair(buffer, size)); } -std::pair MetalRuntimeAllocator::onAlloc(int size, int align) { +std::pair MetalRuntimeAllocator::onAlloc(size_t size, size_t align) { auto buffer = [mDevice newBufferWithLength:size options:MTLCPUCacheModeDefaultCache]; auto mMetalBufferAlloc = new MetalBufferAlloc(buffer); return std::make_pair((void *)mMetalBufferAlloc, 0); } -void MetalRuntimeAllocator::onRelease(std::pair ptr) { +void MetalRuntimeAllocator::onRelease(std::pair ptr) { delete (MetalBufferAlloc *)ptr.first; } diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp index dc402dd9..dd557294 100644 --- a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp +++ b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp @@ -579,21 +579,21 @@ cl_int CL_API_CALL clEnqueueCopyImage(cl_command_queue queue, //} // clSVMAlloc wrapper, use OpenCLWrapper function. -void *clSVMAlloc(cl_context context, cl_mem_flags flags, size_t size, cl_uint align) { +void* CL_API_CALL clSVMAlloc(cl_context context, cl_mem_flags flags, size_t size, cl_uint align) { auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clSVMAlloc; MNN_CHECK_NOTNULL(func); return func(context, flags, size, align); } // clSVMFree wrapper, use OpenCLWrapper function. -void clSVMFree(cl_context context, void *buffer) { +void CL_API_CALL clSVMFree(cl_context context, void *buffer) { auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clSVMFree; MNN_CHECK_NOTNULL(func); func(context, buffer); } // clEnqueueSVMMap wrapper, use OpenCLWrapper function. -cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking, cl_map_flags flags, void *host_ptr, +cl_int CL_API_CALL clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking, cl_map_flags flags, void *host_ptr, size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) { auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clEnqueueSVMMap; MNN_CHECK_NOTNULL(func); @@ -601,7 +601,7 @@ cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking, cl_map_ } // clEnqueueSVMUnmap wrapper, use OpenCLWrapper function. 
-cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *host_ptr, cl_uint num_events_in_wait_list, +cl_int CL_API_CALL clEnqueueSVMUnmap(cl_command_queue command_queue, void *host_ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) { auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clEnqueueSVMUnmap; MNN_CHECK_NOTNULL(func); @@ -609,7 +609,7 @@ cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *host_ptr, cl_uint } // clSetKernelArgSVMPointer wrapper, use OpenCLWrapper function. -cl_int clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint index, const void *host_ptr) { +cl_int CL_API_CALL clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint index, const void *host_ptr) { auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clSetKernelArgSVMPointer; MNN_CHECK_NOTNULL(func); return func(kernel, index, host_ptr); diff --git a/source/backend/vulkan/component/VulkanMemoryPool.cpp b/source/backend/vulkan/component/VulkanMemoryPool.cpp index 993fc69b..2d97e118 100644 --- a/source/backend/vulkan/component/VulkanMemoryPool.cpp +++ b/source/backend/vulkan/component/VulkanMemoryPool.cpp @@ -25,7 +25,7 @@ public: virtual ~ VulkanAllocator() { // Do nothing } - virtual std::pair onAlloc(int size, int align) override { + virtual std::pair onAlloc(size_t size, size_t align) override { VkMemoryAllocateInfo info; info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; info.pNext = nullptr; @@ -34,7 +34,7 @@ public: auto mem = new VulkanMemory(mDevice, info); return std::make_pair(mem, 0); } - virtual void onRelease(std::pair ptr) override { + virtual void onRelease(std::pair ptr) override { auto p = (VulkanMemory*)ptr.first; delete p; } diff --git a/source/common/WinogradInt8Helper.hpp b/source/common/WinogradInt8Helper.hpp index 3a370c50..6cc92ec4 100644 --- a/source/common/WinogradInt8Helper.hpp +++ b/source/common/WinogradInt8Helper.hpp @@ -13,7 +13,7 @@ #include "core/Macro.h" namespace MNN { -class MNN_PUBLIC WinogradInt8Helper { +class WinogradInt8Helper { public: static void transformWeight(const std::vector& weight, std::vector& transWeight, std::vector& attrs, int oc, int ic, int kernelY, int kernelX) { diff --git a/source/core/BufferAllocator.cpp b/source/core/BufferAllocator.cpp index c168b868..57a1a396 100644 --- a/source/core/BufferAllocator.cpp +++ b/source/core/BufferAllocator.cpp @@ -20,10 +20,10 @@ public: virtual ~ DefaultAllocator() { // Do nothing } - virtual std::pair onAlloc(int size, int align) { + virtual std::pair onAlloc(size_t size, size_t align) { return std::make_pair(MNNMemoryAllocAlign(size, MNN_MEMORY_ALIGN_DEFAULT), 0); } - virtual void onRelease(std::pair ptr) { + virtual void onRelease(std::pair ptr) { MNN_ASSERT(ptr.second == 0); MNNMemoryFreeAlign(ptr.first); } @@ -36,10 +36,10 @@ public: virtual ~ RecurseAllocator() { // Do nothing } - virtual std::pair onAlloc(int size, int align) override { + virtual std::pair onAlloc(size_t size, size_t align) override { return mParent->alloc(size, false, align); } - virtual void onRelease(std::pair ptr) override { + virtual void onRelease(std::pair ptr) override { mParent->free(ptr); } private: @@ -62,7 +62,7 @@ BufferAllocator::Node::~Node() { outside->onRelease(pointer); } } -std::pair BufferAllocator::alloc(int size, bool seperate, int align) { +std::pair BufferAllocator::alloc(size_t size, bool seperate, size_t align) { #ifdef DUMP_USAGE auto memoryUsed = size / 1024.0f / 1024.0f; MNN_PRINT("Alloc: %f\n", memoryUsed); @@ -70,7 +70,7 @@ std::pair 
BufferAllocator::alloc(int size, bool seperate, int align) if (0 == align) { align = mAlign; } - std::pair pointer; + std::pair pointer; // reuse if possible if (!seperate) { if (nullptr != mCurrentFreeList) { @@ -138,7 +138,7 @@ void BufferAllocator::returnMemory(FREELIST* listP, SharedPtr node, bool p } } -bool BufferAllocator::free(std::pair pointer) { +bool BufferAllocator::free(std::pair pointer) { // get node auto x = mUsedList.find(pointer); if (x == mUsedList.end()) { @@ -202,11 +202,11 @@ void BufferAllocator::endGroup() { mCurrentFreeList = nullptr; } -std::pair BufferAllocator::getFromFreeList(FREELIST* list, int size, bool permiteSplit, int align) { +std::pair BufferAllocator::getFromFreeList(FREELIST* list, size_t size, bool permiteSplit, size_t align) { #ifdef MNN_DEBUG_MEMORY return std::make_pair(nullptr, 0); #endif - int realSize = size; + size_t realSize = size; bool needExtraSize = mAlign % align != 0; if (needExtraSize) { realSize = size + align - 1; @@ -220,7 +220,7 @@ std::pair BufferAllocator::getFromFreeList(FREELIST* list, int size, auto pointer = x->second->pointer; // Align offset if (needExtraSize) { - int originOffset = pointer.second; + size_t originOffset = pointer.second; pointer.second = UP_DIV(originOffset, align) * align; realSize = size + pointer.second - originOffset; } diff --git a/source/core/BufferAllocator.hpp b/source/core/BufferAllocator.hpp index 1cb71817..447a370c 100644 --- a/source/core/BufferAllocator.hpp +++ b/source/core/BufferAllocator.hpp @@ -25,8 +25,8 @@ public: public: Allocator() = default; virtual ~ Allocator() = default; - virtual std::pair onAlloc(int size, int align) = 0; - virtual void onRelease(std::pair ptr) = 0; + virtual std::pair onAlloc(size_t size, size_t align) = 0; + virtual void onRelease(std::pair ptr) = 0; static std::shared_ptr createDefault(); static std::shared_ptr createRecurse(BufferAllocator* parent); }; @@ -34,7 +34,7 @@ public: * @brief init buffer allocator with pointer alignment. * @param align given pointer alignment. */ - BufferAllocator(std::shared_ptr parent, int align = MNN_MEMORY_ALIGN_DEFAULT) : mAllocator(parent), mAlign(align) { + BufferAllocator(std::shared_ptr parent, size_t align = MNN_MEMORY_ALIGN_DEFAULT) : mAllocator(parent), mAlign(align) { // nothing to do } /** @@ -53,7 +53,7 @@ public: * @sa free * @sa release */ - std::pair alloc(int size, bool seperate = false, int align = 0); + std::pair alloc(size_t size, bool seperate = false, size_t align = 0); /** * @brief mark CHUNK pointer as reusable. @@ -61,7 +61,7 @@ public: * @return true if pointer is a CHUNK pointer, false otherwise. * @sa release */ - bool free(std::pair pointer); + bool free(std::pair pointer); /** * @brief free all allocated memories. 
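// Editorial sketch (not part of the original patch): the int -> size_t change
// throughout this allocator interface is what allows a single allocation or
// offset to exceed INT_MAX bytes on 64-bit targets. A minimal illustration of
// the overflow the wider type avoids:
#include <climits>
#include <cstddef>
// A hypothetical 3 GiB request: fine as size_t on a 64-bit build, but it does
// not fit in the old `int size` parameter (INT_MAX is about 2.1 GB).
static const size_t kThreeGiB = 3ull * 1024 * 1024 * 1024;
static const bool kFitsInInt = kThreeGiB <= (size_t)INT_MAX; // false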
@@ -96,26 +96,26 @@ private: class Node : public RefCount { public: ~Node(); - std::pair pointer; + std::pair pointer; SharedPtr parent = nullptr; - int32_t size; - int16_t useCount = 0; + size_t size; + size_t useCount = 0; Allocator* outside = nullptr; }; typedef std::multimap> FREELIST; static void returnMemory(FREELIST* list, SharedPtr node, bool permitMerge = true); - std::pair getFromFreeList(FREELIST* list, int size, bool permiteSplit, int align); + std::pair getFromFreeList(FREELIST* list, size_t size, bool permiteSplit, size_t align); - std::map, SharedPtr> mUsedList; + std::map, SharedPtr> mUsedList; FREELIST mFreeList; size_t mTotalSize = 0; FREELIST* mCurrentFreeList = nullptr; std::vector> mGroups; std::shared_ptr mAllocator; - int mAlign; + size_t mAlign; }; } // namespace MNN #endif diff --git a/source/core/Interpreter.cpp b/source/core/Interpreter.cpp index 04a94086..ca12094a 100644 --- a/source/core/Interpreter.cpp +++ b/source/core/Interpreter.cpp @@ -18,6 +18,7 @@ #include "core/Pipeline.hpp" #include "core/RuntimeFactory.hpp" #include "core/Session.hpp" +#include #ifdef MNN_INTERNAL_ENABLED #include "internal/auth/ModelAuth.hpp" @@ -37,6 +38,8 @@ struct Content { std::string cacheFile; std::mutex lock; size_t lastCacheSize = 0; + std::string bizCode; + std::string uuid; }; static void writeCacheFile(const Content *net, std::pair buffer) { @@ -215,6 +218,9 @@ ErrorCode Interpreter::updateCacheFile(Session *session, int flag) { Interpreter::Interpreter(Content* net) { MNN_ASSERT(nullptr != net); mNet = net; + // Store bizcode and uuid because we need them even after `releaseModel` is called. + mNet->bizCode = std::string(mNet->net->bizCode() ? mNet->net->bizCode()->c_str() : ""); + mNet->uuid = std::string(mNet->net->mnn_uuid() ? mNet->net->mnn_uuid()->c_str() : ""); } Interpreter::~Interpreter() { @@ -296,12 +302,10 @@ Session* Interpreter::createMultiPathSession(const std::vector& mNet->sessions.emplace_back(std::move(newSession)); #ifdef MNN_INTERNAL_ENABLED - std::string bizCode = std::string(mNet->net->bizCode() ? mNet->net->bizCode()->c_str() : ""); - std::string uuid = std::string(mNet->net->mnn_uuid() ? mNet->net->mnn_uuid()->c_str() : ""); std::map metrics; - metrics.emplace("Model_UUID", uuid); - metrics.emplace("Model_BizCode", bizCode); - metrics.emplace("Event", "CreateSession"); + metrics.emplace("Model_UUID", mNet->uuid); + metrics.emplace("Model_BizCode", mNet->bizCode); + metrics.emplace("Event", "CREATE_SESSION"); metrics.emplace("Backend", std::to_string(configs[0].type)); metrics.emplace("Precision", configs[0].backendConfig ? std::to_string(configs[0].backendConfig->precision) : ""); metrics.emplace("API", "Interpreter::createMultiPathSession"); @@ -342,7 +346,32 @@ bool Interpreter::releaseSession(Session* session) { } ErrorCode Interpreter::runSession(Session* session) const { - return session->run(); + Timer timer; + ErrorCode errorcode = session->run(); + +#ifdef MNN_INTERNAL_ENABLED + int backendType[MNN_FORWARD_ALL] ; + session->getInfo(MNN::Interpreter::BACKENDS, backendType); + + // Only log the performance of CPU backend inference. + if (backendType[0] == MNN_FORWARD_CPU) { + float costTime = (float)timer.durationInUs() / (float)1000; + std::map metrics; + metrics.emplace("Model_UUID", mNet->uuid); + metrics.emplace("Model_BizCode", mNet->bizCode); + metrics.emplace("Event", "RUN_SESSION"); + metrics.emplace("Backend", std::to_string(MNN_FORWARD_CPU)); // "Precision" is not logged here. Don't need it. 
+ metrics.emplace("InferTimeMs", std::to_string(costTime)); + metrics.emplace("ErrorCode", std::to_string(errorcode)); + metrics.emplace("API", "Interpreter::runSession"); + auto basicMetrics = getBasicLoggingData(); + metrics.insert(basicMetrics.begin(), basicMetrics.end()); + logAsync(metrics); + return errorcode; + } +#endif // MNN_INTERNAL_ENABLED + + return errorcode; } Tensor* Interpreter::getSessionInput(const Session* session, const char* name) { @@ -405,7 +434,33 @@ ErrorCode Interpreter::runSessionWithCallBack(const Session* session, const Tens ErrorCode Interpreter::runSessionWithCallBackInfo(const Session* session, const TensorCallBackWithInfo& before, const TensorCallBackWithInfo& callBack, bool sync) const { - return session->runWithCallBack(before, callBack, sync); + + Timer timer; + ErrorCode errorcode = session->runWithCallBack(before, callBack, sync); + +#ifdef MNN_INTERNAL_ENABLED + int backendType[MNN_FORWARD_ALL]; + session->getInfo(MNN::Interpreter::BACKENDS, backendType); + + // Only log the performance of CPU backend inference. + if (backendType[0] == MNN_FORWARD_CPU) { + float costTime = (float)timer.durationInUs() / (float)1000; + std::map metrics; + metrics.emplace("Model_UUID", mNet->uuid); + metrics.emplace("Model_BizCode", mNet->bizCode); + metrics.emplace("Event", "RUN_SESSION"); + metrics.emplace("Backend", std::to_string(MNN_FORWARD_CPU)); // "Precision" is not logged here. Don't need it. + metrics.emplace("InferTimeMs", std::to_string(costTime)); + metrics.emplace("ErrorCode", std::to_string(errorcode)); + metrics.emplace("API", "Interpreter::runSessionWithCallBackInfo"); + auto basicMetrics = getBasicLoggingData(); + metrics.insert(basicMetrics.begin(), basicMetrics.end()); + logAsync(metrics); + return errorcode; + } +#endif // MNN_INTERNAL_ENABLED + + return errorcode; } const Backend* Interpreter::getBackend(const Session* session, const Tensor* tensor) const { @@ -461,8 +516,11 @@ void Interpreter::resizeTensor(Tensor* tensor, const std::vector& dims) { } const char* Interpreter::bizCode() const { - const flatbuffers::String* code = mNet->net->bizCode(); - return code ? 
code->c_str() : ""; + return mNet->bizCode.c_str(); +} + +const char* Interpreter::uuid() const { + return mNet->uuid.c_str(); } std::pair Interpreter::getModelBuffer() const { diff --git a/source/cv/ImageProcess.cpp b/source/cv/ImageProcess.cpp index eab060b9..c0796632 100644 --- a/source/cv/ImageProcess.cpp +++ b/source/cv/ImageProcess.cpp @@ -65,16 +65,6 @@ ImageProcess::ImageProcess(const Config& config) { ImageProcess* ImageProcess::create(const Config& config, const Tensor* dstTensor) { // TODO Get dstTensor' backend - #ifdef _MSC_VER - auto cpuFlags = libyuv::InitCpuFlags(); - bool support = true; - support = support && (cpuFlags & libyuv::kCpuHasSSSE3); // _mm_shuffle_epi8 - support = support && (cpuFlags & libyuv::kCpuHasSSE41); // _mm_cvtepu8_epi32 - if (!support) { - MNN_ERROR("CPU must support SSSE3 and SSE4.1 for using ImageProcess\n"); - return nullptr; - } - #endif return new ImageProcess(config); } @@ -192,12 +182,23 @@ ErrorCode ImageProcess::convert(const uint8_t* source, int iw, int ih, int strid if (0 == oc) { oc = _getBpp(mInside->config.destFormat); } - auto ins = { createImageTensor(halide_type_of(), iw, ih, ic, (void*)source) }; - auto outs = { createImageTensor(type, ow, oh, oc, dest) }; + std::unique_ptr input(createImageTensor(halide_type_of(), iw, ih, ic, (void*)source)), + output(createImageTensor(type, ow, oh, oc, dest)); + auto ins = { input.get() }; + auto outs = { output.get() }; mInside->execution->setPadVal(this->mPaddingValue); mInside->execution->onResize(ins, outs); mInside->execution->onExecute(ins, outs); return NO_ERROR; } + +void ImageProcess::draw(uint8_t* img, int w, int h, int c, const int* regions, int num, const uint8_t* color) { + std::unique_ptr imgTensor(createImageTensor(halide_type_of(), w, h, c, (void*)img)), + regionTensor(Tensor::create(std::vector{num, 3}, halide_type_of(), (void*)regions)), + colorTensor(Tensor::create(std::vector{c}, halide_type_of(), (void*)color)); + auto ins = { imgTensor.get(), regionTensor.get(), colorTensor.get() }; + mInside->execution->onResize(ins, {}); + mInside->execution->onExecute(ins, {}); +} } // namespace CV } // namespace MNN diff --git a/source/geometry/GeometryGather.cpp b/source/geometry/GeometryGather.cpp index a7d31d12..9c49ef75 100644 --- a/source/geometry/GeometryGather.cpp +++ b/source/geometry/GeometryGather.cpp @@ -146,10 +146,10 @@ public: auto size = (int*)rgcmd->size()->data(); size[0] = outside; size[2] = inside; - auto view0Stride = (int*)rgcmd->view()->GetAs(0)->stride(); + auto view0Stride = (int*)rgcmd->view()->GetAs(0)->stride()->data(); view0Stride[0] = inside * N; view0Stride[1] = inside; - auto view1Stride = (int*)rgcmd->view()->GetAs(1)->stride(); + auto view1Stride = (int*)rgcmd->view()->GetAs(1)->stride()->data(); view1Stride[0] = inside * params->length(axis); view1Stride[1] = inside; return true; diff --git a/source/geometry/GeometryOPRegister.cpp b/source/geometry/GeometryOPRegister.cpp index fcd7163b..3a6c414d 100644 --- a/source/geometry/GeometryOPRegister.cpp +++ b/source/geometry/GeometryOPRegister.cpp @@ -13,7 +13,6 @@ extern void ___GeometryBroadcastTo___create__(); extern void ___GeometryConvert___create__(); extern void ___GeometryCosineSimilarity___create__(); extern void ___GeometryImageOp___create__(); -extern void ___GeometryGather___create__(); extern void ___GeometryCrop___create__(); extern void ___GeometryStridedSlice___create__(); extern void ___GeometrySelect___create__(); @@ -53,7 +52,6 @@ ___GeometryBroadcastTo___create__(); 
___GeometryConvert___create__(); ___GeometryCosineSimilarity___create__(); ___GeometryImageOp___create__(); -___GeometryGather___create__(); ___GeometryCrop___create__(); ___GeometryStridedSlice___create__(); ___GeometrySelect___create__(); diff --git a/source/geometry/GeometrySelect.cpp b/source/geometry/GeometrySelect.cpp index 64e2f3eb..d9a0c120 100644 --- a/source/geometry/GeometrySelect.cpp +++ b/source/geometry/GeometrySelect.cpp @@ -26,7 +26,7 @@ public: if (outputSize != inputL0) { std::shared_ptr newTensor(new Tensor); TensorUtils::copyShape(output, newTensor.get(), true); - newTensor->buffer().type = output->buffer().type; + newTensor->buffer().type = input0->buffer().type; ConvertUtils::broadcastto(input0, newTensor.get()); input0 = newTensor.get(); res.extras.emplace_back(newTensor); diff --git a/source/geometry/GeometryShape.cpp b/source/geometry/GeometryShape.cpp index e37e20db..2f31becd 100644 --- a/source/geometry/GeometryShape.cpp +++ b/source/geometry/GeometryShape.cpp @@ -221,6 +221,47 @@ public: } }; +class GeometryRaster : public GeometryComputer { +public: + virtual bool onCompute(const Op* op, const std::vector& inputs, const std::vector& outputs, + Context& context, CommandBuffer& res) const override { + auto extra = op->main_as_Extra(); + if (!extra) { + return true; + } + auto output = outputs[0]; + auto outputDes = TensorUtils::getDescribe(output); + outputDes->regions.resize(inputs.size()); + outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; + for (int i = 0; i < extra->attr()->size(); i++) { + auto attr = extra->attr()->Get(i); + if (attr->key()->str() == "region") { + int len = attr->list()->i()->size(); + MNN_ASSERT(inputs.size() * 11 == len); + + for (int j = 0; j < inputs.size(); j++) { + auto& region = outputDes->regions[j]; +#define _GET(x) attr->list()->i()->Get(j * 11 + x) + region.src.offset = _GET(0); + region.src.stride[0] = _GET(1); + region.src.stride[1] = _GET(2); + region.src.stride[2] = _GET(3); + region.dst.offset = _GET(4); + region.dst.stride[0] = _GET(5); + region.dst.stride[1] = _GET(6); + region.dst.stride[2] = _GET(7); + region.size[0] = _GET(8); + region.size[1] = _GET(9); + region.size[2] = _GET(10); + region.origin = inputs[j]; +#undef _GET + } + } + } + return true; + } +}; + static void _create() { std::shared_ptr comp(new GeometryShape); GeometryComputer::registerGeometryComputer(comp, {OpType_Shape}); @@ -230,6 +271,8 @@ static void _create() { GeometryComputer::registerGeometryComputer(comp2, {OpType_PriorBox}); std::shared_ptr comp3(new GeometrySize); GeometryComputer::registerGeometryComputer(comp3, {OpType_Size}); + std::shared_ptr comp4(new GeometryRaster); + GeometryComputer::registerGeometryComputer(comp4, {OpType_Raster}); } REGISTER_GEOMETRY(GeometryShape, _create); diff --git a/source/geometry/GeometryStridedSlice.cpp b/source/geometry/GeometryStridedSlice.cpp index 15185163..532a1ec3 100644 --- a/source/geometry/GeometryStridedSlice.cpp +++ b/source/geometry/GeometryStridedSlice.cpp @@ -9,6 +9,7 @@ #include "geometry/GeometryComputer.hpp" #include "core/OpCommonUtils.hpp" #include "core/Macro.h" +#include "ConvertUtils.hpp" namespace MNN { class GeometryStridedSlice : public GeometryComputer { public: @@ -247,6 +248,31 @@ public: reg.dst.stride[1] = reg.size[2]; reg.dst.stride[2] = 1; } + if (inputs.size() == 5) { + auto write = inputs[4]; + std::vector shape(outputShape, outputShape + shapeNum); + if (write->shape() != shape) { + std::shared_ptr newTensor(new Tensor); + newTensor->buffer().type = 
write->buffer().type; + newTensor->buffer().dimensions = shapeNum; + for (int i = 0; i < shapeNum; i++) { + newTensor->setLength(i, outputShape[i]); + } + ConvertUtils::broadcastto(write, newTensor.get()); + write = newTensor.get(); + res.extras.emplace_back(newTensor); + } + for (auto& reg : outputDes->regions) { + auto tmp = reg.dst; + reg.dst = reg.src; + reg.src = tmp; + reg.origin = write; + } + Tensor::InsideDescribe::Region region; + region.size[2] = input->elementSize(); + region.origin = input; + outputDes->regions.insert(outputDes->regions.begin(), region); + } return true; } }; diff --git a/source/shape/ShapeRegister.cpp b/source/shape/ShapeRegister.cpp index ea3d8590..6473a9db 100644 --- a/source/shape/ShapeRegister.cpp +++ b/source/shape/ShapeRegister.cpp @@ -1,6 +1,7 @@ // This file is generated by Shell for ops register namespace MNN { extern void ___ShapeSizeComputer__OpType_Shape__(); +extern void ___ShapeRasterComputer__OpType_Raster__(); extern void ___PriorBoxComputer__OpType_PriorBox__(); extern void ___ShapeBroadcastTo__OpType_BroadcastTo__(); extern void ___InterpComputer__OpType_Interp__(); @@ -106,6 +107,7 @@ extern void ___DeconvolutionSizeComputer__OpType_DeconvolutionDepthwise__(); void registerShapeOps() { ___ShapeSizeComputer__OpType_Shape__(); +___ShapeRasterComputer__OpType_Raster__(); ___PriorBoxComputer__OpType_PriorBox__(); ___ShapeBroadcastTo__OpType_BroadcastTo__(); ___InterpComputer__OpType_Interp__(); diff --git a/source/shape/ShapeReshape.cpp b/source/shape/ShapeReshape.cpp index d4389d86..a5bb9631 100644 --- a/source/shape/ShapeReshape.cpp +++ b/source/shape/ShapeReshape.cpp @@ -100,9 +100,7 @@ public: int totalSizeInput = 1; for (int i = 0; i < input->buffer().dimensions; ++i) { auto l = input->length(i); - if (l != 0) { - totalSizeInput *= l; - } + totalSizeInput *= l; } int determinAxis = -1; diff --git a/source/shape/ShapeResize.cpp b/source/shape/ShapeResize.cpp index 6ac9f16c..9bfae8a7 100644 --- a/source/shape/ShapeResize.cpp +++ b/source/shape/ShapeResize.cpp @@ -39,8 +39,14 @@ class ResizeComputer : public SizeComputer { class ImageProcessComputer : public SizeComputer { virtual bool onComputeSize(const MNN::Op *op, const std::vector &inputs, const std::vector &outputs) const override { - MNN_ASSERT(1 == inputs.size()); + MNN_ASSERT(1 == inputs.size() || inputs.size() == 3); MNN_ASSERT(1 == outputs.size()); + if (inputs.size() == 3) { + auto &output = outputs[0]->buffer(); + output.dimensions = 1; + output.dim[0].extent = 1; + return true; + } // copy dims auto &input = inputs[0]->buffer(); diff --git a/source/shape/ShapeScatterNd.cpp b/source/shape/ShapeScatterNd.cpp index 68bc107c..97d58bad 100644 --- a/source/shape/ShapeScatterNd.cpp +++ b/source/shape/ShapeScatterNd.cpp @@ -15,7 +15,7 @@ namespace MNN { class ShapeScatterNd : public SizeComputer { bool onComputeSize(const MNN::Op *op, const std::vector &inputs, const std::vector &outputs) const override { - MNN_ASSERT(3 == inputs.size()); + MNN_ASSERT(3 <= inputs.size()); auto indices = inputs[0]; auto updates = inputs[1]; auto shape = inputs[2]; diff --git a/source/shape/ShapeShape.cpp b/source/shape/ShapeShape.cpp index 3ef775df..eadc18f6 100644 --- a/source/shape/ShapeShape.cpp +++ b/source/shape/ShapeShape.cpp @@ -35,4 +35,32 @@ class ShapeSizeComputer : public SizeComputer { }; REGISTER_SHAPE(ShapeSizeComputer, OpType_Shape); + +class ShapeRasterComputer : public SizeComputer { + virtual bool onComputeSize(const MNN::Op* op, const std::vector& inputs, + const std::vector& outputs) 
const override { + MNN_ASSERT(1 <= inputs.size()); + MNN_ASSERT(1 == outputs.size()); + outputs[0]->buffer().type = inputs[0]->buffer().type; + auto extra = op->main_as_Extra(); + if (!extra) { + // copy dims + TensorUtils::copyShape(inputs[0], outputs[0], true); + } else { + for (int i = 0; i < extra->attr()->size(); i++) { + auto attr = extra->attr()->Get(i); + if (attr->key()->str() == "shape") { + int len = attr->list()->i()->size(); + outputs[0]->buffer().dimensions = len; + for (int j = 0; j < len; j++) { + outputs[0]->setLength(j, attr->list()->i()->Get(j)); + } + } + } + } + return true; + } +}; + +REGISTER_SHAPE(ShapeRasterComputer, OpType_Raster); } // namespace MNN diff --git a/source/shape/ShapeStridedSlice.cpp b/source/shape/ShapeStridedSlice.cpp index 891420f1..1cdfaf6a 100644 --- a/source/shape/ShapeStridedSlice.cpp +++ b/source/shape/ShapeStridedSlice.cpp @@ -16,7 +16,14 @@ class StridedSliceComputer : public SizeComputer { public: virtual bool onComputeSize(const MNN::Op *op, const std::vector &inputs, const std::vector &outputs) const override { - MNN_ASSERT(4 == inputs.size()); + // write to input + if (inputs.size() == 5) { + TensorUtils::copyShape(inputs[0], outputs[0], true); + outputs[0]->buffer().type = inputs[0]->buffer().type; + TensorUtils::getDescribe(outputs[0])->dimensionFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat; + return true; + } + MNN_ASSERT(4 <= inputs.size()); MNN_ASSERT(1 == outputs.size()); Tensor *input = inputs[0]; diff --git a/source/shape/ShapeWhere.cpp b/source/shape/ShapeWhere.cpp index 26c00d2b..d7aa1f72 100644 --- a/source/shape/ShapeWhere.cpp +++ b/source/shape/ShapeWhere.cpp @@ -39,8 +39,10 @@ class WhereSizeComputer : public SizeComputer { // support old version return true; } - // For compability + // For zeroshape input if (nullptr == inputs[0]->host()) { + ob.dimensions = 1; + ob.dim[0].extent = 0; return true; } int count = 0; @@ -56,6 +58,9 @@ class WhereSizeComputer : public SizeComputer { if (count > 0) { ob.dim[0].extent = count; + } else { + ob.dimensions = 1; + ob.dim[0].extent = 0; } return true; } diff --git a/test.bat b/test.bat new file mode 100644 index 00000000..85b21099 --- /dev/null +++ b/test.bat @@ -0,0 +1,7 @@ +if %1 EQU x86 ( + @call "%vs_env_setup%/vcvarsamd64_x86.bat" + powershell "%~dp0test.ps1" -gpu -x86 +) else ( + @call "%vs_env_setup%/vcvars64.bat" + powershell "%~dp0test.ps1" -gpu +) \ No newline at end of file diff --git a/test.ps1 b/test.ps1 new file mode 100644 index 00000000..3799f5ed --- /dev/null +++ b/test.ps1 @@ -0,0 +1,233 @@ +# Powershell Script must be save as UTF-8 with BOM, otherwise system-wide code page will be used, causing garbled code + +# MNN-CPU-GPU +# |-- include +# |-- lib +# | |-- x64 +# | | |-- (Debug/Release x Dynamic/Static x MD/MT) +# | | +# | |-- x86 +# | |-- (Debug/Release x Dynamic/Static x MD/MT) +# | +# |-- tools (Release + Dynamic + MD) +# | |-- x64 +# | |-- x86 +# | +# |-- py_whl +# |-- py_bridge +# |-- include +# |-- wrapper +# |-- test (Release + Dynamic + MD) +# |-- x64 +# |-- x86 +# |-- lib +# |-- x64 +# | |-- (Debug/Release x Dynamic/Static x MD/MT) +# | +# |-- x86 +# |-- (Debug/Release x Dynamic/Static x MD/MT) + +Param( + [Switch]$gpu, + [Switch]$x86 +) + +$basedir = $(Split-Path -Parent $MyInvocation.MyCommand.Path) +$outdir = "$basedir/$(If ($gpu) {"MNN-CPU-GPU"} Else {"MNN-CPU"})" +$arch = "$(If ($x86) {"x86"} Else {"x64"})" +Write-Output $arch + +$test_avx512 = ((!$x86) -and $env:avx512_server -and $env:avx512_password) +if ($test_avx512) { + 
$remote_home = $(Invoke-Expression 'plink -batch -ssh $env:avx512_server -pw $env:avx512_password powershell "echo `$HOME"') + $remote_dir = "${remote_home}\cise-space\$(Split-Path -Path $(pushd .. ; pwd ; popd) -Leaf)" +} +function sync_remote() { + Invoke-Expression 'plink -batch -ssh $env:avx512_server -pw $env:avx512_password powershell "Remove-Item -Recurse $remote_dir -ErrorAction Ignore ; mkdir $remote_dir"' + Invoke-Expression 'pscp -pw $env:avx512_password -r $outdir/tools ${env:avx512_server}:${remote_dir}' + Invoke-Expression 'pscp -pw $env:avx512_password tools/script/modelTest.py ${env:avx512_server}:${remote_dir}' +} + +function run_remote([String]$cmd) { + $tmpfile = New-TemporaryFile + Set-Content -Path $tmpfile -Value "powershell `"cd ${remote_dir} ; $cmd`"" + $output = $(Invoke-Expression 'plink -batch -ssh $env:avx512_server -pw $env:avx512_password -m $tmpfile') + Remove-Item $tmpfile + return $output +} + +function log($case, $title, $blocked, $failed, $passed, $skipped) { + Write-Output "TEST_NAME_${case}: $title`nTEST_CASE_AMOUNT_${case}: {`"blocked`":$blocked,`"failed`":$failed,`"passed`":$passed,`"skipped`":$skipped}`n" +} + +function failed() { + Write-Output "TEST_NAME_EXCEPTION: Exception" + Write-Output 'TEST_CASE_AMOUNT_EXCEPTION: {"blocked":0,"failed":1,"passed":0,"skipped":0}' + exit 1 +} + +function build_lib_test() { + Invoke-Expression "./package_scripts/win/build_lib.ps1 -path $outdir $(If ($gpu) {"-backends 'opencl,vulkan'"}) $(If ($x86) {'-x86'})" + $WrongNum = $($LastExitCode -ne 0) + log "WINDOWS_LIB" "Windows core library build test" 0 $WrongNum $(1 - $WrongNum) 0 + if ($WrongNum -ne 0) { + Write-Output "### Windows core library build test failed, aborting" + failed + } +} + +function build_tool_test() { + Invoke-Expression "./package_scripts/win/build_tools.ps1 -path $outdir/tools/$arch $(If ($gpu) {"-backends 'opencl,vulkan'"}) -build_all -dynamic_link" + $WrongNum = $($LastExitCode -ne 0) + log "WINDOWS_LIB" "Windows tools build test" 0 $WrongNum $(1 - $WrongNum) 0 + if ($WrongNum -ne 0) { + Write-Output "### Windows tools build test failed, aborting" + failed + } +} + +function build_whl_test() { + $pyenvs = "py27,py37,py38,py39" + if ($x86) { + $pyenvs = "py27-win32,py37-win32,py38-win32,py39-win32" + } + Invoke-Expression "./package_scripts/win/build_whl.ps1 -version ci_test -path $outdir/py_whl -pyenvs '$pyenvs' $(If ($x86) {'-x86'})" + $WrongNum = $($LastExitCode -ne 0) + log "WINDOWS_LIB" "Windows pymnn wheel build test" 0 $WrongNum $(1 - $WrongNum) 0 + if ($WrongNum -ne 0) { + Write-Output "### Windows pymnn wheel build test failed, aborting" + failed + } +} + +function build_bridge_test() { + Invoke-Expression "./package_scripts/win/build_bridge.ps1 -version ci_test -pyc_env py27 -mnn_path $outdir -python_path $HOME/PyBridgeDeps/python -numpy_path $HOME/PyBridgeDeps/numpy -path $outdir/py_bridge -train_api $(If ($x86) {'-x86'})" + $WrongNum = $($LastExitCode -ne 0) + log "WINDOWS_LIB" "Windows pymnn bridge build test" 0 $WrongNum $(1 - $WrongNum) 0 + if ($WrongNum -ne 0) { + Write-Output "### Windows pymnn bridge build test failed, aborting" + failed + } +} + +function unit_test() { + Invoke-Expression "$outdir/tools/$arch/run_test.out.exe" + if ($LastExitCode -ne 0) { + Write-Output "### CPU backend unit test failed, aborting" + failed + } + Invoke-Expression "$outdir/tools/$arch/run_test.out.exe op 0 0 4" + if ($LastExitCode -ne 0) { + Write-Output "### CPU backend multi-thread test failed, aborting" + failed + } + if ($test_avx512) { + $RemoteExitCode = run_remote "cd tools/x64 ; ./run_test.out.exe > log.txt ; echo `$LastExitCode" + Write-Output $(run_remote "Get-Content -Path tools/x64/log.txt") + if
($RemoteExitCode -ne 0) { + Write-Output "### CPU backend (AVX512) unit test failed, aborting" + failed + } + $RemoteExitCode = run_remote "cd tools/x64 ; ./run_test.out.exe op 0 0 4 > log.txt ; echo `$LastExitCode" + Write-Output $(run_remote "Get-Content -Path tools/x64/log.txt") + if ($RemoteExitCode -ne 0) { + Write-Output "### CPU backend (AVX512) multi-thread test failed, aborting" + failed + } + } + #Invoke-Expression "$outdir/tools/$arch/run_test.out.exe op 3" + #if ($LastExitCode -ne 0) { + # echo "### OpenCL backend unit test failed, aborting" + # failed + #} +} + +function model_test() { + Push-Location $outdir/tools/$arch + python $basedir/tools/script/modelTest.py $HOME/AliNNModel 0 0.002 + if ($LastExitCode -ne 0) { + Write-Output "### CPU backend model test failed, aborting" + Pop-Location + failed + } + python $basedir/tools/script/modelTest.py $HOME/AliNNModel 0 0.002 0 1 + if ($LastExitCode -ne 0) { + Write-Output "### CPU backend static model test failed, aborting" + Pop-Location + failed + } + if ($test_avx512) { + $RemoteExitCode = run_remote "cd tools/x64 ; python ../../modelTest.py `$HOME/AliNNModel 0 0.002 > log.txt ; echo `$LastExitCode" + Write-Output $(run_remote "Get-Content -Path tools/x64/log.txt") + if ($RemoteExitCode -ne 0) { + Write-Output "### CPU backend (AVX512) model test failed, aborting" + Pop-Location + failed + } + $RemoteExitCode = run_remote "cd tools/x64 ; python ../../modelTest.py `$HOME/AliNNModel 0 0.002 0 1 > log.txt ; echo `$LastExitCode" + Write-Output $(run_remote "Get-Content -Path tools/x64/log.txt") + if ($RemoteExitCode -ne 0) { + Write-Output "### CPU backend (AVX512) static model test failed, aborting" + Pop-Location + failed + } + } + #python $basedir/tools/script/modelTest.py $HOME/AliNNModel 3 0.01 + #if ($LastExitCode -ne 0) { + # echo "### OpenCL backend model test failed, aborting" + # Pop-Location + # failed + #} + Pop-Location +} + +function pymnn_whl_test() { + $pyarch = $(If ($x86) {"win32"} Else {"amd64"}) + Push-Location pymnn/test + $local = "$(Get-Location)/aone-site-packages" + $pythonpath_backup = ${env:PYTHONPATH} + Foreach ($pyenv in @("27", "37", "38", "39")) { + Invoke-Expression "conda activate py$pyenv$(If($x86) {'-win32'})" + Remove-Item -Recurse $local -ErrorAction Ignore + pip install --target $local $outdir/py_whl/$(Get-ChildItem -Path $outdir/py_whl -Include "*$pyenv*$pyarch*" -Name) + do { + # unit_test.py needs torch, which isn't supported on 32-bit Windows or py27 + # https://pytorch.org/docs/stable/notes/windows.html#package-not-found-in-win-32-channel + if ($x86 -or ($pyenv -eq "27")) { + break; + } + ${env:PYTHONPATH} = $local + python unit_test.py + ${env:PYTHONPATH} = $pythonpath_backup + if ($LastExitCode -ne 0) { + Write-Output "### PYMNN unit test failed, aborting" + conda deactivate + Pop-Location + failed + } + } while(0); + ${env:PYTHONPATH} = "$local" + python model_test.py $HOME/AliNNModel + ${env:PYTHONPATH} = $pythonpath_backup + if ($LastExitCode -ne 0) { + Write-Output "### PYMNN model test failed, aborting" + conda deactivate + Pop-Location + failed + } + conda deactivate + } + Pop-Location +} + +build_lib_test build_tool_test build_whl_test build_bridge_test + +if ($test_avx512) { + sync_remote } unit_test model_test pymnn_whl_test diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 40f839d2..6e05feaf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -12,6 +12,9 @@ endif() add_executable(run_test.out ${Files}) target_link_libraries(run_test.out ${MNN_DEPS}) +if (WIN32) + target_compile_options(run_test.out PRIVATE /bigobj) +endif() if (MNN_SUPPORT_BF16) target_compile_options(run_test.out PRIVATE -DMNN_SUPPORT_BF16) endif() diff --git a/test/MNNTestSuite.cpp b/test/MNNTestSuite.cpp
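
The MNNTestSuite diff that follows is what gives the $LastExitCode checks above something to read: run and runAll now return the number of failed cases instead of void, and main forwards that count as the process exit status. A minimal sketch of the pattern, with stub names that are illustrative rather than MNN API:

    #include <cstdio>
    #include <string>
    #include <vector>

    // Stand-in for a test registry run; the real one lives in MNNTestSuite.cpp.
    static int runAllStub(std::vector<std::string>& wrongs) {
        wrongs.push_back("op/raster"); // pretend one case failed
        return (int)wrongs.size();
    }

    int main() {
        std::vector<std::string> wrongs;
        int failed = runAllStub(wrongs);
        for (auto& w : wrongs) printf("Error: %s\n", w.c_str());
        return failed; // a nonzero exit status is all a CI wrapper can see
    }
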
index 3544e969..fa039b52 100644 --- a/test/MNNTestSuite.cpp +++ b/test/MNNTestSuite.cpp @@ -34,9 +34,9 @@ static void printTestResult(int wrong, int right, const char* flag) { printf("{\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n", wrong, right); } -void MNNTestSuite::run(const char* key, int precision, const char* flag) { +int MNNTestSuite::run(const char* key, int precision, const char* flag) { if (key == NULL || strlen(key) == 0) - return; + return 0; auto suite = MNNTestSuite::get(); std::string prefix = key; @@ -60,9 +60,10 @@ void MNNTestSuite::run(const char* key, int precision, const char* flag) { printf("Error: %s\n", wrong.c_str()); } printTestResult(wrongs.size(), runUnit - wrongs.size(), flag); + return wrongs.size(); } -void MNNTestSuite::runAll(int precision, const char* flag) { +int MNNTestSuite::runAll(int precision, const char* flag) { auto suite = MNNTestSuite::get(); std::vector wrongs; for (int i = 0; i < suite->mTests.size(); ++i) { @@ -88,4 +89,5 @@ void MNNTestSuite::runAll(int precision, const char* flag) { printf("Error: %s\n", wrong.c_str()); } printTestResult(wrongs.size(), suite->mTests.size() - wrongs.size(), flag); + return wrongs.size(); } diff --git a/test/MNNTestSuite.h b/test/MNNTestSuite.h index 568b6dde..c67cdbfa 100644 --- a/test/MNNTestSuite.h +++ b/test/MNNTestSuite.h @@ -21,6 +21,7 @@ #include #undef min #undef max +#undef NO_ERROR #else #include #include @@ -92,13 +93,13 @@ public: * @param precision. fp32 / bf16 precision should use FP32Converter[1 - 2]. * fp16 precision should use FP32Converter[3]. */ - static void runAll(int precision, const char* flag = ""); + static int runAll(int precision, const char* flag = ""); /** * @brief run test case with runtime precision, see FP32Converter in TestUtil.h. * @param precision. fp32 / bf16 precision should use FP32Converter[1 - 2]. * fp16 precision should use FP32Converter[3]. 
*/ - static void run(const char* name, int precision, const char* flag = ""); + static int run(const char* name, int precision, const char* flag = ""); private: /** get shared instance */ diff --git a/test/core/BackendTest.cpp b/test/core/BackendTest.cpp index 603c1c31..207e451f 100644 --- a/test/core/BackendTest.cpp +++ b/test/core/BackendTest.cpp @@ -148,7 +148,7 @@ bool nhwc_2_nhwc_uint8(std::shared_ptr bn) { auto backendCopyData = checkHostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); return false; } @@ -183,7 +183,7 @@ bool NC4HW4_2_NC4HW4_IntType(std::shared_ptr bn) { auto backendCopyData = checkHostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for NCHW Mid bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); return false; } @@ -195,7 +195,7 @@ bool NC4HW4_2_NC4HW4_IntType(std::shared_ptr bn) { bn->onCopyBuffer(hostTensor.get(), deviceTensor2.get()); bn->onCopyBuffer(deviceTensor2.get(), checkHostTensor.get()); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for NHWC Mid bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); return false; } @@ -216,7 +216,7 @@ bool NCHW_NC4HW4_NCHW(std::shared_ptr bn, int batch, int width, int hei + c * height * width + y * width + x - ] = b * 100.f + c * 10.f + y * 0.1f + x * 0.001f; + ] = b / (float)batch * 100.f + c / (float)channel * 10.f + y / (float)height * 0.1f + x / (float)width * 0.001f; } } } @@ -231,8 +231,8 @@ bool NCHW_NC4HW4_NCHW(std::shared_ptr bn, int batch, int width, int hei auto backendCopyData = dstTensor->host(); auto hostData = srcTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { - MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); + if (abs(backendCopyData[i] - hostData[i]) >= 0.1f) { + MNN_PRINT("Error for bn:%d, %f -> %f, %f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); return false; } } @@ -242,8 +242,8 @@ bool NCHW_NC4HW4_NCHW(std::shared_ptr bn, int batch, int width, int hei bool NC4HW4_2_NC4HW4_float(std::shared_ptr bn) { // MNN_PRINT("\n ========= check NC4HW4_2_NC4HW4_float result ! 
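
Several BackendTest hunks around here replace exact equality with an F32_BF16_MAX_LOSS bound. The motivation is that the BF16 backend keeps only about 8 significand bits, so a float that round-trips through bf16 can legitimately differ from the original. A truncation model of that loss (an illustration of the magnitude being tolerated, not MNN's converter; the constant itself comes from TestUtil.h):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Keep only the top 16 bits of an IEEE-754 float, i.e. bf16 by truncation.
    static float throughBF16(float x) {
        uint32_t bits;
        std::memcpy(&bits, &x, sizeof bits);
        bits &= 0xFFFF0000u;
        float y;
        std::memcpy(&y, &bits, sizeof y);
        return y;
    }

    int main() {
        float v = 0.123456f;
        float r = throughBF16(v);
        printf("%f -> %f (abs err %g)\n", v, r, std::fabs(v - r)); // err on the order of 1e-4
        return 0;
    }
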
========= \n"); - std::vector nhwc_shape = {1, 224, 224, 8}; - std::vector nchw_shape = {1, 224, 8, 224}; + std::vector nhwc_shape = {1, 32, 12, 13}; + std::vector nchw_shape = {1, 12, 13, 32}; std::shared_ptr hostTensor( Tensor::create(nhwc_shape, nullptr, Tensor::CAFFE_C4)); auto elementSize = hostTensor->elementSize(); @@ -288,7 +288,7 @@ bool NC4HW4_2_NC4HW4_float(std::shared_ptr bn) { bn->onCopyBuffer(hostTensor.get(), deviceTensor2.get()); bn->onCopyBuffer(deviceTensor2.get(), checkHostTensor.get()); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for NHWC Mid bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); return false; } @@ -319,7 +319,7 @@ void NC4HW4_2_NC4HW4_uint8(std::shared_ptr bn) { auto backendCopyData = checkHostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %d -> %d\n", i, hostData[i], (int32_t)backendCopyData[i]); break; } @@ -433,7 +433,7 @@ void nchw_2_NC4HW4_float(std::shared_ptr bn) { // MNN_PRINT("NC4HW4 -> nhwc !\n"); for (int i = 0; i < elementSize; ++i) { - if (abs(backendCopyData[i] - hostData[i]) >= 0.001) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); } } @@ -464,7 +464,7 @@ void nchw_2_NC4HW4_2_nchw_float(std::shared_ptr bn) { auto backendCopyData = checkHostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] != hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); break; } @@ -510,7 +510,7 @@ bool nhwc_2_NC4HW4_2_nhwc_inttype(std::shared_ptr bn) { auto backendCopyData = NC4HW4_HostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); return false; } @@ -524,9 +524,9 @@ bool nhwc_2_NC4HW4_2_nhwc_inttype(std::shared_ptr bn) { NHWC2NCHW(temp, backendCopyData, batch, height, width, channel); bn->onCopyBuffer(deviceTensor.get(), hostTensor.get()); - // MNN_PRINT("NC4HW4 -> nhwc !\n"); + // MNN_PRINT("NC4HW4 -> nhwc !\n"); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); } } @@ -534,14 +534,53 @@ bool nhwc_2_NC4HW4_2_nhwc_inttype(std::shared_ptr bn) { free(temp); return true; } +bool nchwTonhwc(std::shared_ptr bn) { + // Test NHWC -> NC4HW4 -> NHWC + MNN_PRINT("\n ========= check nchwTonhwc result ! ========= \n"); + int batch = 2; + int channel = 12; + int width = 21; + int height = 5; + std::shared_ptr hostTensor( + Tensor::create(std::vector{batch, channel, height, width}, nullptr, Tensor::CAFFE)); + auto elementSize = hostTensor->elementSize(); + auto hostData = hostTensor->host(); + for (int i = 0; i < elementSize; ++i) { + int flagRandom = (rand() % 2 == 0); + float valueRandom = rand() % 255 / 255.f; + hostData[i] = ((flagRandom == 1) ? 
1.0 : -1.0) * valueRandom; + } + std::vector tempStorage(hostTensor->elementSize()); + float* temp = tempStorage.data(); + memset(temp, 0, hostTensor->size()); + NCHW2NHWC(hostData, temp, batch, height, width, channel); + std::shared_ptr deviceTensor_pre(Tensor::createDevice(std::vector{batch, height, width, channel})); + bn->onAcquireBuffer(deviceTensor_pre.get(), Backend::STATIC); + std::shared_ptr deviceTensor(Tensor::createDevice(std::vector{batch, height, width, channel})); + bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC); + bn->onCopyBuffer(hostTensor.get(), deviceTensor_pre.get()); + bn->onCopyBuffer(deviceTensor_pre.get(), deviceTensor.get()); + std::shared_ptr hostTensorNHWC( + Tensor::create(std::vector{batch, height, width, channel}, nullptr, Tensor::TENSORFLOW)); + bn->onCopyBuffer(deviceTensor.get(), hostTensorNHWC.get()); + auto backendCopyData = hostTensorNHWC->host(); + for (int i = 0; i < elementSize; ++i) { + if (abs(backendCopyData[i] - temp[i]) >= F32_BF16_MAX_LOSS) { //Error of converting from float32 to bf16 is more than 0.001 + MNN_PRINT("Error for bn:%d, %f -> %f. F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); + return false; + } + } + return true; +} + bool nhwc_2_NC4HW4_2_nhwc_float(std::shared_ptr bn) { // Test NHWC -> NC4HW4 -> NHWC MNN_PRINT("\n ========= check nhwc_2_NC4HW4_2_nhwc_float result ! ========= \n"); int batch = 1; int channel = 12; - int width = 20; - int height = 20; + int width = 3; + int height = 2; std::shared_ptr hostTensor( Tensor::create(std::vector{batch, channel, height, width}, nullptr, Tensor::CAFFE)); auto elementSize = hostTensor->elementSize(); @@ -556,15 +595,12 @@ bool nhwc_2_NC4HW4_2_nhwc_float(std::shared_ptr bn) { memset(temp, 0.0f, hostTensor->size()); NCHW2NHWC(hostData, temp, batch, height, width, channel); - std::shared_ptr deviceTensor_pre(Tensor::createDevice(std::vector{batch, height, width, channel})); - bn->onAcquireBuffer(deviceTensor_pre.get(), Backend::STATIC); std::shared_ptr deviceTensor(Tensor::createDevice(std::vector{batch, height, width, channel})); bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC); - bn->onCopyBuffer(hostTensor.get(), deviceTensor_pre.get()); - bn->onCopyBuffer(deviceTensor_pre.get(), deviceTensor.get()); + bn->onCopyBuffer(hostTensor.get(), deviceTensor.get()); - // // nhwc -> NC4HW4 - // MNN_PRINT("nhwc -> NC4HW4 !\n"); + // // nhwc -> NC4HW4 + // MNN_PRINT("nhwc -> NC4HW4 !\n"); MNNTensorConvertNHWCToNC4HW4(hostData, temp, height * width, channel); std::shared_ptr NC4HW4_HostTensor( @@ -573,12 +609,20 @@ bool nhwc_2_NC4HW4_2_nhwc_float(std::shared_ptr bn) { bn->onCopyBuffer(deviceTensor.get(), NC4HW4_HostTensor.get()); auto backendCopyData = NC4HW4_HostTensor->host(); + bool res = true; for (int i = 0; i < elementSize; ++i) { if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { //Error of converting from float32 to bf16 is more than 0.001 MNN_PRINT("Error for bn:%d, %f -> %f. F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); - return false; + res = false; + break; } } + if (!res) { + for (int i = 0; i < elementSize; ++i) { + MNN_PRINT("%d, %f -> %f.
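
The new nchwTonhwc case above pushes an NCHW host tensor through two device copies and reads it back as NHWC, comparing against the NCHW2NHWC helper from TestUtils. For reference, the permutation that helper is expected to perform, as a self-contained sketch (index arithmetic only; not MNN code):

    #include <cstdio>
    #include <vector>

    static void nchwToNhwc(const float* src, float* dst, int n, int c, int h, int w) {
        for (int b = 0; b < n; ++b)
            for (int y = 0; y < h; ++y)
                for (int x = 0; x < w; ++x)
                    for (int ch = 0; ch < c; ++ch)
                        dst[((b * h + y) * w + x) * c + ch] = src[((b * c + ch) * h + y) * w + x];
    }

    int main() {
        // 1x2x2x2 example: channel 0 holds {1,2,3,4}, channel 1 holds {5,6,7,8}
        std::vector<float> src = {1, 2, 3, 4, 5, 6, 7, 8}, dst(8);
        nchwToNhwc(src.data(), dst.data(), 1, 2, 2, 2);
        for (float v : dst) printf("%g ", v); // 1 5 2 6 3 7 4 8
        printf("\n");
        return 0;
    }
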
F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); + } + return false; + } // NC4HW4 -> nhwc @@ -588,10 +632,11 @@ bool nhwc_2_NC4HW4_2_nhwc_float(std::shared_ptr bn) { NHWC2NCHW(temp, backendCopyData, batch, height, width, channel); bn->onCopyBuffer(deviceTensor.get(), hostTensor.get()); - // MNN_PRINT("NC4HW4 -> nhwc !\n"); + MNN_PRINT("NC4HW4 -> nhwc !\n"); for (int i = 0; i < elementSize; ++i) { if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { - MNN_PRINT("Error for bn:%d, %f -> %f. F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); + MNN_PRINT("NC4HW4 -> nhwc Error for bn:%d, %f -> %f. F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); + return false; } } @@ -618,12 +663,20 @@ public: MNN_PRINT("Test %d Backend for %d \n", type, user.precision); std::shared_ptr bn(runtime->onCreate(&user)); auto res = NC4HW4_2_NC4HW4_float(bn); + FUNC_PRINT(res); + res = res && nchwTonhwc(bn); + FUNC_PRINT(res); res = res && nhwc_2_NC4HW4_2_nhwc_float(bn); + FUNC_PRINT(res); res = res && NCHW_NC4HW4_NCHW(bn, 3, 16, 17, 19); + FUNC_PRINT(res); res = res && NCHW_NC4HW4_NCHW(bn, 12, 16, 38, 16); + FUNC_PRINT(res); res = res && NCHW_NC4HW4_NCHW(bn, 5, 128, 8, 6); + FUNC_PRINT(res); if (!res) { MNN_ERROR("Error for %d bn\n", i); + return false; } } } diff --git a/test/core/BufferAllocatorTest.cpp b/test/core/BufferAllocatorTest.cpp index 40c8ef60..eb7d4112 100644 --- a/test/core/BufferAllocatorTest.cpp +++ b/test/core/BufferAllocatorTest.cpp @@ -11,7 +11,7 @@ #include "core/MNNMemoryUtils.h" using namespace MNN; - +#ifndef _MSC_VER class BufferAllocatorTest : public MNNTestCase { public: virtual ~BufferAllocatorTest() = default; @@ -56,3 +56,4 @@ public: } }; MNNTestSuiteRegister(BufferAllocatorTest, "core/buffer_allocator"); +#endif \ No newline at end of file diff --git a/test/expr/MatMulTest.cpp b/test/expr/MatMulTest.cpp index f38d27e4..e768051b 100644 --- a/test/expr/MatMulTest.cpp +++ b/test/expr/MatMulTest.cpp @@ -40,7 +40,7 @@ static bool checkMatMul(const float* C, const float* A, const float* B, int e, i expected += AY[k] * BX[k * e]; } auto diff = fabsf(expected - computed); - if (diff > 0.1f) { + if (diff > 0.003f * fabsf(expected)) { MNN_PRINT("%f -> %f\n", expected, computed); res = false; } diff --git a/test/expr/ZeroShapeTest.cpp b/test/expr/ZeroShapeTest.cpp index cef801e4..8f6dd9f9 100644 --- a/test/expr/ZeroShapeTest.cpp +++ b/test/expr/ZeroShapeTest.cpp @@ -19,9 +19,9 @@ public: virtual bool run(int precision) { auto input = _Input({1, 0, 4, 1}, NHWC); input->setName("input"); - auto output = _Reshape(input, {0, 0, -1}); + auto output = _Reshape(input, {1, 0, -1}); auto info = output->getInfo(); - auto rightDims = std::vector{1, 0, 4}; + auto rightDims = std::vector{1, 0, 0}; if (info->dim[0] != rightDims[0] || info->dim[1] != rightDims[1] || info->dim[2] != rightDims[2]) { return false; } diff --git a/test/main.cpp b/test/main.cpp index 79b7807a..e5d5b4b2 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -48,12 +48,12 @@ int main(int argc, char* argv[]) { if (argc > 1) { auto name = argv[1]; if (strcmp(name, "all") == 0) { - MNNTestSuite::runAll(precisionInTestUtil, flag); + return MNNTestSuite::runAll(precisionInTestUtil, flag); } else { - MNNTestSuite::run(name, precisionInTestUtil, flag); + return MNNTestSuite::run(name, precisionInTestUtil, flag); } } else { - MNNTestSuite::runAll(precisionInTestUtil, flag); + return MNNTestSuite::runAll(precisionInTestUtil, flag); } return 0; 
} diff --git a/test/op/RasterTest.cpp b/test/op/RasterTest.cpp new file mode 100644 index 00000000..c2f40e4b --- /dev/null +++ b/test/op/RasterTest.cpp @@ -0,0 +1,43 @@ +// +// RasterTest.cpp +// MNNTests +// +// Created by MNN on 2021/12/23. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include +#include +#include "MNNTestSuite.h" +#include "TestUtils.h" + +using namespace MNN::Express; +class RasterTest : public MNNTestCase { +public: + virtual ~RasterTest() = default; + virtual bool run(int precision) { + auto input = _Input({2, 2}, NCHW); + input->setName("input_tensor"); + // set input data + const float inputData[] = {1, 2, 3, 4}; + auto inputPtr = input->writeMap(); + memcpy(inputPtr, inputData, 4 * sizeof(float)); + // transpose + auto output = _Raster({input}, {0, 4, 1, 2, 0, 4, 2, 1, 1, 2, 2}, {2, 2}); + const std::vector expectedOutput = {1, 3, 2, 4}; + auto gotOutput = output->readMap(); + if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { + MNN_ERROR("RasterTest transpose test failed!\n"); + return false; + } + auto output0 = _Raster({input}, {2, 4, 2, 1, 0, 4, 2, 1, 1, 1, 2}, {2}); + const std::vector expectedOutput0 = {3, 4}; + auto gotOutput0 = output0->readMap(); + if (!checkVector(gotOutput0, expectedOutput0.data(), 2, 0.01)) { + MNN_ERROR("RasterTest slice test failed!\n"); + return false; + } + return true; + } +}; +MNNTestSuiteRegister(RasterTest, "op/raster"); diff --git a/test/op/SelectTest.cpp b/test/op/SelectTest.cpp index 170a9fe4..52780150 100644 --- a/test/op/SelectTest.cpp +++ b/test/op/SelectTest.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include "MNNTestSuite.h" #include "MNN_generated.h" @@ -45,7 +46,7 @@ void RandInit(VARP value, T lower, T upper) { void RandInitBool(VARP value) { int* pValue = value->writeMap(); for (int i = 0; i < Size(value); ++i) { - pValue[i] = (uniform_dist(rng) > 0.f); + pValue[i] = (uniform_dist(rng) > 0.5f); } } @@ -68,11 +69,13 @@ bool RunSelectAndCheckResult(VARP select, VARP input0, VARP input1) { condition = select->readMap()[i]; } if (condition) { - if (input0Ptr[i * iter0] != outputPtr[i]) { + if (fabsf(input0Ptr[i * iter0] - outputPtr[i]) >= 0.1f) { + MNN_PRINT("%d, %d - %f - %f - %f\n", i, condition, input0Ptr[i * iter0], input1Ptr[i * iter1], outputPtr[i]); return false; } } else { - if (input1Ptr[i * iter1] != outputPtr[i]) { + if (fabsf(input1Ptr[i * iter1] - outputPtr[i]) >= 0.1f) { + MNN_PRINT("%d, %d - %f - %f - %f\n", i, condition, input0Ptr[i * iter0], input1Ptr[i * iter1], outputPtr[i]); return false; } } @@ -84,11 +87,11 @@ bool SelectTester1D(int N) { auto input0 = _Input({N}, NCHW); auto input1 = _Input({N}, NCHW); { - auto select = _Input({N}, NCHW); + auto select = _Input({N}, NCHW, halide_type_of()); CHECK_OR_RETURN(RunSelectAndCheckResult(select, input0, input1)); } { - auto select = _Input({1}, NCHW); + auto select = _Input({1}, NCHW, halide_type_of()); CHECK_OR_RETURN(RunSelectAndCheckResult(select, input0, input1)); } return true; @@ -98,15 +101,15 @@ bool SelectTester4D(int N, int C, int H, int W) { auto input0 = _Input({N, C, H, W}, NCHW); auto input1 = _Input({N, C, H, W}, NCHW); { - auto select = _Input({N, C, H, W}, NCHW); + auto select = _Input({N, C, H, W}, NCHW, halide_type_of()); CHECK_OR_RETURN(RunSelectAndCheckResult(select, input0, input1)); } { - auto select = _Input({1}, NCHW); + auto select = _Input({1}, NCHW, halide_type_of()); CHECK_OR_RETURN(RunSelectAndCheckResult(select, input0, input1)); } { - auto select = _Input({N, C, H, W}, NCHW); + auto
select = _Input({N, C, H, W}, NCHW, halide_type_of()); auto input0 = _Input({1}, NCHW); CHECK_OR_RETURN(RunSelectAndCheckResult(select, input0, input1)); } diff --git a/test/op/SortTest.cpp b/test/op/SortTest.cpp new file mode 100644 index 00000000..be6c202d --- /dev/null +++ b/test/op/SortTest.cpp @@ -0,0 +1,92 @@ +// +// SortTest.cpp +// MNNTests +// +// Created by MNN on 2021/12/22. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include +#include +#include "MNNTestSuite.h" +#include "TestUtils.h" +using namespace MNN::Express; +class SortTest : public MNNTestCase { +public: + virtual ~SortTest() = default; + virtual bool run(int precision) { + auto input_nhwc = _Input({4, 4}, NHWC); + input_nhwc->setName("input_tensor_nhwc"); + // set input data + const float inputData[] = {-1.0, 2.0, -3.0, 4.0, + 5.0, -6.0, 7.0, -8.0, + -9.0, -10.0, 11.0, 12.0, + 13.0, 14.0, -15.0, -16.0}; + auto inputPtr = input_nhwc->writeMap(); + memcpy(inputPtr, inputData, 16 * sizeof(float)); + const std::vector expectedOutput_0 = {-9.0, -10.0, -15.0, -16.0, + -1.0, -6.0, -3.0, -8.0, + 5.0, 2.0, 7.0, 4.0, + 13.0, 14.0, 11.0, 12.0}; + auto output_0 = _Sort(input_nhwc, 0); + auto gotOutput_0 = output_0->readMap(); + if (!checkVector(gotOutput_0, expectedOutput_0.data(), 16, 0)) { + MNN_ERROR("SortTest test axis_0 failed!\n"); + return false; + } + const std::vector expectedOutput_1 = {-3.0, -1.0, 2.0, 4.0, + -8.0, -6.0, 5.0, 7.0, + -10.0, -9.0, 11.0, 12.0, + -16.0, -15.0, 13.0, 14.0}; + auto output_1 = _Sort(input_nhwc, 1); + auto gotOutput_1 = output_1->readMap(); + if (!checkVector(gotOutput_1, expectedOutput_1.data(), 16, 0)) { + MNN_ERROR("SortTest test axis_1 failed!\n"); + return false; + } + const std::vector expectedOutput_2 = { 2, 2, 3, 3, + 0, 1, 0, 1, + 1, 0, 1, 0, + 3, 3, 2, 2 }; + auto output_2 = _Sort(_Clone(input_nhwc, true), 0, true); + auto gotOutput_2 = output_2->readMap(); + if (!checkVector(gotOutput_2, expectedOutput_2.data(), 16, 0)) { + MNN_ERROR("ArgSortTest test axis_0 failed!\n"); + return false; + } + const std::vector expectedOutput_3 = { 2, 0, 1, 3, + 3, 1, 0, 2, + 1, 0, 2, 3, + 3, 2, 0, 1 }; + auto output_3 = _Sort(_Clone(input_nhwc, true), 1, true); + auto gotOutput_3 = output_3->readMap(); + if (!checkVector(gotOutput_3, expectedOutput_3.data(), 16, 0)) { + MNN_ERROR("ArgSortTest test axis_1 failed!\n"); + return false; + } + const std::vector expectedOutput_4 = { 3, 3, 2, 2, + 1, 0, 1, 0, + 0, 1, 0, 1, + 2, 2, 3, 3 }; + auto output_4 = _Sort(_Clone(input_nhwc, true), 0, true, true); + auto gotOutput_4 = output_4->readMap(); + if (!checkVector(gotOutput_4, expectedOutput_4.data(), 16, 0)) { + MNN_ERROR("ArgSortTest test axis_0, descend failed!\n"); + return false; + } + auto input_nchw = _Input({5}, NC4HW4); + inputPtr = input_nchw->writeMap(); + const float inputDataX[] = { 0.4, 0.2, 0.5, 0.1, 0.3 }; + memcpy(inputPtr, inputDataX, 5 * sizeof(float)); + auto output_5 = _Sort(input_nchw, 0, true); + auto gotOutput_5 = output_5->readMap(); + const std::vector expectedOutput_5 = { 3, 1, 4, 0, 2 }; + if (!checkVector(gotOutput_5, expectedOutput_5.data(), 5, 0)) { + MNN_ERROR("ArgSortTest test axis_0 failed!\n"); + return false; + } + return true; + } +}; + +MNNTestSuiteRegister(SortTest, "op/sort"); diff --git a/test/op/StridedSliceTest.cpp b/test/op/StridedSliceTest.cpp index 4ba42ddf..80b4506c 100644 --- a/test/op/StridedSliceTest.cpp +++ b/test/op/StridedSliceTest.cpp @@ -123,6 +123,26 @@ public: MNN_ERROR("stridedslice dim = 3, stride=-1 test failed!\n"); return false; }
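
The guarded hunk below (compiled only under MNN_STRIDESLICE_WRITE) exercises the five-input strided-slice variant whose shape rule was added in ShapeStridedSlice.cpp earlier in this patch: the output keeps the input's full shape, and the window selected by begin/end/stride is overwritten with the write tensor. A plain-C++ sketch of that semantics on a 3x2x3 block, with arbitrary fill values rather than the test's actual input (which is defined earlier in the file):

    #include <cstdio>

    int main() {
        const int C = 3, H = 2, W = 3;
        float t[C][H][W];
        for (int c = 0; c < C; ++c)
            for (int y = 0; y < H; ++y)
                for (int x = 0; x < W; ++x)
                    t[c][y][x] = float(c + 1); // arbitrary fill
        // write 9 into the window c:[0,2) y:[0,2) x:[0,3), i.e. begin={0,0,0}, end={2,2,3}, stride=1
        for (int c = 0; c < 2; ++c)
            for (int y = 0; y < 2; ++y)
                for (int x = 0; x < 3; ++x)
                    t[c][y][x] = 9.0f;
        for (int c = 0; c < C; ++c)
            for (int y = 0; y < H; ++y)
                for (int x = 0; x < W; ++x)
                    printf("%g ", t[c][y][x]); // twelve 9s, then six 3s
        printf("\n");
        return 0;
    }
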
+#ifdef MNN_STRIDESLICE_WRITE + // 9. write + const int begin_data9[] = {0, 0, 0, 0}; + memcpy(begin->writeMap(), begin_data9, 4 * sizeof(int)); + const int end_data9[] = {1, 2, 2, 3}; + memcpy(end->writeMap(), end_data9, 4 * sizeof(int)); + const int stride_data9[] = {1, 1, 1, 1}; + memcpy(strided->writeMap(), stride_data9, 4 * sizeof(int)); + auto write = _Input({3}, NCHW); + const float write_data[] = {9, 9, 9}; + memcpy(write->writeMap(), write_data, 3 * sizeof(float)); + auto output_9= _StridedSliceWrite(input, begin, end, strided, write, 0, 0, 0, 0, 0); + const std::vector expectedShape_9 = {1, 3, 2, 3}; + const std::vector expectedOutput_9 = {9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 5, 5, 6, 6, 6}; + if (!checkVector(output_9->getInfo()->dim.data(), expectedShape_9.data(), expectedShape_9.size(), 0) || + !checkVector(output_9->readMap(), expectedOutput_9.data(), expectedOutput_9.size(), 0.01)) { + MNN_ERROR("stridedslicewrite test failed!\n"); + return false; + } +#endif return true; } }; diff --git a/test/op/UnaryTest.cpp b/test/op/UnaryTest.cpp index 71c5f258..7c65d97e 100644 --- a/test/op/UnaryTest.cpp +++ b/test/op/UnaryTest.cpp @@ -61,7 +61,7 @@ class AbsTest : public UnaryTestCommon { public: virtual ~AbsTest() = default; virtual bool run(int precision) { - return test(_Abs, "AbsTest", 0.01, + return test(MNN::Express::_Abs, "AbsTest", 0.01, {-1.0, -2.0, 3.0, 4.0, -1.0, -2.0, 3.0, 4.0}, {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0}, {8}, {8}); } diff --git a/tools/converter/CMakeLists.txt b/tools/converter/CMakeLists.txt index 99091057..72765327 100644 --- a/tools/converter/CMakeLists.txt +++ b/tools/converter/CMakeLists.txt @@ -13,6 +13,7 @@ IF(MNN_BUILD_CONVERTER) set(Protobuf_INCLUDE_DIRS ${PROTOBUF_INCLUDE_DIRS}) endif() ENDIF() + SET(Protobuf_LIBRARIES ${Protobuf_LIBRARIES} PARENT_SCOPE) add_definitions(-DGOOGLE_PROTOBUF_NO_RTTI) include_directories(${CMAKE_CURRENT_LIST_DIR}/include/) include_directories(${CMAKE_CURRENT_LIST_DIR}/source/tflite/schema/) @@ -41,8 +42,17 @@ IF(MNN_BUILD_CONVERTER) ${CMAKE_CURRENT_LIST_DIR}/source/MNNConverter.cpp ) IF(MNN_BUILD_SHARED_LIBS) - add_library(MNNConvertDeps SHARED ${COMMON_SRC} ${MNN_CONVERTER_BACKENDS_OBJECTS} ${CMAKE_CURRENT_LIST_DIR}/../../3rd_party/flatbuffers/src/util.cpp $) - add_dependencies(MNNConvertDeps MNN) + IF(MNN_SEP_BUILD) + add_library(MNNConvertDeps SHARED ${COMMON_SRC} ${MNN_CONVERTER_BACKENDS_OBJECTS} ${CMAKE_CURRENT_LIST_DIR}/../../3rd_party/flatbuffers/src/util.cpp $) + add_dependencies(MNNConvertDeps MNN) + ELSE() + add_library(MNNConvertDeps OBJECT ${COMMON_SRC} ${CMAKE_CURRENT_LIST_DIR}/../../3rd_party/flatbuffers/src/util.cpp) + target_compile_definitions(MNNConvertDeps PRIVATE BUILDING_MNN_DLL PROTOBUF_USE_DLLS INTERFACE USING_MNN_DLL) + FOREACH(TARGET ${MNN_CONVERTER_BACKENDS_TARGETS}) + target_compile_definitions(${TARGET} PRIVATE BUILDING_MNN_DLL PROTOBUF_USE_DLLS INTERFACE USING_MNN_DLL) + ENDFOREACH() + target_sources(MNN PRIVATE $ ${MNN_CONVERTER_BACKENDS_OBJECTS}) + ENDIF() ELSE() add_library(MNNConvertDeps STATIC ${COMMON_SRC} ${MNN_CONVERTER_BACKENDS_OBJECTS} ${CMAKE_CURRENT_LIST_DIR}/../../3rd_party/flatbuffers/src/util.cpp) ENDIF() @@ -68,6 +78,12 @@ IF(MNN_BUILD_CONVERTER) ELSE() target_link_libraries(MNNConvert MNNConvertDeps) endif() + ELSEIF(NOT MNN_SEP_BUILD) + add_executable(TestConvertResult ${CMAKE_CURRENT_LIST_DIR}/source/TestConvertResult.cpp) + target_link_libraries(TestConvertResult MNN) + add_executable(TestPassManager ${CMAKE_CURRENT_LIST_DIR}/source/TestPassManager.cpp) + 
target_link_libraries(TestPassManager MNN) + target_link_libraries(MNNConvert MNN) ELSE() target_link_libraries(MNNConvertDeps PUBLIC ${MNN_DEPS} ${Protobuf_LIBRARIES}) if (MNN_BUILD_TORCH) diff --git a/tools/converter/source/onnx/IfOnnx.cpp b/tools/converter/source/onnx/IfOnnx.cpp index 7abac834..4e785538 100644 --- a/tools/converter/source/onnx/IfOnnx.cpp +++ b/tools/converter/source/onnx/IfOnnx.cpp @@ -60,9 +60,17 @@ void IfOnnx::run(MNN::OpT* dstOp, const onnx::NodeProto* onnxNode, MNN_ERROR("Op(If) and its subgraphs (then_branch, else_branch) must have same output number\n"); return; } + for (int i = 0; i < onnxNode->output_size(); ++i) { + std::unique_ptr pair(new MNN::StringVecT); + pair->data.assign({thenOutputs[i], elseOutputs[i]}); + param->aliases_outputs.emplace_back(std::move(pair)); + } auto mergeInputs = thenInputs; - std::copy_if(elseInputs.begin(), elseInputs.end(), mergeInputs.end(), - [&](std::string& n) { return std::find(thenInputs.begin(), thenInputs.end(), n) == thenInputs.end(); }); + for (const auto& name : elseInputs) { + if (std::find(thenInputs.begin(), thenInputs.end(), name) == thenInputs.end()) { + mergeInputs.push_back(name); + } + } { // cond input std::unique_ptr pair(new MNN::StringVecT); param->aliases_inputs.emplace_back(std::move(pair)); diff --git a/tools/converter/source/onnx/LoopOnnx.cpp b/tools/converter/source/onnx/LoopOnnx.cpp index aa12485c..53adeb81 100644 --- a/tools/converter/source/onnx/LoopOnnx.cpp +++ b/tools/converter/source/onnx/LoopOnnx.cpp @@ -20,6 +20,10 @@ MNN::OpParameter LoopOnnx::type() { void LoopOnnx::run(MNN::OpT* dstOp, const onnx::NodeProto* onnxNode, OnnxScope* scope) { + if (onnxNode->input(0) == "" || onnxNode->input(1) == "") { + MNN_ERROR("Failed: Loop doesn't support optional M and cond inputs\n"); + return; + } auto param = new MNN::WhileParamT; dstOp->name += "/Loop"; param->body_graph = dstOp->name + "/body"; diff --git a/tools/converter/source/onnx/onnxConverter.cpp b/tools/converter/source/onnx/onnxConverter.cpp index a0407130..c21827d9 100644 --- a/tools/converter/source/onnx/onnxConverter.cpp +++ b/tools/converter/source/onnx/onnxConverter.cpp @@ -96,10 +96,12 @@ int onnx2MNNNet(const std::string inputModel, const std::string bizCode, int inputIdx = scope->lookupTensor(onnxNode.input(k)); if (inputIdx < 0) { LOG(INFO) << "Check it out ==> " << MNNOp->name << " has empty input, the index is " << k; - continue; } MNNOp->inputIndexes.push_back(inputIdx); } + for (int k = onnxNode.input_size() - 1; k >= 0 && MNNOp->inputIndexes[k] < 0; --k) { + MNNOp->inputIndexes.pop_back(); + } for (int k = 0; k < onnxNode.output_size(); k++) { MNNOp->outputIndexes.push_back(scope->declareTensor(onnxNode.output(k))); } diff --git a/tools/converter/source/optimizer/Program.cpp b/tools/converter/source/optimizer/Program.cpp index 39c2939d..61bfa4ed 100644 --- a/tools/converter/source/optimizer/Program.cpp +++ b/tools/converter/source/optimizer/Program.cpp @@ -34,6 +34,10 @@ void Program::createUnit(std::map& varMap, std::vector& inputInd } invalidSet.insert(op); for (auto input : op->inputIndexes) { + if (input < 0) { // optional input + inputVars.emplace_back(nullptr); + continue; + } if (varMap.find(input) == varMap.end()) { for (int j = 0; j < oplists.size(); ++j) { for (auto outputIndex : oplists[j]->outputIndexes) { diff --git a/tools/converter/source/optimizer/merge/ConvBiasAdd.cpp b/tools/converter/source/optimizer/merge/ConvBiasAdd.cpp index 09d3d4c0..074938ce 100644 --- a/tools/converter/source/optimizer/merge/ConvBiasAdd.cpp
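
The onnxConverter.cpp hunk above now records every input slot, using -1 for missing optional inputs, and then pops trailing -1 entries so only interior placeholders remain (Program.cpp maps those to nullptr VARPs). The trimming loop in isolation, applied to a hypothetical index list:

    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<int> inputIndexes = {3, -1, 7, -1, -1}; // hypothetical op inputs
        // pop trailing optional slots, same loop shape as the converter's
        for (int k = (int)inputIndexes.size() - 1; k >= 0 && inputIndexes[k] < 0; --k) {
            inputIndexes.pop_back();
        }
        for (int v : inputIndexes) printf("%d ", v); // 3 -1 7
        printf("\n");
        return 0;
    }
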
+++ b/tools/converter/source/optimizer/merge/ConvBiasAdd.cpp @@ -30,7 +30,7 @@ static auto gRegister = []() { if (inputExpr->get()->type() == OpType_Reshape) { inputExpr = inputExpr->inputs()[0]->expr().first; } - if (inputExpr->get()->main_type() != OpParameter_Convolution2D || inputExpr->outputs().size() != 1) { + if (!inputExpr->get() || inputExpr->get()->main_type() != OpParameter_Convolution2D || inputExpr->outputs().size() != 1) { return false; } if (inputExpr->inputs().size() > 1) { diff --git a/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp b/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp index 4afca6f8..799fdf11 100644 --- a/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp +++ b/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp @@ -31,6 +31,15 @@ static VARP _ReshapeF(VARP x, VARP shape, MNN::MNN_DATA_FORMAT format) { reshape->main.AsReshape()->dimType = format; return (Variable::create(Expr::create(reshape.get(), {x, shape}))); } +static VARP _ConvertF(VARP input, MNN::MNN_DATA_FORMAT format) { + std::unique_ptr convert(new OpT); + convert->type = OpType_ConvertTensor; + convert->main.type = OpParameter_TensorConvertInfo; + convert->main.value = new TensorConvertInfoT; + convert->main.AsTensorConvertInfo()->source = MNN_DATA_FORMAT_NC4HW4; + convert->main.AsTensorConvertInfo()->dest = format; + return (Variable::create(Expr::create(convert.get(), {input}))); +} static bool checkInputInfo(const std::string& exprName, const Variable::Info* info, const modelConfig* config) { if (nullptr == info) { if (config->optimizeLevel < 1) { @@ -200,7 +209,6 @@ ConvertMatMulToConv2D::ConvertMatMulToConv2D() { dense->common->outputCount = num_output; std::unique_ptr dense_op(new OpT); - dense_op->name = expr->name(); dense_op->type = OpType_Convolution; dense_op->main.type = OpParameter_Convolution2D; dense_op->main.value = dense.release(); @@ -227,7 +235,10 @@ ConvertMatMulToConv2D::ConvertMatMulToConv2D() { } EXPRP dense_expr = Expr::create(dense_op.get(), {input}, 1); VARP output = Variable::create(dense_expr); + //MNN_PRINT("%d\n", output->getInfo()->order); + output = _ConvertF(output, format); VARP reshapeVar = _ReshapeF(output, _Concat({inputRemain, inputE, outputH}, 0), format); + reshapeVar->setName(expr->outputName(0)); Expr::replace(expr, reshapeVar->expr().first); return true /*modified*/; diff --git a/tools/converter/source/optimizer/merge/MergeHelpers.cpp b/tools/converter/source/optimizer/merge/MergeHelpers.cpp index 0a8a3b59..5f63666c 100644 --- a/tools/converter/source/optimizer/merge/MergeHelpers.cpp +++ b/tools/converter/source/optimizer/merge/MergeHelpers.cpp @@ -116,6 +116,9 @@ std::vector OutputVars(EXPRP expr) { continue; } for (VARP output : child->inputs()) { + if (output.get() == nullptr) { + continue; + } int output_index = 0; EXPRP parent; std::tie(parent, output_index) = output->expr(); diff --git a/tools/converter/source/optimizer/merge/TensorConverterMerge.cpp b/tools/converter/source/optimizer/merge/TensorConverterMerge.cpp index b7c8abb8..912d5fe8 100644 --- a/tools/converter/source/optimizer/merge/TensorConverterMerge.cpp +++ b/tools/converter/source/optimizer/merge/TensorConverterMerge.cpp @@ -174,7 +174,7 @@ static auto gRegister = []() { } auto inputs = expr->inputs(); for (auto input : inputs) { - if (input->expr().first->get() == nullptr) { + if (input.get() == nullptr || input->expr().first->get() == nullptr) { continue; } auto subOp = input->expr().first->get(); diff --git 
a/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp b/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp index d7258af0..b77597b0 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp @@ -20,7 +20,6 @@ public: auto extraParam = op->main_as_Extra(); float maxValue = std::numeric_limits().max(); float minValue = -std::numeric_limits().max(); - bool setReady = false; if (nullptr != extraParam->attr()) { const int attrSize = extraParam->attr()->size(); for (int i = 0; i < attrSize; ++i) { @@ -28,38 +27,37 @@ public: const auto& key = attr->key()->str(); if (key == "max") { maxValue = attr->f(); - setReady = true; } else if (key == "min") { minValue = attr->f(); - setReady = true; } } } - bool known_min_max = true; - if (inputs.size() == 2 && (!setReady)) { + bool unknown_min_max = false; + if (inputs.size() == 2 || (inputs.size() == 3 && inputs[1].get() != nullptr)) { auto minPtr = inputs[1]->readMap(); if (nullptr != minPtr) { minValue = minPtr[0]; } else { - known_min_max = false; + unknown_min_max = true; } } - if (inputs.size() >= 3 && (!setReady)) { - auto minPtr = inputs[1]->readMap(); - if (nullptr != minPtr) { - minValue = minPtr[0]; - } else { - known_min_max = false; - } + if (inputs.size() == 3 && !unknown_min_max) { auto maxPtr = inputs[2]->readMap(); if (nullptr != maxPtr) { maxValue = maxPtr[0]; } else { - known_min_max = false; + unknown_min_max = true; } } - if (!known_min_max) { - auto res = _Minimum(_Maximum(inputs[0], inputs[1]), inputs[2]); + if (unknown_min_max) { + auto minVar = _Scalar(minValue), maxVar = _Scalar(maxValue); + if (inputs.size() >= 2 && inputs[1].get() != nullptr) { + minVar = inputs[1]; + } + if (inputs.size() >= 3) { + maxVar = inputs[2]; + } + auto res = _Minimum(_Maximum(inputs[0], minVar), maxVar); auto newExpr = res->expr().first; newExpr->setName(expr->name()); return newExpr; diff --git a/tools/converter/source/optimizer/onnxextra/OnnxLSTMMerge.cpp b/tools/converter/source/optimizer/onnxextra/OnnxLSTMMerge.cpp index 9c46012b..00b6b656 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxLSTMMerge.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxLSTMMerge.cpp @@ -16,7 +16,18 @@ class OnnxLSTMTransform : public OnnxExtraManager::Transform { public: virtual EXPRP onExecute(EXPRP expr) const override { auto inputs = expr->inputs(); - MNN_ASSERT(inputs.size() >= 4); + if (inputs.size() == 8) { + MNN_ERROR("MNN LSTM doesn't support the 8th input (peepholes)\n"); + return nullptr; + } + if (inputs.size() >= 5 && inputs[4].get() != nullptr) { + MNN_ERROR("MNN LSTM doesn't support sequence_lens; every batch must use the full seq_length\n"); + return nullptr; + } + if (inputs.size() < 4 || inputs[3].get() == nullptr) { + MNN_ERROR("MNN LSTM doesn't support an optional 4th input (B must be provided)\n"); + return nullptr; + } std::unique_ptr lstm(new OpT); lstm->name = expr->name(); if (expr->get()->main_as_Extra()->type()->str() == "RNN") { @@ -41,6 +52,9 @@ public: // onnx docs guarantee bias shape is [num_direction, 8 * hidden_size], we split it to 2x [num_dicection, 4 * hidden_size] (W/R), then add together auto biasWR = _Split(inputs[3], {2}, 1); inputs[3] = _Add(biasWR[0], biasWR[1]); + if (inputs.size() >= 5) { + inputs.erase(inputs.begin() + 4); // ignore sequence_lens + } // Y, Y_h, Y_c auto originLSTM = Expr::create(lstm.get(), inputs, (lstm->type == OpType_RNN ?
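
When min or max arrive as non-constant graph inputs, the reworked OnnxClip above can no longer fold the bounds at convert time, so it lowers Clip to _Minimum(_Maximum(x, min), max) built from whatever bound variables are available. The scalar equivalent of that lowered expression:

    #include <algorithm>
    #include <cstdio>

    static float clip(float x, float lo, float hi) {
        return std::min(std::max(x, lo), hi); // max-then-min, as in the lowered graph
    }

    int main() {
        printf("%g %g %g\n", clip(-2.f, -1.f, 1.f), clip(0.5f, -1.f, 1.f), clip(3.f, -1.f, 1.f)); // -1 0.5 1
        return 0;
    }
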
2 : 3)); originLSTM->setName(expr->name()); diff --git a/tools/converter/source/optimizer/onnxextra/OnnxNonMaxSuppression.cpp b/tools/converter/source/optimizer/onnxextra/OnnxNonMaxSuppression.cpp index 344198f8..91ce28d1 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxNonMaxSuppression.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxNonMaxSuppression.cpp @@ -27,10 +27,21 @@ public: // onnx scores is 3D [num_batches, num_classes, boxes_num] with num_batches = 1, // while tf scores is 1D [boxes_num]. auto inputs = expr->inputs(); - // 3th input is max_output_boxes_per_class(default is 0), making output shape is (0, 3) which MNN isn't support - MNN_ASSERT(inputs.size() >= 3); + // optional input 3/4/5th + if (inputs.size() < 3 || inputs[2].get() == nullptr) { + MNN_ERROR("NonMaxSuppression's max_output_boxes_per_class must be provided (can't be optional)\n"); + return nullptr; + } + auto zero = _Scalar(0); + for (int i = 3; i < inputs.size(); ++i) { + if (inputs[i].get() == nullptr) { + inputs[i] = zero; + } + } + auto input0Info = inputs[0]->getInfo(); auto input1Info = inputs[1]->getInfo(); + if (nullptr == input0Info || nullptr == input1Info) { MNN_ERROR("Shape of NonMaxSupression's input is unknown. Please confirm version of MNN engine is new enough and use V3 Module API to run it correctly\n"); std::unique_ptr nms(new OpT); diff --git a/tools/converter/source/optimizer/onnxextra/OnnxSequenceGRUMerge.cpp b/tools/converter/source/optimizer/onnxextra/OnnxSequenceGRUMerge.cpp index 3e7fc7c7..8402012e 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxSequenceGRUMerge.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxSequenceGRUMerge.cpp @@ -23,7 +23,10 @@ class OnnxSequenceGRUTransform : public OnnxExtraManager::Transform { public: virtual EXPRP onExecute(EXPRP expr) const override { auto inputs = expr->inputs(); - MNN_ASSERT(inputs.size() >= 4); // X W R B + if (inputs.size() < 4 || inputs[3].get() == nullptr) { // X W R B + MNN_ERROR("MNN GRU doesn't support an optional 4th input (B)\n"); + return nullptr; + } auto rnnGRUParam = new MNN::RNNParamT; std::unique_ptr gru(new OpT); gru->name = expr->name(); @@ -108,8 +111,12 @@ public: } // auto sequence_lens = inputs[4]; sequence_lens is ommitted at onnxConverter.cpp - if (inputs.size() > 4) { // initial_h exist, shape is [num_directions, batch_size, hidden_size] - gruInput.push_back(inputs[4]); + if (inputs.size() > 4 && inputs[4].get() != nullptr) { + MNN_ERROR("MNN GRU doesn't support sequence_lens input; every batch must use the full seq_length\n"); + return nullptr; + } + if (inputs.size() > 5) { // initial_h exist, shape is [num_directions, batch_size, hidden_size] + gruInput.push_back(inputs[5]); } auto gruExpr = Expr::create(gru.get(), gruInput, expr->outputSize()); diff --git a/tools/converter/source/optimizer/passes/Pass.hpp b/tools/converter/source/optimizer/passes/Pass.hpp index a36a1433..8349fbdd 100644 --- a/tools/converter/source/optimizer/passes/Pass.hpp +++ b/tools/converter/source/optimizer/passes/Pass.hpp @@ -61,6 +61,7 @@ public: PassManager() = delete; PassManager(PassContext *context) : context_(context) {} PassManager(const PassManager& other); + PassManager& operator=(const PassManager&) = delete; virtual ~PassManager() = default; diff --git a/tools/converter/source/optimizer/passes/PassRegistry.cpp b/tools/converter/source/optimizer/passes/PassRegistry.cpp index 6c3fbe0d..62f6b863 100644 @@ -8,7
+8,6 @@ #include #include -#include #include "MNN/MNNDefine.h" #include "converter/source/optimizer/passes/PassRegistry.hpp" @@ -29,10 +28,7 @@ static std::vector>* AllRegisteredPassManagers() { return &g_registered_pass_managers; } -static std::mutex g_mutex; - /*static*/ PassManager* PassManagerRegistry::GetPassManager(int index) { - std::lock_guard lock(g_mutex); auto* g_registered_pass_managers = AllRegisteredPassManagers(); MNN_CHECK(index < g_registered_pass_managers->size(), "The pass manager index is out of bounds."); @@ -40,7 +36,6 @@ static std::mutex g_mutex; } /*static*/ std::vector PassManagerRegistry::GetAllPassManagers() { - std::lock_guard lock(g_mutex); std::vector pass_managers; for (auto& pm : *(AllRegisteredPassManagers())) { pass_managers.push_back(pm.get()); @@ -49,19 +44,16 @@ static std::mutex g_mutex; } /*static*/ void PassManagerRegistry::AddPassManager(const PassManager& pm) { - std::lock_guard lock(g_mutex); auto* g_registered_pass_managers = AllRegisteredPassManagers(); g_registered_pass_managers->emplace_back(new PassManager(pm)); } /*static*/ void PassRegistry::AddPass(std::unique_ptr&& pass) { - std::lock_guard lock(g_mutex); auto* g_registered_passes = AllRegisteredPasses(); g_registered_passes->emplace(pass->name(), std::move(pass)); } /*static*/ Pass* PassRegistry::GetPass(const std::string& pass_name) { - std::lock_guard lock(g_mutex); auto* g_registered_passes = AllRegisteredPasses(); const auto& it = g_registered_passes->find(pass_name); if (it != g_registered_passes->end()) { diff --git a/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp b/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp index f5fc458a..684abec9 100644 --- a/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp +++ b/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp @@ -426,6 +426,9 @@ public: auto currentName = op->name; for (int i = 0; i < op->inputIndexes.size(); ++i) { auto inputIndex = op->inputIndexes[i]; + if (inputIndex < 0) { + continue; // optional input, ignore it + } auto type = tensorFormats[inputIndex]; auto requireType = _getRequireFormat(formatType, i, tensorFormats[op->outputIndexes[0]], originTensorType); if (type == requireType) { diff --git a/tools/converter/source/optimizer/postconvert/ReIndexTensor.cpp b/tools/converter/source/optimizer/postconvert/ReIndexTensor.cpp index 9ed887a7..72f51710 100644 --- a/tools/converter/source/optimizer/postconvert/ReIndexTensor.cpp +++ b/tools/converter/source/optimizer/postconvert/ReIndexTensor.cpp @@ -21,6 +21,9 @@ public: std::vector tensorValid(mNet->tensorName.size(), false); for (auto& op : mNet->oplists) { for (auto index : op->inputIndexes) { + if (index < 0) { + continue; // optional input, ignore it + } tensorValid[index] = true; } for (auto index : op->outputIndexes) { @@ -38,6 +41,9 @@ public: // Re index for (auto& op : mNet->oplists) { for (int i = 0; i < op->inputIndexes.size(); ++i) { + if (op->inputIndexes[i] < 0) { + continue; + } auto iter = usefulTensorIndexMap.find(op->inputIndexes[i]); DCHECK(iter != usefulTensorIndexMap.end()) << "ERROR"; op->inputIndexes[i] = iter->second; diff --git a/tools/cpp/MNNV2Basic.cpp b/tools/cpp/MNNV2Basic.cpp index b5f0c531..3c149c50 100644 --- a/tools/cpp/MNNV2Basic.cpp +++ b/tools/cpp/MNNV2Basic.cpp @@ -397,6 +397,8 @@ static int test_main(int argc, const char* argv[]) { auto outputFile = pwd + "output.txt"; if (outputTensor->size() > 0) { dumpTensor2File(&expectTensor, 
outputFile.c_str(), orderFileOs); + } else { + MNN_ERROR("output size is 0, can't save\n"); + } } auto allOutputs = net->getSessionOutputAll(session); diff --git a/tools/cpp/backendTest.cpp b/tools/cpp/backendTest.cpp index bf022944..0a3b8ce0 100644 --- a/tools/cpp/backendTest.cpp +++ b/tools/cpp/backendTest.cpp @@ -70,7 +70,8 @@ static void compareForwadType(Interpreter* net, MNNForwardType expectType, MNNFo if (tensor->buffer().device == 0 && tensor->buffer().host == nullptr) { return true; } - std::shared_ptr copyTensor(MNN::Tensor::createHostTensorFromDevice(tensor, true)); + std::shared_ptr copyTensor(new MNN::Tensor(tensor, tensor->getDimensionType())); + tensor->copyToHostTensor(copyTensor.get()); correctResult.emplace_back(copyTensor); } return true; @@ -90,7 +91,8 @@ static void compareForwadType(Interpreter* net, MNNForwardType expectType, MNNFo if (tensor->buffer().device == 0 && tensor->buffer().host == nullptr) { return true; } - std::shared_ptr copyTensor(MNN::Tensor::createHostTensorFromDevice(tensor, true)); + std::shared_ptr copyTensor(new MNN::Tensor(tensor, tensor->getDimensionType())); + tensor->copyToHostTensor(copyTensor.get()); auto expectTensor = correctResult[index++]; auto correct = TensorUtils::compareTensors(copyTensor.get(), expectTensor.get(), tolerance, true); if (!correct) { diff --git a/tools/cpp/testModelWithDescrisbe.cpp b/tools/cpp/testModelWithDescrisbe.cpp index 5d25ed7d..e5b0149a 100644 --- a/tools/cpp/testModelWithDescrisbe.cpp +++ b/tools/cpp/testModelWithDescrisbe.cpp @@ -116,7 +116,13 @@ bool compareVar(VARP var, std::string name) { auto diffAbsMax = _ReduceMax(diff); auto absMaxV = absMax->readMap()[0]; auto diffAbsMaxV = diffAbsMax->readMap()[0]; - if (absMaxV * 0.01f < diffAbsMaxV || std::isnan(absMaxV)) { + // The implementation of isnan in VS2017 doesn't accept integer types, so cast all types to double +#ifdef _MSC_VER +#define ALI_ISNAN(x) std::isnan(static_cast<double>(x)) +#else +#define ALI_ISNAN(x) std::isnan(x) +#endif + if (absMaxV * 0.01f < diffAbsMaxV || ALI_ISNAN(absMaxV)) { std::cout << "TESTERROR " << name << " value error : absMaxV:" << absMaxV << " - DiffMax:" << diffAbsMaxV << std::endl; return false; } diff --git a/tools/cv/CMakeLists.txt b/tools/cv/CMakeLists.txt index a067b928..2275cdc1 100644 --- a/tools/cv/CMakeLists.txt +++ b/tools/cv/CMakeLists.txt @@ -16,11 +16,15 @@ IF(MNN_BUILD_OPENCV) file(GLOB_RECURSE IMGCODECS_SRC ${CMAKE_CURRENT_LIST_DIR}/source/imgcodecs/*.cpp ${CMAKE_CURRENT_LIST_DIR}/include/cv/imgcodecs/*.hpp) endif() - IF(MNN_BUILD_SHARED_LIBS) - add_library(MNNOpenCV SHARED ${IMGPROC_SRC} ${IMGCODECS_SRC}) - target_link_libraries(MNNOpenCV MNN MNN_Express) + IF(MNN_SEP_BUILD) + IF(MNN_BUILD_SHARED_LIBS) + add_library(MNNOpenCV SHARED ${IMGPROC_SRC} ${IMGCODECS_SRC}) + target_link_libraries(MNNOpenCV MNN MNN_Express) + ELSE() + add_library(MNNOpenCV STATIC ${IMGPROC_SRC} ${IMGCODECS_SRC}) + ENDIF() ELSE() - add_library(MNNOpenCV STATIC ${IMGPROC_SRC} ${IMGCODECS_SRC}) + add_library(MNNOpenCV OBJECT ${IMGPROC_SRC} ${IMGCODECS_SRC}) ENDIF() IF(CMAKE_SYSTEM_NAME MATCHES "^Android" AND NOT MNN_BUILD_FOR_ANDROID_COMMAND) IF(NOT NATIVE_INCLUDE_OUTPUT) diff --git a/tools/cv/include/cv/imgproc/draw.hpp b/tools/cv/include/cv/imgproc/draw.hpp index 9e28cf22..d5333482 100644 --- a/tools/cv/include/cv/imgproc/draw.hpp +++ b/tools/cv/include/cv/imgproc/draw.hpp @@ -26,7 +26,8 @@ enum LineTypes { MNN_PUBLIC void arrowedLine(VARP& img, Point pt1, Point pt2, const Scalar& color, int thickness=1, int line_type=8, int shift=0, double
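
compareVar above accepts a result when the largest absolute difference stays within 1% of the largest absolute reference value, and rejects NaN references outright; the double cast mirrors the ALI_ISNAN workaround for VS2017 described in the comment. A tiny standalone version of that acceptance rule (the function name is illustrative):

    #include <cmath>
    #include <cstdio>

    static bool valuesMatch(float absMax, float diffAbsMax) {
        if (std::isnan(static_cast<double>(absMax))) return false; // NaN reference always fails
        return absMax * 0.01f >= diffAbsMax;                       // diff must stay within 1%
    }

    int main() {
        printf("%d %d\n", valuesMatch(10.f, 0.05f), valuesMatch(10.f, 0.5f)); // 1 0
        return 0;
    }
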
tipLength=0.1); -MNN_PUBLIC void circle(); +MNN_PUBLIC void circle(VARP& img, Point center, int radius, const Scalar& color, + int thickness=1, int line_type=8, int shift=0); MNN_PUBLIC void line(VARP& img, Point pt1, Point pt2, const Scalar& color, int thickness = 1, int lineType = LINE_8, int shift = 0); @@ -34,6 +35,11 @@ MNN_PUBLIC void line(VARP& img, Point pt1, Point pt2, const Scalar& color, MNN_PUBLIC void rectangle(VARP& img, Point pt1, Point pt2, const Scalar& color, int thickness = 1, int lineType = LINE_8, int shift = 0); +MNN_PUBLIC void drawContours(VARP& img, std::vector> _contours, int contourIdx, const Scalar& color, + int thickness = 1, int lineType = LINE_8); + +MNN_PUBLIC void fillPoly(VARP& img, std::vector> pts, const Scalar& color, + int line_type = LINE_8, int shift = 0, Point offset = {0, 0}); } // CV } // MNN #endif // DRAW_HPP diff --git a/tools/cv/include/cv/imgproc/geometric.hpp b/tools/cv/include/cv/imgproc/geometric.hpp index 52d6135c..26840124 100644 --- a/tools/cv/include/cv/imgproc/geometric.hpp +++ b/tools/cv/include/cv/imgproc/geometric.hpp @@ -56,11 +56,12 @@ MNN_PUBLIC Matrix getRotationMatrix2D(Point center, double angle, double scale); MNN_PUBLIC Matrix invertAffineTransform(Matrix M); MNN_PUBLIC VARP resize(VARP src, Size dsize, double fx = 0, double fy = 0, - int interpolation = INTER_LINEAR); + int interpolation = INTER_LINEAR, int code = -1, + std::vector mean = {}, std::vector norm = {}); MNN_PUBLIC VARP warpAffine(VARP src, Matrix M, Size dsize, - int flags = INTER_LINEAR, int borderMode = BORDER_CONSTANT, - int borderValue = 0); + int flags = INTER_LINEAR, int borderMode = BORDER_CONSTANT, int borderValue = 0, + int code = -1, std::vector mean = {}, std::vector norm = {}); MNN_PUBLIC VARP warpPerspective(VARP src, Matrix M, Size dsize, int flags = INTER_LINEAR, int borderMode = BORDER_CONSTANT, diff --git a/tools/cv/include/cv/imgproc/structural.hpp b/tools/cv/include/cv/imgproc/structural.hpp index 482717e1..faf4ff7a 100644 --- a/tools/cv/include/cv/imgproc/structural.hpp +++ b/tools/cv/include/cv/imgproc/structural.hpp @@ -44,13 +44,13 @@ public: }; typedef std::vector POINTS; -MNN_PUBLIC std::vector findContours(VARP image, int mode, int method, Point offset = {0, 0}); -MNN_PUBLIC double contourArea(POINTS _contour, bool oriented = false); -MNN_PUBLIC std::vector convexHull(POINTS _points, bool clockwise = false, bool returnPoints = true); -MNN_PUBLIC RotatedRect minAreaRect(POINTS _points); -MNN_PUBLIC Rect2i boundingRect(POINTS points); +MNN_PUBLIC std::vector findContours(VARP image, int mode, int method, Point offset = {0, 0}); +MNN_PUBLIC double contourArea(VARP _contour, bool oriented = false); +MNN_PUBLIC std::vector convexHull(VARP _points, bool clockwise = false, bool returnPoints = true); +MNN_PUBLIC RotatedRect minAreaRect(VARP _points); +MNN_PUBLIC Rect2i boundingRect(VARP points); MNN_PUBLIC int connectedComponentsWithStats(VARP image, VARP& labels, VARP& statsv, VARP& centroids, int connectivity = 8); -MNN_PUBLIC POINTS boxPoints(RotatedRect box); +MNN_PUBLIC VARP boxPoints(RotatedRect box); } // CV } // MNN #endif // STRUCTURAL_HPP diff --git a/tools/cv/include/cv/types.hpp b/tools/cv/include/cv/types.hpp index ee585f25..9cc4aee7 100644 --- a/tools/cv/include/cv/types.hpp +++ b/tools/cv/include/cv/types.hpp @@ -18,7 +18,7 @@ using namespace Express; #define MNN_PI 3.1415926535897932384626433832795 -typedef char schar; +typedef signed char schar; typedef unsigned char uchar; // Size Start @@ -194,6 +194,7 @@ public: Point_& 
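
The types.hpp hunk below replaces Scalar_'s named r, g, b, a members with a val[4] array. The draw routines copy the first `channel` bytes of a Scalar straight into the image buffer (see the memcpy in the old bresenham in draw.cpp further down), which relies on the channel values being one contiguous array rather than separate members. A sketch of that layout assumption with a hypothetical uint8_t scalar:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    struct ScalarU8 { uint8_t val[4]; }; // contiguous channels, like the new Scalar_

    int main() {
        ScalarU8 color = {{255, 128, 0, 255}};
        uint8_t pixel[3] = {0, 0, 0};
        std::memcpy(pixel, color.val, 3); // copy RGB, skip alpha, as draw code does per pixel
        printf("%d %d %d\n", pixel[0], pixel[1], pixel[2]); // 255 128 0
        return 0;
    }
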
operator = (const Point_& pt); Point_& operator = (Point_&& pt); + template operator Point_<_Tp2>() const; _Tp x; //!< x coordinate of the point _Tp y; //!< y coordinate of the point @@ -237,6 +238,32 @@ Point_<_Tp>& Point_<_Tp>::operator = (Point_&& pt) x = std::move(pt.x); y = std::move(pt.y); return *this; } + +template template inline +Point_<_Tp>::operator Point_<_Tp2>() const +{ + return Point_<_Tp2>(static_cast<_Tp2>(x), static_cast<_Tp2>(y)); +} + +template static inline +Point_<_Tp>& operator += (Point_<_Tp>& a, const Point_<_Tp>& b) +{ + a.x += b.x; + a.y += b.y; + return a; +} + +template static inline +Point_<_Tp> operator - (const Point_<_Tp>& a, const Point_<_Tp>& b) +{ + return Point_<_Tp>( static_cast<_Tp>(a.x - b.x), static_cast<_Tp>(a.y - b.y) ); +} + +template static inline +bool operator != (const Point_<_Tp>& a, const Point_<_Tp>& b) +{ + return a.x != b.x || a.y != b.y; +} // Point End // Rect Start template class Rect_ @@ -361,11 +388,21 @@ template class Scalar_ { public: //! default constructor Scalar_(); - Scalar_(_Tp _r, _Tp _g, _Tp _b) : r(_r), g(_g), b(_b), a(255) {}; - Scalar_(_Tp _r, _Tp _g, _Tp _b, _Tp _a) : r(_r), g(_g), b(_b), a(_a) {}; - _Tp r, g, b, a; + Scalar_(_Tp _r, _Tp _g, _Tp _b) { + val[0] = _r; + val[1] = _g; + val[2] = _b; + val[3] = 255; + }; + Scalar_(_Tp _r, _Tp _g, _Tp _b, _Tp _a) { + val[0] = _r; + val[1] = _g; + val[2] = _b; + val[3] = _a; + }; + _Tp val[4]; }; -typedef Scalar_ Scalar; +typedef Scalar_ Scalar; // Scalar End static void getVARPSize(VARP var, int* height, int* width, int* channel) { @@ -406,6 +443,9 @@ static int getVARPChannel(VARP var) { getVARPSize(var, &h, &w, &c); return c; } +static int getVARPByte(VARP var) { + return var->getInfo()->type.bytes(); +} } // CV } // MNN #endif // TYPES_HPP diff --git a/tools/cv/source/imgcodecs/imgcodecs.cpp b/tools/cv/source/imgcodecs/imgcodecs.cpp index 03b23f71..80f50316 100644 --- a/tools/cv/source/imgcodecs/imgcodecs.cpp +++ b/tools/cv/source/imgcodecs/imgcodecs.cpp @@ -17,6 +17,8 @@ #define STBI_ONLY_JPEG #define STBI_ONLY_PNG #define STBI_ONLY_BMP +#define STB_IMAGE_STATIC + #include "stb_image.h" #define STB_IMAGE_WRITE_IMPLEMENTATION #include "stb_image_write.h" @@ -117,9 +119,12 @@ VARP imread(const std::string& filename, int flags) { } bool imwrite(const std::string& filename, VARP img, const std::vector& params) { - VARP rgb = cvtColor(img, COLOR_BGR2RGB); int height, width, channel; - getVARPSize(rgb, &height, &width, &channel); + getVARPSize(img, &height, &width, &channel); + if (channel == 3) { + img = cvtColor(img, COLOR_BGR2RGB); + } + auto ext = getExt(filename); if (ext == "jpg" || ext == "jpeg") { int quality = 95; @@ -129,13 +134,13 @@ bool imwrite(const std::string& filename, VARP img, const std::vector& para break; } } - return stbi_write_jpg(filename.c_str(), width, height, channel, rgb->readMap(), quality); + return stbi_write_jpg(filename.c_str(), width, height, channel, img->readMap(), quality); } if (ext == ".png") { - return stbi_write_png(filename.c_str(), width, height, channel, rgb->readMap(), 0); + return stbi_write_png(filename.c_str(), width, height, channel, img->readMap(), 0); } if (ext == ".bmp") { - return stbi_write_bmp(filename.c_str(), width, height, channel, rgb->readMap()); + return stbi_write_bmp(filename.c_str(), width, height, channel, img->readMap()); } return false; } diff --git a/tools/cv/source/imgproc/color.cpp b/tools/cv/source/imgproc/color.cpp index 2419cf77..3c64dd61 100644 --- a/tools/cv/source/imgproc/color.cpp +++ 
b/tools/cv/source/imgproc/color.cpp @@ -13,7 +13,7 @@ namespace MNN { namespace CV { -static std::pair getSrcDstFormat(int code) { +std::pair getSrcDstFormat(int code) { switch (code) { #define CONVERT_SUFFIX(src, dst, suffix) \ case COLOR_##src##2##dst##_##suffix: \ @@ -75,7 +75,8 @@ static std::pair getSrcDstFormat(int code) { } return {CV::RGB, CV::RGB}; } -static int format2Channel(CV::ImageFormat format) { + +int format2Channel(CV::ImageFormat format) { switch (format) { case CV::RGB: case CV::BGR: diff --git a/tools/cv/source/imgproc/draw.cpp b/tools/cv/source/imgproc/draw.cpp index 9097b39b..447492e5 100644 --- a/tools/cv/source/imgproc/draw.cpp +++ b/tools/cv/source/imgproc/draw.cpp @@ -9,53 +9,831 @@ #include #include "cv/imgproc/draw.hpp" #include +#include #include namespace MNN { namespace CV { // help functions -// TODO: replace this function with an Op. -void bresenham(uint8_t* ptr, int h, int w, int c, int x1, int y1, int x2, int y2, Scalar color) { - int x = x1; - int y = y1; - int dx = abs(x2 - x1); - int dy = abs(y2 - y1); - int s1 = x2 > x1 ? 1 : -1; - int s2 = y2 > y1 ? 1 : -1; - bool interchange = false; - if (dy > dx) { - std::swap(dx, dy); - interchange = true; - } - int p = 2 * dy - dx; - for(int i = 0; i <= dx; i++) { - // printf("[%d, %d]\n", x, y); - memcpy(ptr + (y * w + x) * c, &color, c); - if (p >= 0) { - if (interchange) { - x += s1; - } else { - y += s2; +#define MIN(a,b) ((a) > (b) ? (b) : (a)) +#define MAX(a,b) ((a) < (b) ? (b) : (a)) + +struct Region { +public: + Region(int _y, int _xl, int _xr) : y(_y), xl(_xl), xr(_xr) {} + Region(int _y, int _xl) : y(_y), xl(_xl), xr(_xl) {} + int y; + int xl; + int xr; +}; + +bool clipLine(Size2l img_size, Point2l& pt1, Point2l& pt2) { + int c1, c2; + int64_t right = img_size.width-1, bottom = img_size.height-1; + if (img_size.width <= 0 || img_size.height <= 0) return false; + + int64_t &x1 = pt1.x, &y1 = pt1.y, &x2 = pt2.x, &y2 = pt2.y; + c1 = (x1 < 0) + (x1 > right) * 2 + (y1 < 0) * 4 + (y1 > bottom) * 8; + c2 = (x2 < 0) + (x2 > right) * 2 + (y2 < 0) * 4 + (y2 > bottom) * 8; + + if ((c1 & c2) == 0 && (c1 | c2) != 0) { + int64_t a; + if (c1 & 12) { + a = c1 < 8 ? 0 : bottom; + x1 += (int64_t)((double)(a - y1) * (x2 - x1) / (y2 - y1)); + y1 = a; + c1 = (x1 < 0) + (x1 > right) * 2; + } + if (c2 & 12) { + a = c2 < 8 ? 0 : bottom; + x2 += (int64_t)((double)(a - y2) * (x2 - x1) / (y2 - y1)); + y2 = a; + c2 = (x2 < 0) + (x2 > right) * 2; + } + if ((c1 & c2) == 0 && (c1 | c2) != 0) { + if (c1) { + a = c1 == 1 ? 0 : right; + y1 += (int64_t)((double)(a - x1) * (y2 - y1) / (x2 - x1)); + x1 = a; + c1 = 0; + } + if (c2) { + a = c2 == 1 ? 
0 : right; + y2 += (int64_t)((double)(a - x2) * (y2 - y1) / (x2 - x1)); + x2 = a; + c2 = 0; } - p -= 2 * dx; } - if (interchange) { - y += s2; - } else { - x += s1; + MNN_ASSERT((c1 & c2) != 0 || (x1 | y1 | x2 | y2) >= 0); + } + return (c1 | c2) == 0; +} +bool clipLine(Size img_size, Point2i& pt1, Point2i& pt2) { + Point2l p1(pt1.x, pt1.y); + Point2l p2(pt2.x, pt2.y); + bool inside = clipLine(Size2l(img_size.width, img_size.height), p1, p2); + pt1.x = (int)p1.x; + pt1.y = (int)p1.y; + pt2.x = (int)p2.x; + pt2.y = (int)p2.y; + return inside; +} + +enum { XY_SHIFT = 16, XY_ONE = 1 << XY_SHIFT, DRAWING_STORAGE_BLOCK = (1<<12) - 256 }; +static void Line(std::vector& regions, Size size, Point2i pt1_, Point2i pt2_, int connectivity = 8) { + if (connectivity == 0) { + connectivity = 8; + } else if (connectivity == 1) { + connectivity = 4; + } + int count = -1, err, minusDelta, plusDelta, minusStep, plusStep, minusShift, plusShift; + Point2i p = Point2i(0, 0); + Rect2i rect(0, 0, size.width, size.height); + Point2i pt1 = pt1_ - rect.tl(); + Point2i pt2 = pt2_ - rect.tl(); + + if ((unsigned)pt1.x >= (unsigned)(rect.width) || (unsigned)pt2.x >= (unsigned)(rect.width) || + (unsigned)pt1.y >= (unsigned)(rect.height) || (unsigned)pt2.y >= (unsigned)(rect.height)) { + if (!clipLine(Size(rect.width, rect.height), pt1, pt2)) { + err = plusDelta = minusDelta = plusStep = minusStep = plusShift = minusShift = count = 0; } - p += 2 * dy; + } + + pt1 += rect.tl(); + pt2 += rect.tl(); + + int delta_x = 1, delta_y = 1; + int dx = pt2.x - pt1.x; + int dy = pt2.y - pt1.y; + + if (dx < 0) { + dx = -dx; + dy = -dy; + pt1 = pt2; + } + + if (dy < 0) { + dy = -dy; + delta_y = -1; + } + + bool vert = dy > dx; + if (vert) { + std::swap(dx, dy); + std::swap(delta_x, delta_y); + } + + MNN_ASSERT(dx >= 0 && dy >= 0); + + if (connectivity == 8) { + err = dx - (dy + dy); + plusDelta = dx + dx; + minusDelta = -(dy + dy); + minusShift = delta_x; + plusShift = 0; + minusStep = 0; + plusStep = delta_y; + count = dx + 1; + } else /* connectivity == 4 */ { + err = 0; + plusDelta = (dx + dx) + (dy + dy); + minusDelta = -(dy + dy); + minusShift = delta_x; + plusShift = -delta_x; + minusStep = 0; + plusStep = delta_y; + count = dx + dy + 1; + } + + if (vert) { + std::swap(plusStep, plusShift); + std::swap(minusStep, minusShift); + } + p = pt1; + regions.emplace_back(Region{p.y, p.x}); + for(int i = 1; i < count; i++) { + int mask = err < 0 ? -1 : 0; + err += minusDelta + (plusDelta & mask); + p.y += minusStep + (plusStep & mask); + p.x += minusShift + (plusShift & mask); + regions.emplace_back(Region{p.y, p.x}); } } -std::vector getPoints(Point pt1, Point pt2, int thickness) { - int x1 = pt1.fX, y1 = pt1.fY, x2 = pt2.fX, y2 = pt2.fY; - std::vector pts { x1, y1, x2, y2 }; - for (int i = 0; i < thickness; i++) { - // x - i; +static void Line2(std::vector& regions, Size size, Point2l pt1, Point2l pt2) { + int64_t dx, dy; + int ecount; + int64_t ax, ay; + int64_t i, j; + int x, y; + int64_t x_step, y_step; + Size2l sizeScaled(((int64_t)size.width) << XY_SHIFT, ((int64_t)size.height) << XY_SHIFT); + if(!clipLine(sizeScaled, pt1, pt2)) { + return; } - return pts; + dx = pt2.x - pt1.x; + dy = pt2.y - pt1.y; + j = dx < 0 ? -1 : 0; + ax = (dx ^ j) - j; + i = dy < 0 ? 
-1 : 0; + ay = (dy ^ i) - i; + + if (ax > ay) { + dy = (dy ^ j) - j; + pt1.x ^= pt2.x & j; + pt2.x ^= pt1.x & j; + pt1.x ^= pt2.x & j; + pt1.y ^= pt2.y & j; + pt2.y ^= pt1.y & j; + pt1.y ^= pt2.y & j; + + x_step = XY_ONE; + y_step = (dy << XY_SHIFT) / (ax | 1); + ecount = (int)((pt2.x - pt1.x) >> XY_SHIFT); + } else { + dx = (dx ^ i) - i; + pt1.x ^= pt2.x & i; + pt2.x ^= pt1.x & i; + pt1.x ^= pt2.x & i; + pt1.y ^= pt2.y & i; + pt2.y ^= pt1.y & i; + pt1.y ^= pt2.y & i; + + x_step = (dx << XY_SHIFT) / (ay | 1); + y_step = XY_ONE; + ecount = (int)((pt2.y - pt1.y) >> XY_SHIFT); + } + pt1.x += (XY_ONE >> 1); + pt1.y += (XY_ONE >> 1); + regions.emplace_back(Region{(int)((pt2.y + (XY_ONE >> 1)) >> XY_SHIFT), (int)((pt2.x + (XY_ONE >> 1)) >> XY_SHIFT)}); + if (ax > ay) { + pt1.x >>= XY_SHIFT; + while(ecount >= 0) { + regions.emplace_back(Region{(int)(pt1.y >> XY_SHIFT), (int)(pt1.x)}); + pt1.x++; + pt1.y += y_step; + ecount--; + } + } else { + pt1.y >>= XY_SHIFT; + while(ecount >= 0) { + regions.emplace_back(Region{(int)(pt1.y), (int)(pt1.x >> XY_SHIFT)}); + pt1.x += x_step; + pt1.y++; + ecount--; + } + } +} + +static void FillConvexPoly(std::vector& regions, Size size, const Point2l* v, int npts, int line_type, int shift) { + struct { + int idx, di; + int64_t x, dx; + int ye; + } edge[2]; + + int delta = 1 << shift >> 1; + int i, y, imin = 0; + int edges = npts; + int64_t xmin, xmax, ymin, ymax; + Point2l p0; + int delta1, delta2; + + delta1 = delta2 = XY_ONE >> 1; + + p0 = v[npts - 1]; + p0.x <<= XY_SHIFT - shift; + p0.y <<= XY_SHIFT - shift; + + MNN_ASSERT(0 <= shift && shift <= XY_SHIFT); + xmin = xmax = v[0].x; + ymin = ymax = v[0].y; + + for (i = 0; i < npts; i++) { + Point2l p = v[i]; + if (p.y < ymin) { + ymin = p.y; + imin = i; + } + + ymax = std::max(ymax, p.y); + xmax = std::max(xmax, p.x); + xmin = MIN(xmin, p.x); + + p.x <<= XY_SHIFT - shift; + p.y <<= XY_SHIFT - shift; + + if(!shift) { + Point2i pt0, pt1; + pt0.x = (int)(p0.x >> XY_SHIFT); + pt0.y = (int)(p0.y >> XY_SHIFT); + pt1.x = (int)(p.x >> XY_SHIFT); + pt1.y = (int)(p.y >> XY_SHIFT); + Line(regions, size, pt0, pt1, line_type); + } else { + Line2(regions, size, p0, p); + } + p0 = p; + } + + xmin = (xmin + delta) >> shift; + xmax = (xmax + delta) >> shift; + ymin = (ymin + delta) >> shift; + ymax = (ymax + delta) >> shift; + + if(npts < 3 || (int)xmax < 0 || (int)ymax < 0 || (int)xmin >= size.width || (int)ymin >= size.height) { + return; + } + ymax = MIN(ymax, size.height - 1); + edge[0].idx = edge[1].idx = imin; + edge[0].ye = edge[1].ye = y = (int)ymin; + edge[0].di = 1; + edge[1].di = npts - 1; + edge[0].x = edge[1].x = -XY_ONE; + edge[0].dx = edge[1].dx = 0; + int region_y = y; + do { + if (y < (int)ymax || y == (int)ymin) { + for (i = 0; i < 2; i++) { + if (y >= edge[i].ye) { + int idx0 = edge[i].idx, di = edge[i].di; + int idx = idx0 + di; + if (idx >= npts) idx -= npts; + int ty = 0; + + for (; edges-- > 0; ) { + ty = (int)((v[idx].y + delta) >> shift); + if (ty > y) { + int64_t xs = v[idx0].x; + int64_t xe = v[idx].x; + if (shift != XY_SHIFT) + { + xs <<= XY_SHIFT - shift; + xe <<= XY_SHIFT - shift; + } + + edge[i].ye = ty; + edge[i].dx = ((xe - xs)*2 + (ty - y)) / (2 * (ty - y)); + edge[i].x = xs; + edge[i].idx = idx; + break; + } + idx0 = idx; + idx += di; + if (idx >= npts) idx -= npts; + } + } + } + } + + if (edges < 0) + break; + + if (y >= 0) { + int left = 0, right = 1; + if (edge[0].x > edge[1].x) + { + left = 1, right = 0; + } + + int xx1 = (int)((edge[left].x + delta1) >> XY_SHIFT); + int xx2 = 
(int)((edge[right].x + delta2) >> XY_SHIFT); + + if(xx2 >= 0 && xx1 < size.width) + { + if(xx1 < 0) { + xx1 = 0; + } + if(xx2 >= size.width) { + xx2 = size.width - 1; + } + if (xx2 - xx1 > 0) regions.emplace_back(Region{region_y, xx1, xx2}); + } + } + + edge[0].x += edge[0].dx; + edge[1].x += edge[1].dx; + region_y++; + } while(++y <= (int)ymax); +} + +static void sincos(int angle, float& cosval, float& sinval) { + angle += (angle < 0 ? 360 : 0); + float radian = angle * MNN_PI / 180; + sinval = sin(radian); + cosval = cos(radian); +} + +void ellipse2Poly(Point2d center, Size2d axes, int angle, int arc_start, int arc_end, int delta, std::vector& pts) { + MNN_ASSERT(0 < delta && delta <= 180); + + float alpha, beta; + int i; + + while(angle < 0) angle += 360; + while(angle > 360) angle -= 360; + + if (arc_start > arc_end) { + i = arc_start; + arc_start = arc_end; + arc_end = i; + } + while (arc_start < 0) { + arc_start += 360; + arc_end += 360; + } + while (arc_end > 360) { + arc_end -= 360; + arc_start -= 360; + } + if (arc_end - arc_start > 360) { + arc_start = 0; + arc_end = 360; + } + sincos(angle, alpha, beta); + pts.resize(0); + + for (i = arc_start; i < arc_end + delta; i += delta) { + double x, y; + angle = i; + if (angle > arc_end) angle = arc_end; + float sinv, cosv; + sincos(angle, sinv, cosv); + x = axes.width * cosv; + y = axes.height * sinv; + Point2d pt; + pt.x = center.x + x * alpha - y * beta; + pt.y = center.y + x * beta + y * alpha; + pts.push_back(pt); + } + + // If there are no points, it's a zero-size polygon + if( pts.size() == 1) { + pts.assign(2,center); + } +} +static void ThickLine(std::vector& regions, Size size, Point2l p0, Point2l p1, int thickness, int line_type, int flags, int shift); +static void PolyLine(std::vector& regions, Size size, const Point2l* v, int count, bool is_closed, int thickness, int line_type, int shift) { + if (!v || count <= 0) { + return; + } + + int i = is_closed ? 
count - 1 : 0; + int flags = 2 + !is_closed; + Point2l p0; + MNN_ASSERT(0 <= shift && shift <= XY_SHIFT && thickness >= 0); + + p0 = v[i]; + for (i = !is_closed; i < count; i++) { + Point2l p = v[i]; + ThickLine(regions, size, p0, p, thickness, line_type, flags, shift ); + p0 = p; + flags = 2; + } +} + +struct PolyEdge { + PolyEdge() : y0(0), y1(0), x(0), dx(0), next(0) {} + + int y0, y1; + int64_t x, dx; + PolyEdge *next; +}; + +static void CollectPolyEdges(std::vector& regions, Size size, const Point2l* v, int count, std::vector& edges, int line_type, int shift, Point2i offset = Point2i()) { + int delta = offset.y + ((1 << shift) >> 1); + Point2l pt0 = v[count-1], pt1; + pt0.x = (pt0.x + offset.x) << (XY_SHIFT - shift); + pt0.y = (pt0.y + delta) >> shift; + + edges.reserve(edges.size() + count); + + for (int i = 0; i < count; i++, pt0 = pt1) { + Point2l t0, t1; + PolyEdge edge; + + pt1 = v[i]; + pt1.x = (pt1.x + offset.x) << (XY_SHIFT - shift); + pt1.y = (pt1.y + delta) >> shift; + + t0.y = pt0.y; t1.y = pt1.y; + t0.x = (pt0.x + (XY_ONE >> 1)) >> XY_SHIFT; + t1.x = (pt1.x + (XY_ONE >> 1)) >> XY_SHIFT; + Line(regions, size, t0, t1, line_type); + + if (pt0.y == pt1.y) continue; + + if (pt0.y < pt1.y) { + edge.y0 = (int)(pt0.y); + edge.y1 = (int)(pt1.y); + edge.x = pt0.x; + } else { + edge.y0 = (int)(pt1.y); + edge.y1 = (int)(pt0.y); + edge.x = pt1.x; + } + edge.dx = (pt1.x - pt0.x) / (pt1.y - pt0.y); + edges.push_back(edge); + } +} + +static void FillEdgeCollection(std::vector& regions, Size size, std::vector& edges) { + PolyEdge tmp; + int i, y, total = (int)edges.size(); + PolyEdge* e; + int y_max = std::numeric_limits::min(), y_min = std::numeric_limits::max(); + int64_t x_max = 0xFFFFFFFFFFFFFFFF, x_min = 0x7FFFFFFFFFFFFFFF; + + if (total < 2) return; + + for (i = 0; i < total; i++) { + PolyEdge& e1 = edges[i]; + MNN_ASSERT(e1.y0 < e1.y1); + // Determine x-coordinate of the end of the edge. + // (This is not necessary x-coordinate of any vertex in the array.) 
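+ // i.e. project from the edge's upper endpoint (y0) down to y1 along the per-scanline slope dx, in 16.16 fixed point, so x_min/x_max below bound the whole edge horizontally.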
+ int64_t x1 = e1.x + (e1.y1 - e1.y0) * e1.dx; + y_min = std::min( y_min, e1.y0 ); + y_max = std::max( y_max, e1.y1 ); + x_min = std::min( x_min, e1.x ); + x_max = std::max( x_max, e1.x ); + x_min = std::min( x_min, x1 ); + x_max = std::max( x_max, x1 ); + } + + if (y_max < 0 || y_min >= size.height || x_max < 0 || x_min >= ((int64_t)size.width<::max(); + edges.push_back(tmp); // after this point we do not add + // any elements to edges, thus we can use pointers + i = 0; + tmp.next = 0; + e = &edges[i]; + y_max = MIN(y_max, size.height); + + for (y = e->y0; y < y_max; y++) { + PolyEdge *last, *prelast, *keep_prelast; + int sort_flag = 0; + int draw = 0; + int clipline = y < 0; + + prelast = &tmp; + last = tmp.next; + while (last || e->y0 == y) { + if (last && last->y1 == y) { + // exclude edge if y reaches its lower point + prelast->next = last->next; + last = last->next; + continue; + } + keep_prelast = prelast; + if (last && (e->y0 > y || last->x < e->x)) { + // go to the next edge in active list + prelast = last; + last = last->next; + } else if(i < total) { + // insert new edge into active list if y reaches its upper point + prelast->next = e; + e->next = last; + prelast = e; + e = &edges[++i]; + } else { + break; + } + + if (draw) { + if(!clipline) { + // convert x's from fixed-point to image coordinates + // uchar *timg = const_cast(img->readMap()) + (y * pix_size * w); + int x1, x2; + + if (keep_prelast->x > prelast->x) { + x1 = (int)((prelast->x + XY_ONE - 1) >> XY_SHIFT); + x2 = (int)(keep_prelast->x >> XY_SHIFT); + } else { + x1 = (int)((keep_prelast->x + XY_ONE - 1) >> XY_SHIFT); + x2 = (int)(prelast->x >> XY_SHIFT); + } + + // clip and draw the line + if( x1 < size.width && x2 >= 0 ) + { + if (x1 < 0) x1 = 0; + if (x2 >= size.width) x2 = size.width - 1; + regions.emplace_back(Region{y, x1, x2}); + } + } + keep_prelast->x += keep_prelast->dx; + prelast->x += prelast->dx; + } + draw ^= 1; + } + + // sort edges (using bubble sort) + keep_prelast = 0; + do { + prelast = &tmp; + last = tmp.next; + + while (last != keep_prelast && last->next != 0) { + PolyEdge *te = last->next; + // swap edges + if (last->x > te->x) { + prelast->next = te; + last->next = te->next; + te->next = last; + prelast = te; + sort_flag = 1; + } else { + prelast = last; + last = te; + } + } + keep_prelast = prelast; + } while(sort_flag && keep_prelast != tmp.next && keep_prelast != &tmp); + } +} + +static void EllipseEx(std::vector& regions, Size size, Point2l center, Size2l axes, int angle, int arc_start, int arc_end, int thickness, int line_type) { + axes.width = std::abs(axes.width), axes.height = std::abs(axes.height); + int delta = (int)((std::max(axes.width,axes.height)+(XY_ONE>>1))>>XY_SHIFT); + delta = delta < 3 ? 90 : delta < 10 ? 30 : delta < 15 ? 
18 : 5; + + std::vector _v; + ellipse2Poly(Point2d((double)center.x, (double)center.y), Size2d((double)axes.width, (double)axes.height), angle, arc_start, arc_end, delta, _v); + + std::vector v; + Point2l prevPt(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF); + v.resize(0); + for (unsigned int i = 0; i < _v.size(); ++i) + { + Point2l pt; + pt.x = (int64_t)std::round(_v[i].x / XY_ONE) << XY_SHIFT; + pt.y = (int64_t)std::round(_v[i].y / XY_ONE) << XY_SHIFT; + pt.x += std::round(_v[i].x - pt.x); + pt.y += std::round(_v[i].y - pt.y); + if (pt != prevPt) { + v.push_back(pt); + prevPt = pt; + } + } + + // If there are no points, it's a zero-size polygon + if (v.size() == 1) { + v.assign(2, center); + } + + if (thickness >= 0) { + PolyLine(regions, size, &v[0], (int)v.size(), false, thickness, line_type, XY_SHIFT); + } else if( arc_end - arc_start >= 360 ) { + FillConvexPoly(regions, size, &v[0], (int)v.size(), line_type, XY_SHIFT); + } else { + v.push_back(center); + std::vector edges; + CollectPolyEdges(regions, size, &v[0], (int)v.size(), edges, line_type, XY_SHIFT); + FillEdgeCollection(regions, size, edges); + } +} + +static void Circle(std::vector& regions, Size size, Point2i center, int radius, int fill) { + int err = 0, dx = radius, dy = 0, plus = 1, minus = (radius << 1) - 1; + int inside = center.x >= radius && center.x < size.width - radius && + center.y >= radius && center.y < size.height - radius; + + while (dx >= dy) { + int mask; + int y11 = center.y - dy, y12 = center.y + dy, y21 = center.y - dx, y22 = center.y + dx; + int x11 = center.x - dx, x12 = center.x + dx, x21 = center.x - dy, x22 = center.x + dy; + + if (inside) { + if(!fill) { + regions.emplace_back(Region{y11, x11}); + regions.emplace_back(Region{y12, x11}); + regions.emplace_back(Region{y11, x12}); + regions.emplace_back(Region{y12, x12}); + regions.emplace_back(Region{y21, x21}); + regions.emplace_back(Region{y22, x21}); + regions.emplace_back(Region{y21, x22}); + regions.emplace_back(Region{y22, x22}); + } else { + regions.emplace_back(Region{y11, x11, x12}); + regions.emplace_back(Region{y12, x11, x12}); + regions.emplace_back(Region{y21, x21, x22}); + regions.emplace_back(Region{y22, x21, x22}); + } + } else if (x11 < size.width && x12 >= 0 && y21 < size.height && y22 >= 0) { + if (fill) { + x11 = std::max(x11, 0); + x12 = MIN(x12, size.width - 1); + } + if ((unsigned)y11 < (unsigned)size.height) { + if (!fill) { + if(x11 >= 0) regions.emplace_back(Region{y11, x11}); + if(x12 < size.width) regions.emplace_back(Region{y11, x12}); + } else { + regions.emplace_back(Region{y11, x11, x12}); + } + } + if ((unsigned)y12 < (unsigned)size.height) { + if(!fill) { + if(x11 >= 0) regions.emplace_back(Region{y12, x11}); + if(x12 < size.width) regions.emplace_back(Region{y12, x12}); + } else { + regions.emplace_back(Region{y12, x11, x12}); + } + } + + if (x21 < size.width && x22 >= 0) { + if (fill) { + x21 = std::max(x21, 0); + x22 = MIN(x22, size.width - 1); + } + if ((unsigned)y21 < (unsigned)size.height) { + if(!fill) { + if(x21 >= 0) regions.emplace_back(Region{y21, x21}); + if(x22 < size.width) regions.emplace_back(Region{y21, x22}); + } else { + regions.emplace_back(Region{y21, x21, x22}); + } + } + if ((unsigned)y22 < (unsigned)size.height) { + if(!fill) { + if(x21 >= 0) regions.emplace_back(Region{y22, x21}); + if(x22 < size.width) regions.emplace_back(Region{y22, x22}); + } else { + regions.emplace_back(Region{y22, x21, x22}); + } + } + } + } + dy++; + err += plus; + plus += 2; + mask = (err <= 0) - 1; + err -= minus & mask; + 
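+ // Branchless midpoint-circle step: mask is all ones only while err > 0, so the corrections to err, dx and minus apply without a conditional.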
dx += mask; + minus -= mask & 2; + } +} + +static void ThickLine(std::vector& regions, Size size, Point2l p0, Point2l p1, int thickness, int line_type, int flags, int shift) { + constexpr double INV_XY_ONE = 1./XY_ONE; + p0.x <<= XY_SHIFT - shift; + p0.y <<= XY_SHIFT - shift; + p1.x <<= XY_SHIFT - shift; + p1.y <<= XY_SHIFT - shift; + + if(thickness <= 1) { + if (line_type == 1 || line_type == 4 || shift == 0) { + p0.x = (p0.x + (XY_ONE>>1)) >> XY_SHIFT; + p0.y = (p0.y + (XY_ONE>>1)) >> XY_SHIFT; + p1.x = (p1.x + (XY_ONE>>1)) >> XY_SHIFT; + p1.y = (p1.y + (XY_ONE>>1)) >> XY_SHIFT; + Line(regions, size, p0, p1, line_type); + } else { + Line2(regions, size, p0, p1); + } + } else { + Point2l pt[4], dp = Point2i(0,0); + double dx = (p0.x - p1.x)*INV_XY_ONE, dy = (p1.y - p0.y)*INV_XY_ONE; + double r = dx * dx + dy * dy; + int i, oddThickness = thickness & 1; + thickness <<= XY_SHIFT - 1; + + if( fabs(r) > 2.2e-16 ) { + r = (thickness + oddThickness * XY_ONE * 0.5) / std::sqrt(r); + dp.x = std::round( dy * r ); + dp.y = std::round( dx * r ); + + pt[0].x = p0.x + dp.x; + pt[0].y = p0.y + dp.y; + pt[1].x = p0.x - dp.x; + pt[1].y = p0.y - dp.y; + pt[2].x = p1.x - dp.x; + pt[2].y = p1.y - dp.y; + pt[3].x = p1.x + dp.x; + pt[3].y = p1.y + dp.y; + FillConvexPoly(regions, size, pt, 4, line_type, XY_SHIFT); + } + + for(i = 0; i < 2; i++) { + if(flags & (i+1)) { + Point2i center; + center.x = (int)((p0.x + (XY_ONE>>1)) >> XY_SHIFT); + center.y = (int)((p0.y + (XY_ONE>>1)) >> XY_SHIFT); + Circle(regions, size, center, (thickness + (XY_ONE>>1)) >> XY_SHIFT, 1); + } + p0 = p1; + } + } +} + +template static inline +void scalarToRawData_(const Scalar& s, T * const buf, const int cn) { + for(int i = 0; i < cn; i++) { + buf[i] = static_cast(s.val[i]); + } +} + +void scalarToRawData(const Scalar& s, void* buf, VARP img) { + auto type = img->getInfo()->type; + int cn = getVARPChannel(img); + if (type == halide_type_of()) { + scalarToRawData_(s, (uchar*)buf, cn); + } else if (type == halide_type_of()) { + scalarToRawData_(s, (float*)buf, cn); + } else if (type == halide_type_of()) { + scalarToRawData_(s, (double*)buf, cn); + } else if (type == halide_type_of()) { + scalarToRawData_(s, (int*)buf, cn); + } +} + +std::vector mergeRegions(std::vector regions) { + std::vector res; + // 1. get line's region + std::map>> lines; + for (auto region : regions) { + if (lines.find(region.y) != lines.end()) { + lines[region.y].push_back({region.xl, region.xr}); + } else { + lines[region.y] = std::vector>(); + lines[region.y].push_back({region.xl, region.xr}); + } + } + // 2. 
merge line's region + for (auto line : lines) { + auto liner = line.second; + // sort line regions + std::sort(liner.begin(), liner.end(), [](const std::pair& a, const std::pair& b){return a.first < b.first;}); + // merge + res.emplace_back(Region{line.first, liner[0].first, liner[0].second}); + for (int i = 1; i < liner.size(); i++) { + if (res.back().xr >= liner[i].second) { + res.back().xr = MAX(res.back().xr, liner[i].second); + } else { + res.emplace_back(Region{line.first, liner[i].first, liner[i].second}); + } + } + } + return res; +} + +void doDraw(VARP& img, const std::vector& regions, const Scalar& color) { + double buf[4]; + scalarToRawData(color, buf, img); + auto mergeRegs = mergeRegions(regions); + ImageProcess::Config config; + config.draw = true; + std::unique_ptr process(ImageProcess::create(config)); + int h, w, c; getVARPSize(img, &h, &w, &c); + process->draw(img->writeMap(), w, h, c, reinterpret_cast(mergeRegs.data()), mergeRegs.size(), (uint8_t*)buf); + } void arrowedLine(VARP& img, Point pt1, Point pt2, const Scalar& color, @@ -76,17 +854,32 @@ void arrowedLine(VARP& img, Point pt1, Point pt2, const Scalar& color, line(img, p, pt2, color, thickness, line_type, shift); } +void circle(VARP& img, Point center, int radius, const Scalar& color, int thickness, int line_type, int shift) { + Point2i center_(static_cast(center.fX), static_cast(center.fY)); + int h, w, c; getVARPSize(img, &h, &w, &c); + Size size(w, h); + std::vector regions; + if( thickness > 1 || line_type != LINE_8 || shift > 0 ) { + Point2l _center(center_); + int64_t _radius(radius); + _center.x <<= XY_SHIFT - shift; + _center.y <<= XY_SHIFT - shift; + _radius <<= XY_SHIFT - shift; + EllipseEx(regions, size, _center, Size2l(_radius, _radius), 0, 0, 360, thickness, line_type); + } else { + Circle(regions, size, center_, radius, thickness < 0); + } + doDraw(img, regions, color); +} + void line(VARP& img, Point pt1, Point pt2, const Scalar& color, int thickness, int lineType, int shift) { - int h = 0, w = 0, c = 0; - getVARPSize(img, &h, &w, &c); - auto ptr = img->writeMap(); - int x1 = static_cast(pt1.fX), y1 = static_cast(pt1.fY); - int x2 = static_cast(pt2.fX), y2 = static_cast(pt2.fY); - for (int i = 0; i < thickness; i++) { - // bresenham(ptr, h, w, c, x1[i], y1[i], x2[i], y2[i], color); - } - bresenham(ptr, h, w, c, x1, y1, x2, y2, color); + int h, w, c; getVARPSize(img, &h, &w, &c); + Point2i p1(static_cast(pt1.fX), static_cast(pt1.fY)); + Point2i p2(static_cast(pt2.fX), static_cast(pt2.fY)); + std::vector regions; + ThickLine(regions, Size{w, h}, p1, p2, thickness, lineType, 3, shift); + doDraw(img, regions, color); } void rectangle(VARP& img, Point pt1, Point pt2, const Scalar& color, @@ -101,5 +894,77 @@ void rectangle(VARP& img, Point pt1, Point pt2, const Scalar& color, line(img, {pt1.fX, pt2.fY}, pt2, color, thickness, lineType); } +void drawContours(VARP& img, std::vector> _contours, int contourIdx, const Scalar& color, int thickness, int lineType) { + size_t ncontours = _contours.size(); + if (!ncontours) return; + int h, w, c; getVARPSize(img, &h, &w, &c); + Size size(w, h); + std::vector regions; + size_t i = 0, first = 0, last = ncontours; + if (contourIdx >= 0) { + first = contourIdx; + last = first + 1; + } + std::vector edges; + for (i = first; i < last; i++) { + const auto& contour = _contours[i]; + if (contour.empty()) continue; + std::vector pts; + for (int j = 0; j < contour.size(); j++) { + int nextj = j + 1 == contour.size() ? 
0 : j + 1; + Point2l pt1(contour[j].fX, contour[j].fY), pt2(contour[nextj].fX, contour[nextj].fY); + if(thickness >= 0) { + ThickLine(regions, size, pt1, pt2, thickness, lineType, 2, 0); + } else { + if (!j) pts.push_back(pt1); + pts.push_back(pt2); + } + } + if (thickness < 0) { + CollectPolyEdges(regions, size, &pts[0], (int)pts.size(), edges, lineType, 0); + } + } + if (thickness < 0) { + FillEdgeCollection(regions, size, edges); + } + doDraw(img, regions, color); +} + +void fillPoly(VARP& img, std::vector> _pts, const Scalar& color, int line_type, int shift, Point _offset) { + int ncontours = _pts.size(); + if (!ncontours) return; + int h, w, c; + getVARPSize(img, &h, &w, &c); + Size size(w, h); + std::vector regions; + std::vector> pts(ncontours); + std::vector _ptsptr(ncontours); + std::vector _npts(ncontours); + Point2i** ptsptr = _ptsptr.data(); + int *npts = _npts.data(), total = 0; + for(int i = 0; i < ncontours; i++ ) { + int num = _pts[i].size(); + pts[i].resize(num); + for (int j = 0; j < num; j++) { + pts[i][j].x = _pts[i][j].fX; + pts[i][j].y = _pts[i][j].fY; + } + ptsptr[i] = pts[i].data(); + npts[i] = num; + total += num; + } + if(line_type == LINE_AA && img->getInfo()->type == halide_type_of()) line_type = 8; + MNN_ASSERT(ptsptr && npts && ncontours >= 0 && 0 <= shift && shift <= XY_SHIFT); + std::vector edges; + Point2i offset(_offset.fX, _offset.fY); + edges.reserve( total + 1 ); + for (int i = 0; i < ncontours; i++) { + std::vector _pts(ptsptr[i], ptsptr[i] + npts[i]); + CollectPolyEdges(regions, size, _pts.data(), npts[i], edges, line_type, shift, offset); + } + FillEdgeCollection(regions, size, edges); + doDraw(img, regions, color); +} + } // CV } // MNN diff --git a/tools/cv/source/imgproc/filter.cpp b/tools/cv/source/imgproc/filter.cpp index 0695c32a..328b7c5a 100644 --- a/tools/cv/source/imgproc/filter.cpp +++ b/tools/cv/source/imgproc/filter.cpp @@ -136,7 +136,7 @@ VARP dilate(VARP src, VARP kernel, int iterations, int borderType) { int kheight, kwidth, kchannel; getVARPSize(kernel, &kheight, &kwidth, &kchannel); auto padSrc = PadForConv(src, kheight, kwidth, borderType); - return _Squeeze(_MaxPool(padSrc, {3, 3}), {0}); + return _Squeeze(_MaxPool(padSrc, {kheight, kwidth}), {0}); } VARP filter2D(VARP src, int ddepth, VARP kernel, double delta, int borderType) { diff --git a/tools/cv/source/imgproc/geometric.cpp b/tools/cv/source/imgproc/geometric.cpp index 1ee1f340..4c63b505 100644 --- a/tools/cv/source/imgproc/geometric.cpp +++ b/tools/cv/source/imgproc/geometric.cpp @@ -56,8 +56,12 @@ Matrix getRotationMatrix2D(Point center, double angle, double scale) { return M; } -VARP resize(VARP src, Size dsize, double fx, double fy, int interpolation) { +extern std::pair getSrcDstFormat(int code); +extern int format2Channel(CV::ImageFormat format); + +VARP resize(VARP src, Size dsize, double fx, double fy, int interpolation, int code, std::vector mean, std::vector norm) { int ih, iw, ic; + auto type = src->getInfo()->type; getVARPSize(src, &ih, &iw, &ic); int oh = dsize.height, ow = dsize.width; if (!oh && !ow) { @@ -66,30 +70,55 @@ VARP resize(VARP src, Size dsize, double fx, double fy, int interpolation) { } fx = static_cast(iw) / ow; fy = static_cast(ih) / oh; - auto dest = Tensor::create({1, oh, ow, ic}, halide_type_of()); ImageProcess::Config config; + // cvtColor + int oc = ic; + if (code >= 0) { + auto format = getSrcDstFormat(code); + config.sourceFormat = format.first; + config.destFormat = format.second; + oc = format2Channel(format.second); + } else { + 
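+ // No explicit conversion code given: infer an identity source/dest format from the channel count (1 -> GRAY, 4 -> RGBA, otherwise RGB).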
ImageFormat format = RGB; + if (ic == 1) { + format = GRAY; + } else if (ic == 4) { + format = RGBA; + } + config.sourceFormat = format; + config.destFormat = format; + } + // toFloat + auto dstType = type; + if (!mean.empty() || !norm.empty()) { + for (int i = 0; i < mean.size() && i < 4; i++) { + config.mean[i] = mean[i]; + } + for (int i = 0; i < norm.size() && i < 4; i++) { + config.normal[i] = norm[i]; + } + dstType = halide_type_of(); + } config.filterType = static_cast(interpolation); - config.sourceFormat = RGB; - config.destFormat = RGB; std::unique_ptr process(ImageProcess::create(config)); + auto dest = Tensor::create({1, oh, ow, oc}, dstType); Matrix tr; tr.postScale(fx, fy); tr.postTranslate(0.5 * (fx - 1), 0.5 * (fy - 1)); process->setMatrix(tr); - process->convert(src->readMap(), iw, ih, 0, dest->host(), ow, oh, ic, 0, halide_type_of()); + process->convert(src->readMap(), iw, ih, 0, dest->host(), ow, oh, oc, 0, dstType); auto res = Express::Variable::create(Express::Expr::create(dest, true), 0); return _Squeeze(res, {0}); } -VARP warpAffine(VARP src, Matrix M, Size dsize, int flags, int borderMode, int borderValue) { +VARP warpAffine(VARP src, Matrix M, Size dsize, int flags, int borderMode, int borderValue, int code, std::vector mean, std::vector norm) { int ih, iw, ic; + auto type = src->getInfo()->type; getVARPSize(src, &ih, &iw, &ic); int oh = dsize.height, ow = dsize.width; - auto dest = Tensor::create({1, oh, ow, ic}, halide_type_of()); + // auto dest = Tensor::create({1, oh, ow, ic}, type); ImageProcess::Config config; config.filterType = flags < 3 ? static_cast(flags) : BILINEAR; - config.sourceFormat = RGB; - config.destFormat = RGB; switch (borderMode) { case BORDER_CONSTANT: config.wrap = ZERO; @@ -104,6 +133,35 @@ VARP warpAffine(VARP src, Matrix M, Size dsize, int flags, int borderMode, int b MNN_ERROR("Don't support borderMode!"); break; } + // cvtColor + int oc = ic; + if (code >= 0) { + auto format = getSrcDstFormat(code); + config.sourceFormat = format.first; + config.destFormat = format.second; + oc = format2Channel(format.second); + } else { + ImageFormat format = RGB; + if (ic == 1) { + format = GRAY; + } else if (ic == 4) { + format = RGBA; + } + config.sourceFormat = format; + config.destFormat = format; + } + // toFloat + auto dstType = type; + if (!mean.empty() || !norm.empty()) { + for (int i = 0; i < mean.size() && i < 4; i++) { + config.mean[i] = mean[i]; + } + for (int i = 0; i < norm.size() && i < 4; i++) { + config.normal[i] = norm[i]; + } + dstType = halide_type_of(); + } + auto dest = Tensor::create({1, oh, ow, oc}, dstType); std::unique_ptr process(ImageProcess::create(config)); if (flags != WARP_INVERSE_MAP) { bool invert = M.invert(&M); @@ -111,7 +169,7 @@ VARP warpAffine(VARP src, Matrix M, Size dsize, int flags, int borderMode, int b } process->setMatrix(M); process->setPadding(borderValue); - process->convert(src->readMap(), iw, ih, 0, dest->host(), ow, oh, ic, 0, halide_type_of()); + process->convert(src->readMap(), iw, ih, 0, dest->host(), ow, oh, oc, 0, dstType); auto res = Express::Variable::create(Express::Expr::create(dest, true), 0); return _Squeeze(res, {0}); } diff --git a/tools/cv/source/imgproc/structural.cpp b/tools/cv/source/imgproc/structural.cpp index de8b6162..b6d4e2f3 100644 --- a/tools/cv/source/imgproc/structural.cpp +++ b/tools/cv/source/imgproc/structural.cpp @@ -111,7 +111,7 @@ static CvContourScanner cvStartFindContours( VARP _img, CvPoint offset, int mode return scanner; } -static void icvFetchContour(schar* ptr, 
int step, CvPoint pt, bool is_hole, int method, POINTS& points) +static void icvFetchContour(schar* ptr, int step, CvPoint pt, bool is_hole, int method, std::vector& points) { const char nbd = 2; int deltas[16]; @@ -119,7 +119,7 @@ static void icvFetchContour(schar* ptr, int step, CvPoint pt, bool is_hole, int /* initialize local state */ CV_INIT_3X3_DELTAS( deltas, step, 1); ::memcpy( deltas + 8, deltas, 8 * sizeof( deltas[0] )); - char *i0 = (ptr), *i1, *i3, *i4 = 0; + schar *i0 = (ptr), *i1, *i3, *i4 = 0; s_end = s = is_hole ? 0 : 4; do @@ -136,9 +136,7 @@ static void icvFetchContour(schar* ptr, int step, CvPoint pt, bool is_hole, int *i0 = (schar) (nbd | -128); if( method >= 0 ) { - Point _p; - _p.set(pt.x, pt.y); - points.push_back(_p); + points.push_back(pt); } } else @@ -172,9 +170,7 @@ static void icvFetchContour(schar* ptr, int step, CvPoint pt, bool is_hole, int { if( s != prev_s || method == 1 ) { - Point _p; - _p.set(pt.x, pt.y); - points.push_back(_p); + points.push_back(pt); prev_s = s; } @@ -192,7 +188,7 @@ static void icvFetchContour(schar* ptr, int step, CvPoint pt, bool is_hole, int } } -static bool cvFindNextContour(CvContourScanner scanner, POINTS& points) +static bool cvFindNextContour(CvContourScanner scanner, std::vector& points) { /* initialize local state */ schar* img0 = scanner->img0; @@ -368,7 +364,11 @@ static int Sklansky_( Point_<_Tp>** array, int start, int end, int* stack, int n enum { CALIPERS_MAXHEIGHT=0, CALIPERS_MINAREARECT=1, CALIPERS_MAXDIST=2 }; static void rotatingCalipers( const Point2f* points, int n, int mode, float* out ) { +#ifdef _MSC_VER + float minarea = FLT_MAX; +#else float minarea = __FLT_MAX__; +#endif float max_dist = 0; char buffer[32] = {}; int i, k; @@ -2058,40 +2058,61 @@ LabelT LabelingGrana(VARP img, VARP& imgLabels, int connectivity, CCStatsOp& sop } /*Copy From OpenCV End*/ -std::vector findContours(VARP image, int mode, int method, Point offset) { +std::vector findContours(VARP image, int mode, int method, Point offset) { if (method > CHAIN_APPROX_SIMPLE) { MNN_ERROR("findContours: just support method = [CHAIN_APPROX_NONE, CHAIN_APPROX_SIMPLE]."); } auto img = _Clone(image, true); CvPoint off((int)offset.fX, (int)offset.fY); auto info = cvStartFindContours(img, off, mode, method); - POINTS points; - std::vector contours; + std::vector points; + std::vector contours; while (cvFindNextContour(info, points)) { - contours.emplace_back(std::move(points)); + auto ptr = reinterpret_cast(points.data()); + contours.push_back(_Const(ptr, {static_cast(points.size()), 1, 2}, NHWC, halide_type_of())); + points.clear(); } // same to opencv std::reverse(contours.begin(), contours.end()); delete info; return contours; } -double contourArea(std::vector _contour, bool oriented) { - int npoints = _contour.size(); +double contourArea(VARP _contour, bool oriented) { + auto info = _contour->getInfo(); + int npoints = info->size / 2; if (!npoints) return 0; + bool is_float = info->type == halide_type_of(); + bool is_int = info->type == halide_type_of(); + MNN_ASSERT(is_float || is_int); double a00 = 0; - auto prev = _contour.back(); - for(int i = 0; i < npoints; i++) { - auto p = _contour[i]; - a00 += (double)prev.fX * p.fY - (double)prev.fY * p.fX; - prev = p; + float prevx, prevy; + if (is_float) { + auto ptr = _contour->readMap(); + prevx = ptr[npoints * 2 - 2], prevy = ptr[npoints * 2 - 1]; + for(int i = 0; i < npoints; i++) { + auto x = ptr[i * 2], y = ptr[i * 2 + 1]; + a00 += (double)prevx * y - (double)prevy * x; + prevx = x, prevy = y; + } + 
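+ // Shoelace accumulation: a00 sums prev.x*y - prev.y*x over the closed contour; halving it below yields the signed area.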
} else { + auto ptr = _contour->readMap(); + prevx = ptr[npoints * 2 - 2], prevy = ptr[npoints * 2 - 1]; + for(int i = 0; i < npoints; i++) { + float x = ptr[i * 2], y = ptr[i * 2 + 1]; + a00 += (double)prevx * y - (double)prevy * x; + prevx = x, prevy = y; + } } + a00 *= 0.5; if(!oriented) a00 = fabs(a00); return a00; } -std::vector convexHull(std::vector points, bool clockwise, bool returnPoints) { - int i, total = points.size(), nout = 0; +std::vector convexHull(VARP points, bool clockwise, bool returnPoints) { + auto info = points->getInfo(); + auto pointPtr = points->readMap(); + int i, total = info->size / 2, nout = 0; int miny_ind = 0, maxy_ind = 0; std::vector _hull; if( total == 0 ) @@ -2105,8 +2126,8 @@ std::vector convexHull(std::vector points, bool clockwise, bool retu int* stack = _stack.data(); int* hullbuf = _hullbuf.data(); for( i = 0; i < total; i++ ) { - _points[i].x = (int)points[i].fX; - _points[i].y = (int)points[i].fY; + _points[i].x = pointPtr[i * 2 + 0]; + _points[i].y = pointPtr[i * 2 + 1]; pointer[i] = reinterpret_cast(&_points[i]); } Point2i* data0 = pointer[0]; @@ -2228,8 +2249,8 @@ std::vector convexHull(std::vector points, bool clockwise, bool retu if( returnPoints ) { _hull.resize(nout * 2); for (int i = 0; i < nout; i++) { - _hull[2 * i] = (int)points[_hullbuf[i]].fX; - _hull[2 * i + 1] = (int)points[_hullbuf[i]].fY; + _hull[2 * i] = pointPtr[_hullbuf[i] * 2]; + _hull[2 * i + 1] = pointPtr[_hullbuf[i] * 2 + 1]; } } else { _hull.resize(nout); @@ -2239,7 +2260,7 @@ std::vector convexHull(std::vector points, bool clockwise, bool retu } return _hull; } -RotatedRect minAreaRect(std::vector _points) { +RotatedRect minAreaRect(VARP _points) { auto hull = convexHull(_points); int n = hull.size() / 2; Point2f out[3]; @@ -2271,30 +2292,34 @@ RotatedRect minAreaRect(std::vector _points) { box.angle = (float)(box.angle*180/MNN_PI); return box; } -Rect2i boundingRect(POINTS points) { - int npoints = points.size(); - int xmin = 0, ymin = 0, xmax = -1, ymax = -1; +Rect2i boundingRect(VARP points) { + auto info = points->getInfo(); + int npoints = info->size / 2; if( npoints == 0 ) return Rect2i(); - Point pt = points[0]; - xmin = xmax = pt.fX; - ymin = ymax = pt.fY; + bool is_float = info->type == halide_type_of(); + bool is_int = info->type == halide_type_of(); + MNN_ASSERT(is_float || is_int); + int xmin = 0, ymin = 0, xmax = -1, ymax = -1; + auto iptr = points->readMap(); + auto fptr = points->readMap(); + xmin = xmax = is_float ? fptr[0] : iptr[0]; + ymin = ymax = is_float ? fptr[1] : iptr[1]; + for(int i = 1; i < npoints; i++) { + int x = is_float ? fptr[2 * i] : iptr[2 * i]; + int y = is_float ? 
fptr[2 * i + 1] : iptr[2 * i + 1]; - for( int i = 1; i < npoints; i++ ) - { - pt = points[i]; + if( xmin > x ) + xmin = x; - if( xmin > pt.fX ) - xmin = pt.fX; + if( xmax < x ) + xmax = x; - if( xmax < pt.fX ) - xmax = pt.fX; + if( ymin > y ) + ymin = y; - if( ymin > pt.fY ) - ymin = pt.fY; - - if( ymax < pt.fY ) - ymax = pt.fY; + if( ymax < y ) + ymax = y; } return Rect2i(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1); } @@ -2304,7 +2329,7 @@ int connectedComponentsWithStats(VARP image, VARP& labels, VARP& statsv, VARP& c return LabelingGrana(image, labels, connectivity, sop); } -POINTS boxPoints(RotatedRect box) { +VARP boxPoints(RotatedRect box) { std::vector pt(4); double _angle = box.angle*MNN_PI/180.; float b = (float)cos(_angle)*0.5f; @@ -2317,7 +2342,7 @@ POINTS boxPoints(RotatedRect box) { pt[2].fY = 2*box.center.y - pt[0].fY; pt[3].fX = 2*box.center.x - pt[1].fX; pt[3].fY = 2*box.center.y - pt[1].fY; - return pt; + return _Const(pt.data(), {4, 2}); } } // CV diff --git a/tools/cv/test/imgcodecs/codecs_test.cpp b/tools/cv/test/imgcodecs/codecs_test.cpp index 921f0139..4a76e776 100644 --- a/tools/cv/test/imgcodecs/codecs_test.cpp +++ b/tools/cv/test/imgcodecs/codecs_test.cpp @@ -11,7 +11,6 @@ #include "cv/imgcodecs.hpp" #include "test_env.hpp" -#define MNN_CODECS_TEST #ifdef MNN_CODECS_TEST static Env testEnv(img_name, false); diff --git a/tools/cv/test/imgproc/color_test.cpp b/tools/cv/test/imgproc/color_test.cpp index a8613011..66d47254 100644 --- a/tools/cv/test/imgproc/color_test.cpp +++ b/tools/cv/test/imgproc/color_test.cpp @@ -11,7 +11,6 @@ #include #include "test_env.hpp" -#define MNN_TEST_COLOR #ifdef MNN_TEST_COLOR static Env testEnv(img_name, false); diff --git a/tools/cv/test/imgproc/draw_test.cpp b/tools/cv/test/imgproc/draw_test.cpp index 471e56f2..f94e7f8d 100644 --- a/tools/cv/test/imgproc/draw_test.cpp +++ b/tools/cv/test/imgproc/draw_test.cpp @@ -11,36 +11,58 @@ #include "cv/imgcodecs.hpp" #include "test_env.hpp" -#define MNN_DRAW_TEST #ifdef MNN_DRAW_TEST static Env testEnv(img_name, false); -/* // arrowedLine TEST(arrowedLine, basic) { - cv::arrowedLine(testEnv.cvSrc, {10, 10}, {300, 200}, {0, 0, 255}); - arrowedLine(testEnv.mnnSrc, {10, 10}, {300, 200}, {0, 0, 255}); - // cv::imwrite("cv_line.jpg", testEnv.cvSrc); - // imwrite("mnn_line.jpg", testEnv.mnnSrc); - EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); -}*/ - -// line -TEST(line, basic) { - cv::line(testEnv.cvSrc, {10, 10}, {200, 300}, {0, 0, 255}, 1); - line(testEnv.mnnSrc, {10, 10}, {200, 300}, {0, 0, 255}, 1); + cv::arrowedLine(testEnv.cvSrc, {10, 10}, {300, 200}, {0, 0, 255}, 1); + arrowedLine(testEnv.mnnSrc, {10, 10}, {300, 200}, {0, 0, 255}, 1); EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); } -/* -TEST(line, thickness) { - cv::line(testEnv.cvSrc, {10, 10}, {20, 20}, {0, 0, 255}, 1); - line(testEnv.mnnSrc, {10, 10}, {20, 20}, {0, 0, 255}, 1); - cv::imwrite("cv_line.jpg", testEnv.cvSrc); - imwrite("mnn_line.jpg", testEnv.mnnSrc); +TEST(arrowedLine, thickness) { + cv::arrowedLine(testEnv.cvSrc, {10, 10}, {30, 20}, {0, 0, 255}, 5); + arrowedLine(testEnv.mnnSrc, {10, 10}, {30, 20}, {0, 0, 255}, 5); EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); -}*/ +} + +// circle +TEST(circle, basic) { + cv::circle(testEnv.cvSrc, {50, 50}, 10, {0, 0, 255}, 1); + circle(testEnv.mnnSrc, {50, 50}, 10, {0, 0, 255}, 1); + EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); +} + +TEST(circle, thickness) { + cv::circle(testEnv.cvSrc, {100, 100}, 10, {0, 0, 255}, 5); + 
circle(testEnv.mnnSrc, {100, 100}, 10, {0, 0, 255}, 5); + EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); +} + +TEST(circle, fill) { + cv::circle(testEnv.cvSrc, {150, 150}, 10, {0, 0, 255}, -1); + circle(testEnv.mnnSrc, {150, 150}, 10, {0, 0, 255}, -1); + EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); +} + +// line +TEST(line, basic) { + // cv::line(testEnv.cvSrc, {10, 10}, {200, 300}, {0, 0, 255}, 1); + // line(testEnv.mnnSrc, {10, 10}, {200, 300}, {0, 0, 255}, 1); + cv::line(testEnv.cvSrc, {10, 10}, {50, 50}, {0, 0, 255}, 1); + line(testEnv.mnnSrc, {10, 10}, {50, 50}, {0, 0, 255}, 1); + EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); +} + +TEST(line, thickness) { + cv::line(testEnv.cvSrc, {10, 10}, {20, 20}, {0, 0, 255}, 5); + line(testEnv.mnnSrc, {10, 10}, {20, 20}, {0, 0, 255}, 5); + // cv::imwrite("cv_line.jpg", testEnv.cvSrc); + // imwrite("mnn_line.jpg", testEnv.mnnSrc); + EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); +} // rectangle TEST(rectangle, basic) { @@ -48,5 +70,44 @@ TEST(rectangle, basic) { rectangle(testEnv.mnnSrc, {10, 10}, {200, 300}, {0, 0, 255}, 1); EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); } +// drawContours +TEST(drawContours, basic) { + cv::Mat gray, binary; + cv::cvtColor(testEnv.cvSrc, gray, cv::COLOR_BGR2GRAY); + cv::threshold(gray, binary, 127, 255, cv::THRESH_BINARY); + std::vector> cv_contours; + std::vector hierarchy; + cv::findContours(binary, cv_contours, hierarchy, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE); + cv::drawContours(testEnv.cvSrc, cv_contours, -1, {0, 0, 255}, -1); + std::vector> mnn_contours(cv_contours.size()); + for (int i = 0; i < cv_contours.size(); i++) { + for (int j = 0; j < cv_contours[i].size(); j++) { + Point p; + p.set(cv_contours[i][j].x, cv_contours[i][j].y); + mnn_contours[i].push_back(p); + } + } + drawContours(testEnv.mnnSrc, mnn_contours, -1, {0, 0, 255}, -1); + EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); +} +TEST(fillPoly, basic) { + cv::Mat gray, binary; + cv::cvtColor(testEnv.cvSrc, gray, cv::COLOR_BGR2GRAY); + cv::threshold(gray, binary, 127, 255, cv::THRESH_BINARY); + std::vector> cv_contours; + std::vector hierarchy; + cv::findContours(binary, cv_contours, hierarchy, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE); + cv::fillPoly(testEnv.cvSrc, cv_contours, {0, 0, 255}); + std::vector> mnn_contours(cv_contours.size()); + for (int i = 0; i < cv_contours.size(); i++) { + for (int j = 0; j < cv_contours[i].size(); j++) { + Point p; + p.set(cv_contours[i][j].x, cv_contours[i][j].y); + mnn_contours[i].push_back(p); + } + } + fillPoly(testEnv.mnnSrc, mnn_contours, {0, 0, 255}); + EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); +} #endif diff --git a/tools/cv/test/imgproc/filter_test.cpp b/tools/cv/test/imgproc/filter_test.cpp index a0d6b25c..f5f98785 100644 --- a/tools/cv/test/imgproc/filter_test.cpp +++ b/tools/cv/test/imgproc/filter_test.cpp @@ -10,7 +10,6 @@ #include #include "test_env.hpp" -#define MNN_TEST_FILTER #ifdef MNN_TEST_FILTER static Env testEnv(img_name, true); diff --git a/tools/cv/test/imgproc/geometric_test.cpp b/tools/cv/test/imgproc/geometric_test.cpp index 06b21c0a..811c33ac 100644 --- a/tools/cv/test/imgproc/geometric_test.cpp +++ b/tools/cv/test/imgproc/geometric_test.cpp @@ -10,7 +10,6 @@ #include #include "test_env.hpp" -#define MNN_GEOMETRIC_TEST #ifdef MNN_GEOMETRIC_TEST static Env testEnv(img_name, false); diff --git a/tools/cv/test/imgproc/miscellaneous_test.cpp b/tools/cv/test/imgproc/miscellaneous_test.cpp 
index e80e185a..a429d1cf 100644 --- a/tools/cv/test/imgproc/miscellaneous_test.cpp +++ b/tools/cv/test/imgproc/miscellaneous_test.cpp @@ -10,7 +10,6 @@ #include #include "test_env.hpp" -#define MNN_MISCELLANEOUS_TEST #ifdef MNN_MISCELLANEOUS_TEST static Env testEnv("img_name", true); diff --git a/tools/cv/test/imgproc/structral_test.cpp b/tools/cv/test/imgproc/structral_test.cpp index 1366a7cc..e633a279 100644 --- a/tools/cv/test/imgproc/structral_test.cpp +++ b/tools/cv/test/imgproc/structral_test.cpp @@ -10,7 +10,6 @@ #include #include "test_env.hpp" -#define MNN_STRUCTRAL_TEST #ifdef MNN_STRUCTRAL_TEST static Env testEnv(img_name, false); @@ -28,13 +27,14 @@ static std::vector img = { 0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0 }; -static void cmpContours(std::vector> x, std::vector> y) { +static void cmpContours(std::vector x, std::vector> y) { ASSERT_EQ(x.size(), y.size()); for (int i = 0; i < x.size(); i++) { - ASSERT_EQ(x[i].size(), y[i].size()); - for (int j = 0; j < x[i].size(); j++) { - ASSERT_EQ(x[i][j].fX, (float)y[i][j].x); - ASSERT_EQ(x[i][j].fY, (float)y[i][j].y); + ASSERT_EQ(x[i]->getInfo()->size / 2, y[i].size()); + auto ptr = x[i]->readMap(); + for (int j = 0; j < y[i].size(); j++) { + ASSERT_EQ(ptr[j * 2 + 0], y[i][j].x); + ASSERT_EQ(ptr[j * 2 + 1], y[i][j].y); } } } @@ -81,16 +81,8 @@ TEST(findContours, list_simple) { } TEST(contourArea, basic) { - std::vector cv_contour; - cv_contour.push_back(cv::Point2i(0, 0)); - cv_contour.push_back(cv::Point2i(10, 0)); - cv_contour.push_back(cv::Point2i(10, 10)); - cv_contour.push_back(cv::Point2i(5, 4)); - std::vector mnn_contour; - mnn_contour.push_back({0, 0}); - mnn_contour.push_back({10, 0}); - mnn_contour.push_back({10, 10}); - mnn_contour.push_back({5, 4}); + std::vector cv_contour = { {0, 0}, {10, 0}, {10, 10}, {5, 4}}; + VARP mnn_contour = _Const(cv_contour.data(), {4, 2}, NHWC, halide_type_of()); double x = contourArea(mnn_contour); double y = cv::contourArea(cv_contour); ASSERT_EQ(x, y); @@ -99,7 +91,7 @@ TEST(contourArea, basic) { #define TEST_POINTS { {0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3} } TEST(convexHull, indices) { std::vector cv_contour = TEST_POINTS; - std::vector mnn_contour = TEST_POINTS; + VARP mnn_contour = _Const(cv_contour.data(), {8, 2}, NHWC, halide_type_of()); auto x = convexHull(mnn_contour, false, false); std::vector y; cv::convexHull(cv_contour, y, false, false); @@ -107,7 +99,7 @@ TEST(convexHull, indices) { } TEST(convexHull, pointers) { std::vector cv_contour = TEST_POINTS; - std::vector mnn_contour = TEST_POINTS; + VARP mnn_contour = _Const(cv_contour.data(), {8, 2}, NHWC, halide_type_of()); auto x = convexHull(mnn_contour, false, true); cv::Mat y = cv::Mat(1, 4, CV_32S); cv::convexHull(cv_contour, y, false, true); @@ -117,7 +109,7 @@ TEST(convexHull, pointers) { } TEST(minAreaRect, basic) { std::vector cv_contour = TEST_POINTS; - std::vector mnn_contour = TEST_POINTS; + VARP mnn_contour = _Const(cv_contour.data(), {8, 2}, NHWC, halide_type_of()); auto x = minAreaRect(mnn_contour); auto y = cv::minAreaRect(cv_contour); ASSERT_NEAR(x.center.x, y.center.x, 1e-4); @@ -132,7 +124,7 @@ TEST(minAreaRect, basic) { } TEST(boundingRect, basic) { std::vector cv_contour = TEST_POINTS; - std::vector mnn_contour = TEST_POINTS; + VARP mnn_contour = _Const(cv_contour.data(), {8, 2}, NHWC, halide_type_of()); auto x = boundingRect(mnn_contour); auto y = cv::boundingRect(cv_contour); ASSERT_EQ(x.x, y.x); @@ -155,17 +147,20 @@ TEST(connectedComponentsWithStats, basic) { } 
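// The structural tests below feed MNN the same point set as OpenCV, converted to an {N, 2} integer VARP via _Const, since the API now takes and returns VARPs instead of POINTS.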
TEST(boxPoints, basic) { std::vector cv_contour = TEST_POINTS; - std::vector mnn_contour = TEST_POINTS; + VARP mnn_contour = _Const(cv_contour.data(), {8, 2}, NHWC, halide_type_of()); auto x = minAreaRect(mnn_contour); auto y = cv::minAreaRect(cv_contour); - auto mnn_points = boxPoints(x); + auto _mnn_points = boxPoints(x); cv::Mat _cv_points; cv::boxPoints(y, _cv_points); - auto ptr = reinterpret_cast(_cv_points.data); - std::vector cv_points(4); + auto cvptr = reinterpret_cast(_cv_points.data); + auto mnnptr = _mnn_points->readMap(); + std::vector cv_points(4), mnn_points(4); for (int i = 0; i < 4; i++) { - cv_points[i].fX = ptr[2 * i + 0]; - cv_points[i].fY = ptr[2 * i + 1]; + cv_points[i].fX = cvptr[2 * i + 0]; + cv_points[i].fY = cvptr[2 * i + 1]; + mnn_points[i].fX = mnnptr[2 * i + 0]; + mnn_points[i].fY = mnnptr[2 * i + 1]; } auto comp = [](Point p1, Point p2) { return p1.fX < p2.fX; }; std::sort(mnn_points.begin(), mnn_points.end(), comp); diff --git a/tools/cv/test/test_env.hpp b/tools/cv/test/test_env.hpp index 474c5c42..ee4c3257 100644 --- a/tools/cv/test/test_env.hpp +++ b/tools/cv/test/test_env.hpp @@ -9,6 +9,16 @@ #ifndef TEST_ENV_HPP #define TEST_ENV_HPP +// macro flags for module test +#define MNN_CODECS_TEST +#define MNN_TEST_COLOR +#define MNN_DRAW_TEST +#define MNN_TEST_FILTER +#define MNN_GEOMETRIC_TEST +#define MNN_MISCELLANEOUS_TEST +#define MNN_STRUCTRAL_TEST +#define MNN_DRAW_TEST + #include #include #include diff --git a/tools/quantization/calibration.cpp b/tools/quantization/calibration.cpp index c55de77c..8e76a05f 100644 --- a/tools/quantization/calibration.cpp +++ b/tools/quantization/calibration.cpp @@ -959,7 +959,7 @@ void Calibration::_quantizeModelEMA() { model->setIsTraining(false); exe->gc(Executor::PART); VARP forwardInput = nullptr; - if (originInfo != nullptr) { + if (originInfo != nullptr && originDims.size() > 0) { forwardInput = _Input(originDims, originFormat, originType); } else { if (_inputType == Helper::InputType::IMAGE) { diff --git a/tools/script/formatLicence.py b/tools/script/formatLicence.py index 3723589b..8da0bd64 100644 --- a/tools/script/formatLicence.py +++ b/tools/script/formatLicence.py @@ -13,7 +13,7 @@ ignore_files = [ "CPUFixedPoint.hpp", "OptimizedComputer.hpp", "OptimizedComputer.cpp", "AllShader.h", "AllShader.cpp", "VulkanShaderMap.cpp" ] -all_exts = [".c", ".cpp", ".h", ".hpp", ".m", ".mm", ".s", ".metal"] +all_exts = [".c", ".cpp", ".h", ".hpp", ".m", ".mm", ".s", ".metal", ".cuh", '.cu'] header_template = \ "//\n" + \ diff --git a/tools/script/modelTest.py b/tools/script/modelTest.py index 5cfda8ff..165e6a7a 100755 --- a/tools/script/modelTest.py +++ b/tools/script/modelTest.py @@ -18,7 +18,7 @@ if len(sys.argv) > 5: runStatic = True gWrong = [] -convert = './MNNConvert -f MNN --bizCode MNN --saveStaticModel --modelFile ' +convert = ('MNNConvert.exe' if os.name == 'nt' else './MNNConvert') + ' -f MNN --bizCode MNN --saveStaticModel --modelFile ' tmpModel = '__tmpModel__.mnn' dynamic_size = 0 static_size = 0 diff --git a/tools/train/source/nn/NN.cpp b/tools/train/source/nn/NN.cpp index deb9a2f8..560058b4 100644 --- a/tools/train/source/nn/NN.cpp +++ b/tools/train/source/nn/NN.cpp @@ -11,6 +11,7 @@ #include "module/PipelineModule.hpp" #include "module/WhileModule.hpp" #include "module/IfModule.hpp" +#include "module/NMSModule.hpp" #include "Initializer.hpp" #include "MNN_generated.h" #include "RandomGenerator.hpp" @@ -528,6 +529,9 @@ Module* NN::Utils::ExtractNotRunableOp(Express::EXPRP expr, const std::mapget()->type() == 
OpType_If) { return IfModule::create(expr->get(), subgraphs); } + if (expr->get()->type() == OpType_NonMaxSuppressionV2) { + return NMSModule::create(expr->get()); + } return nullptr; }
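The drawing entry points this patch adds (circle, line, rectangle, drawContours, fillPoly) all rasterize into a Region list and hand it to ImageProcess through doDraw. A minimal usage sketch of that public API, assuming the signatures declared in draw.hpp above, that imread's default flags load a 3-channel uint8 image, and a placeholder path input.jpg:

#include <vector>
#include "cv/imgcodecs.hpp"
#include "cv/imgproc/draw.hpp"

using namespace MNN::CV;

int main() {
    auto img = imread("input.jpg");                     // HWC uint8 VARP, BGR order
    line(img, {10, 10}, {200, 200}, {0, 0, 255}, 2);    // red line, thickness 2
    circle(img, {100, 100}, 30, {0, 255, 0}, -1);       // thickness < 0: filled circle
    rectangle(img, {50, 50}, {150, 120}, {255, 0, 0});  // blue 1px rectangle
    Point a, b, c;
    a.set(20, 180); b.set(80, 140); c.set(120, 190);
    fillPoly(img, {{a, b, c}}, {0, 255, 255});          // filled triangle
    return imwrite("output.jpg", img) ? 0 : 1;
}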