From 0c718e552b2e9b3c6724cb25257c8b96f93d87ec Mon Sep 17 00:00:00 2001 From: xiaying Date: Fri, 18 Feb 2022 11:30:27 +0800 Subject: [PATCH] [Sync] Sync internal Gitlab --- CMakeLists.txt | 103 +- README.md | 12 +- README_CN.md | 14 +- express/CMakeLists.txt | 4 + express/Executor.cpp | 1 + express/Expr.cpp | 23 + express/MathOp.cpp | 8 + express/NeuralNetWorkOp.cpp | 68 ++ express/module/Module.cpp | 18 +- express/module/NMSModule.hpp | 2 +- include/MNN/ImageProcess.hpp | 13 + include/MNN/Interpreter.hpp | 8 +- include/MNN/expr/Executor.hpp | 2 +- include/MNN/expr/MathOp.hpp | 1 + include/MNN/expr/NeuralNetWorkOp.hpp | 10 +- package_scripts/linux/build_whl.sh | 4 +- package_scripts/mac/build_whl.sh | 8 +- package_scripts/win/build_bridge.ps1 | 236 +++-- package_scripts/win/build_lib.ps1 | 160 +-- package_scripts/win/build_tools.ps1 | 81 +- package_scripts/win/build_whl.ps1 | 45 +- project/ios/MNN.xcodeproj/project.pbxproj | 34 +- project/ios/Playground/AppDelegate.mm | 63 +- pymnn/CMakeLists.txt | 74 +- .../examples/MNNEngineDemo/mobilenet_demo.py | 2 +- pymnn/pip_package/MNN/cv/__init__.py | 41 + pymnn/pip_package/MNN/expr/__init__.py | 167 ++- pymnn/pip_package/MNN/numpy/__init__.py | 73 +- pymnn/pip_package/build_deps.py | 9 +- pymnn/pip_package/build_wheel.py | 6 + pymnn/pip_package/setup.py | 110 +- pymnn/src/MNN.cc | 225 ++++- pymnn/src/MNNPyBridge.h | 10 +- pymnn/src/cv.h | 177 +++- pymnn/src/expr.h | 242 ++++- pymnn/src/nn.h | 155 ++- pymnn/src/util.h | 309 +++--- pymnn/test/model_test.py | 7 +- pymnn/test/unit_test.py | 77 +- schema/current/UserDefine_generated.h | 28 +- schema/default/UserDefine.fbs | 1 + source/backend/cpu/CPUBackend.cpp | 2 +- source/backend/cpu/CPUImageProcess.cpp | 62 +- source/backend/cpu/CPUImageProcess.hpp | 12 +- source/backend/cpu/CPUNonMaxSuppressionV2.cpp | 3 + source/backend/cpu/CPUResizeCache.hpp | 9 +- source/backend/cpu/CPUScatterNd.cpp | 9 +- .../cpu/compute/ImageProcessFunction.cpp | 18 + .../cpu/compute/ImageProcessFunction.hpp | 4 + source/backend/cpu/x86_x64/CMakeLists.txt | 86 +- .../_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S | 27 +- ..._AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S | 27 +- .../_AVX_MNNPackedSparseMatMulEpx1EFMA_ASM.S | 53 +- .../_AVX_MNNPackedSparseMatMulEpx4EFMA_ASM.S | 50 +- .../x86_x64/avx512/SparseKernelFunction.hpp | 66 +- .../avx512/SparseKernelFunctionEpx1.cpp | 7 +- .../avx512/SparseKernelFunctionEpx4.cpp | 22 +- .../avx512/SparseKernelFunctionEpx8.cpp | 22 +- .../avx512/_AVX512_MNNGemmFloatUnit16x8.S | 29 +- .../avx512/_AVX512_MNNGemmFloatUnit32x8.S | 29 +- .../avx512/_AVX512_MNNGemmFloatUnit48x8.S | 30 +- .../_AVX512_MNNGemmFloatUnit48x8Fused.S | 56 +- .../_AVX512_MNNPackedSparseMatMulEpx4.S | 90 +- .../x86_x64/avx512/_AVX512_TransposeMain.S | 24 +- .../avxfma/_AVX_MNNGemmFloatUnitMainFMA.S | 25 +- .../avxfma/_AVX_MNNGemmFloatUnitMainFMA6x16.S | 25 +- .../_AVX_MNNGemmFloatUnitMainFMA_Fused.S | 49 + .../_AVX_MNNPackedSparseMatMulEpx1NFMA_ASM.S | 51 +- .../_AVX_MNNPackedSparseMatMulEpx4NFMA_ASM.S | 51 +- source/backend/cuda/CMakeLists.txt | 6 +- source/backend/cuda/core/CUDABackend.cpp | 401 +++++++- source/backend/cuda/core/CUDABackend.hpp | 10 +- .../backend/cuda/core/runtime/CUDARuntime.cpp | 60 +- .../backend/cuda/core/runtime/CUDARuntime.hpp | 56 +- .../cuda/execution/BatchMatMulExecution.cu | 119 --- .../cuda/execution/BatchMatMulExecution.hpp | 23 - .../backend/cuda/execution/BinaryExecution.cu | 7 +- .../cuda/execution/ConvDepthWiseExecution.cu | 584 ++++++----- .../cuda/execution/ConvDepthWiseExecution.hpp | 33 +- 
.../execution/ConvSingleInputExecution.cu | 203 ++-- .../execution/ConvSingleInputExecution.hpp | 6 +- .../execution/DeconvSingleInputExecution.cu | 450 +++++---- .../execution/DeconvSingleInputExecution.hpp | 99 +- source/backend/cuda/execution/ImageColumn.cu | 705 +++++++++++++ source/backend/cuda/execution/ImageColumn.cuh | 24 + .../backend/cuda/execution/InterpExecution.cu | 156 ++- .../cuda/execution/LayerNormExecution.cu | 61 +- .../{CUDALoop.cpp => LoopExecution.cpp} | 60 +- .../backend/cuda/execution/MNNCUDADefine.hpp | 18 + .../cuda/execution/MNNCUDAFunction.cuh | 38 + .../backend/cuda/execution/MatMulExecution.cu | 83 +- .../cuda/execution/MatMulExecution.hpp | 1 + .../backend/cuda/execution/PReLUExecution.cu | 43 +- .../backend/cuda/execution/PReLUExecution.hpp | 4 +- .../backend/cuda/execution/PoolExecution.cu | 257 ++++- source/backend/cuda/execution/Raster.cu | 687 +++++++++---- source/backend/cuda/execution/Raster.cuh | 15 +- .../cuda/execution/RasterExecution.cpp | 448 +++++++-- .../cuda/execution/RasterExecution.hpp | 36 +- .../cuda/execution/ReductionExecution.cu | 172 ++-- .../cuda/execution/ReductionExecution.hpp | 4 + .../cuda/execution/ReductionTemplate.cuh | 93 ++ .../backend/cuda/execution/ScaleExecution.cu | 60 +- .../backend/cuda/execution/ScaleExecution.hpp | 6 +- .../backend/cuda/execution/SelectExecution.cu | 7 +- .../cuda/execution/SoftmaxExecution.cu | 159 ++- .../cuda/execution/SoftmaxExecution.hpp | 16 +- .../backend/cuda/execution/TensorCoreGemm.cu | 219 +++- .../backend/cuda/execution/TensorCoreGemm.cuh | 15 +- .../cuda/execution/TensorCoreGemmPacked.cu | 184 ++++ .../cuda/execution/TensorCoreGemmPacked.cuh | 8 + source/backend/cuda/execution/Transpose.cu | 291 ++++++ source/backend/cuda/execution/Transpose.cuh | 44 + .../backend/cuda/execution/UnaryExecution.cu | 94 +- source/backend/metal/MetalBackend.hpp | 4 +- source/backend/metal/MetalBackend.mm | 4 +- .../opencl/core/runtime/OpenCLWrapper.cpp | 10 +- .../vulkan/component/VulkanMemoryPool.cpp | 4 +- source/common/WinogradInt8Helper.hpp | 2 +- source/core/BufferAllocator.cpp | 20 +- source/core/BufferAllocator.hpp | 22 +- source/core/Interpreter.cpp | 76 +- source/cv/ImageProcess.cpp | 25 +- source/geometry/GeometryGather.cpp | 4 +- source/geometry/GeometryOPRegister.cpp | 2 - source/geometry/GeometrySelect.cpp | 2 +- source/geometry/GeometryShape.cpp | 43 + source/geometry/GeometryStridedSlice.cpp | 26 + source/shape/ShapeRegister.cpp | 2 + source/shape/ShapeReshape.cpp | 4 +- source/shape/ShapeResize.cpp | 8 +- source/shape/ShapeScatterNd.cpp | 2 +- source/shape/ShapeShape.cpp | 28 + source/shape/ShapeStridedSlice.cpp | 9 +- source/shape/ShapeWhere.cpp | 7 +- test.bat | 7 + test.ps1 | 233 +++++ test/CMakeLists.txt | 3 + test/MNNTestSuite.cpp | 8 +- test/MNNTestSuite.h | 5 +- test/core/BackendTest.cpp | 105 +- test/core/BufferAllocatorTest.cpp | 3 +- test/expr/MatMulTest.cpp | 2 +- test/expr/ZeroShapeTest.cpp | 4 +- test/main.cpp | 6 +- test/op/RasterTest.cpp | 43 + test/op/SelectTest.cpp | 19 +- test/op/SortTest.cpp | 92 ++ test/op/StridedSliceTest.cpp | 20 + test/op/UnaryTest.cpp | 2 +- tools/converter/CMakeLists.txt | 20 +- tools/converter/source/onnx/IfOnnx.cpp | 12 +- tools/converter/source/onnx/LoopOnnx.cpp | 4 + tools/converter/source/onnx/onnxConverter.cpp | 4 +- tools/converter/source/optimizer/Program.cpp | 4 + .../source/optimizer/merge/ConvBiasAdd.cpp | 2 +- .../optimizer/merge/ConvertMatMulToConv2D.cpp | 13 +- .../source/optimizer/merge/MergeHelpers.cpp | 3 + 
.../optimizer/merge/TensorConverterMerge.cpp | 2 +- .../source/optimizer/onnxextra/OnnxClip.cpp | 30 +- .../optimizer/onnxextra/OnnxLSTMMerge.cpp | 16 +- .../onnxextra/OnnxNonMaxSuppression.cpp | 15 +- .../onnxextra/OnnxSequenceGRUMerge.cpp | 13 +- .../source/optimizer/passes/Pass.hpp | 1 + .../source/optimizer/passes/PassRegistry.cpp | 8 - .../postconvert/AddTensorFormatConverter.cpp | 3 + .../optimizer/postconvert/ReIndexTensor.cpp | 6 + tools/cpp/MNNV2Basic.cpp | 2 + tools/cpp/backendTest.cpp | 6 +- tools/cpp/testModelWithDescrisbe.cpp | 8 +- tools/cv/CMakeLists.txt | 12 +- tools/cv/include/cv/imgproc/draw.hpp | 8 +- tools/cv/include/cv/imgproc/geometric.hpp | 7 +- tools/cv/include/cv/imgproc/structural.hpp | 12 +- tools/cv/include/cv/types.hpp | 50 +- tools/cv/source/imgcodecs/imgcodecs.cpp | 15 +- tools/cv/source/imgproc/color.cpp | 5 +- tools/cv/source/imgproc/draw.cpp | 951 +++++++++++++++++- tools/cv/source/imgproc/filter.cpp | 2 +- tools/cv/source/imgproc/geometric.cpp | 78 +- tools/cv/source/imgproc/structural.cpp | 119 ++- tools/cv/test/imgcodecs/codecs_test.cpp | 1 - tools/cv/test/imgproc/color_test.cpp | 1 - tools/cv/test/imgproc/draw_test.cpp | 101 +- tools/cv/test/imgproc/filter_test.cpp | 1 - tools/cv/test/imgproc/geometric_test.cpp | 1 - tools/cv/test/imgproc/miscellaneous_test.cpp | 1 - tools/cv/test/imgproc/structral_test.cpp | 47 +- tools/cv/test/test_env.hpp | 10 + tools/quantization/calibration.cpp | 2 +- tools/script/formatLicence.py | 2 +- tools/script/modelTest.py | 2 +- tools/train/source/nn/NN.cpp | 4 + 193 files changed, 9361 insertions(+), 2733 deletions(-) delete mode 100644 source/backend/cuda/execution/BatchMatMulExecution.cu delete mode 100644 source/backend/cuda/execution/BatchMatMulExecution.hpp create mode 100644 source/backend/cuda/execution/ImageColumn.cu create mode 100644 source/backend/cuda/execution/ImageColumn.cuh rename source/backend/cuda/execution/{CUDALoop.cpp => LoopExecution.cpp} (88%) create mode 100644 source/backend/cuda/execution/MNNCUDADefine.hpp create mode 100644 source/backend/cuda/execution/MNNCUDAFunction.cuh create mode 100644 source/backend/cuda/execution/ReductionTemplate.cuh create mode 100644 source/backend/cuda/execution/TensorCoreGemmPacked.cu create mode 100644 source/backend/cuda/execution/TensorCoreGemmPacked.cuh create mode 100644 source/backend/cuda/execution/Transpose.cu create mode 100644 source/backend/cuda/execution/Transpose.cuh create mode 100644 test.bat create mode 100644 test.ps1 create mode 100644 test/op/RasterTest.cpp create mode 100644 test/op/SortTest.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2aaa9656..b7fe8136 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,9 +24,14 @@ add_definitions("-DMNN_VERSION_MAJOR=${MNN_VERSION_MAJOR}") add_definitions("-DMNN_VERSION_MINOR=${MNN_VERSION_MINOR}") add_definitions("-DMNN_VERSION_PATCH=${MNN_VERSION_PATCH}") -# CMP0048 is related to letting CMake managing the package version for us - -cmake_policy(SET CMP0048 NEW) +# Clear VERSION variables when no VERSION is given to project() +if(POLICY CMP0048) + cmake_policy(SET CMP0048 NEW) +endif() +# MSVC runtime library flags are selected by an abstraction. 
+if(POLICY CMP0091)
+  cmake_policy(SET CMP0091 NEW)
+endif()
 project(MNN VERSION ${MNN_VERSION_MAJOR}.${MNN_VERSION_MINOR}.${MNN_VERSION_PATCH}.${MNN_VERSION_BUILD} LANGUAGES C CXX ASM)
 # compiler options
 set(CMAKE_C_STANDARD 99)
@@ -35,14 +40,6 @@ set(CMAKE_MODULE_PATH
   ${CMAKE_MODULE_PATH}
   "${CMAKE_CURRENT_LIST_DIR}/cmake"
 )
-#add_custom_command(OUTPUT "${CMAKE_CURRENT_LIST_DIR}/include/MNN/VCS.h"
-#    COMMAND ${CMAKE_COMMAND} "-DNAMES=MNN"
-#    "-DMNN_SOURCE_DIR=${CMAKE_CURRENT_LIST_DIR}"
-#    "-DHEADER_FILE=${CMAKE_CURRENT_LIST_DIR}/include/MNN/VCS.h"
-#    -P "${CMAKE_CURRENT_LIST_DIR}/cmake/GenerateVersionFromVCS.cmake"
-#    COMMENT "Generating Version Control Info"
-#)
-#add_custom_target (GenVCSHDR DEPENDS "${CMAKE_CURRENT_LIST_DIR}/include/MNN/VCS.h")
 # Required for OpenCL/OpenGL/Vulkan CodeGen
 include(FindPythonInterp REQUIRED)
 # build options
@@ -107,8 +104,8 @@ IF(WIN32)
     SET(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
     SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
-    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4819")
-    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4819")
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4101")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4101")
   ENDIF()
 ENDIF()
@@ -118,13 +115,54 @@ IF( MNN_ENABLE_COVERAGE)
     SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fprofile-arcs -ftest-coverage")
 ENDIF()
+# do this before protobuf, to make sure the WinCRT config of protobuf and MNN is the same
+if(MSVC)
+  # same as protobuf; otherwise the config is inconsistent
+  if(CMAKE_VERSION VERSION_GREATER 3.15 OR CMAKE_VERSION VERSION_EQUAL 3.15)
+    set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded$<$<CONFIG:Debug>:Debug>)
+    if(NOT MNN_WIN_RUNTIME_MT)
+      set(CMAKE_MSVC_RUNTIME_LIBRARY ${CMAKE_MSVC_RUNTIME_LIBRARY}DLL)
+    endif()
+  else()
+    foreach(flag_var
+        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
+        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+      if (MNN_WIN_RUNTIME_MT)
+        if(${flag_var} MATCHES "/MD")
+          string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+        endif()
+      else ()
+        if(${flag_var} MATCHES "/MT")
+          string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
+        endif()
+      endif ()
+    endforeach()
+  endif()
+  set(protobuf_BUILD_SHARED_LIBS ${MNN_BUILD_SHARED_LIBS})
+endif()
+
 include(${CMAKE_CURRENT_LIST_DIR}/cmake/macros.cmake)
 IF(MNN_BUILD_PROTOBUFFER)
   IF(MNN_BUILD_CONVERTER)
+    IF(MSVC)
+      set(protobuf_BUILD_SHARED_LIBS ${MNN_BUILD_SHARED_LIBS})
+      IF((NOT MNN_BUILD_SHARED_LIBS) AND (NOT MNN_WIN_RUNTIME_MT))
+        message(FATAL_ERROR "When MNN_BUILD_CONVERTER=ON and MNN_BUILD_SHARED_LIBS=OFF, MNN_WIN_RUNTIME_MT must be ON.
Because protobuf does not support this config (static /MD)")
+      ENDIF()
+    ENDIF()
     add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/3rd_party/protobuf/cmake)
   ENDIF()
 ENDIF()
+# specify the source file encoding explicitly, to fix cross-platform garbled output;
+# we need to do this after protobuf, which sets a different execution-charset
+IF(MSVC)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /source-charset:utf-8")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /source-charset:utf-8")
+ENDIF()
+
 IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT MNN_BUILD_SHARED_LIBS AND NOT (MSVC OR WIN32))
     SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
     SET(MNN_SEP_BUILD OFF CACHE BOOL "" FORCE)
@@ -206,26 +244,7 @@ message(STATUS "\tThreadPool: ${MNN_USE_THREAD_POOL}")
 message(STATUS "\tHidden: ${MNN_HIDDEN}")
 message(STATUS "\tBuild Path: ${CMAKE_CURRENT_BINARY_DIR}")
-if(MSVC)
-    if(${CMAKE_VERSION} VERSION_LESS "3.14.0")
-        message(FATAL_ERROR "MNN requires CMake 3.14+ to build on Windows!")
-    endif()
-    foreach(flag_var
-        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-        if (MNN_WIN_RUNTIME_MT)
-            if(${flag_var} MATCHES "/MD")
-                string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-            endif()
-        else ()
-            if(${flag_var} MATCHES "/MT")
-                string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
-            endif()
-        endif ()
-    endforeach()
-elseif(CMAKE_SYSTEM_NAME MATCHES "^Android" OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
+if(CMAKE_SYSTEM_NAME MATCHES "^Android" OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
     add_definitions(-fPIC)
 endif()
 if(CMAKE_SYSTEM_NAME MATCHES "^Android")
@@ -561,6 +580,9 @@ if (MNN_INTERNAL)
     target_compile_options(MNN_Express PRIVATE -DMNN_INTERNAL_ENABLED)
     include(${CMAKE_CURRENT_LIST_DIR}/source/internal/auth/CMakeLists.txt)
     include(${CMAKE_CURRENT_LIST_DIR}/source/internal/logging/CMakeLists.txt)
+    if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
+        list(APPEND MNN_EXTRA_DEPENDS "-lcurl -lssl -lcrypto")
+    endif()
 endif()
 
 # Train
@@ -661,7 +683,18 @@ if(APPLE)
 endif()
 add_dependencies(MNN MNNCore MNNCV MNNTransform MNNMath MNNCPU)
 add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tools/converter)
+IF(WIN32 AND MNN_BUILD_CONVERTER AND MNN_BUILD_SHARED_LIBS)
+# Because of dllimport/dllexport, we merge MNN and MNNConvertDeps together, which depends on protobuf
+    target_link_libraries(MNN PUBLIC ${Protobuf_LIBRARIES})
+ENDIF()
+# Merge MNN/MNNExpress/MNNOpenCV and other backends into one .lib/.dll on Windows
 add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tools/cv)
+IF(MNN_BUILD_OPENCV AND NOT MNN_SEP_BUILD)
+    IF(MSVC)
+        target_compile_definitions(MNNOpenCV PRIVATE "-DBUILDING_MNN_DLL" INTERFACE "-DUSING_MNN_DLL")
+    ENDIF()
+    target_sources(MNN PRIVATE $<TARGET_OBJECTS:MNNOpenCV>)
+ENDIF()
 
 if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
     # Using -pthread, needed by the thread-safe implementation of glibc, is better than only using -lpthread
@@ -753,6 +786,10 @@ ELSE()
         ARCHIVE DESTINATION lib
         FRAMEWORK DESTINATION /Library/Frameworks/
     )
+    if (NOT MNN_AAPL_FMWK)
+        INSTALL(FILES ${MNN_PUB_HDRS} DESTINATION include/MNN/)
+        INSTALL(FILES ${MNN_EXPR_PUB_HDRS} DESTINATION include/MNN/expr/)
+    endif()
     FOREACH(HDR ${MNN_EXPR_PUB_HDRS})
         SET_SOURCE_FILES_PROPERTIES(${HDR} PROPERTIES MACOSX_PACKAGE_LOCATION Headers/expr/ )
     ENDFOREACH()
diff --git a/README.md b/README.md
index 7390a09f..11fccb33 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,17 @@ Interpreter consists of Engine and Backends. The former is responsible for the l
 
 Scan the following QR codes to join the DingTalk discussion groups. The group discussions are predominantly in Chinese, but we welcome and will help English speakers.
 
-See https://www.yuque.com/mnn/cn/feedback for dingtalk group barcodes.
+Group #1 (Full):
+
+
+Group #2 (Full):
+
+
+Group #3:
+
 
 ## License
 Apache 2.0
diff --git a/README_CN.md b/README_CN.md
index a857ad24..7ed22ca1 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -56,7 +56,19 @@ Converter由Frontends和Graph Optimize构成。前者负责支持不同的训练
 Interpreter由Engine和Backends构成。前者负责模型的加载、计算图的调度;后者包含各计算设备下的内存分配、Op实现。在Engine和Backends中,MNN应用了多种优化方案,包括在卷积和反卷积中应用Winograd算法、在矩阵乘法中应用Strassen算法、低精度计算、Neon优化、手写汇编、多线程优化、内存复用、异构计算等。
 
 ## 社区交流与反馈
-扫描二维码加入钉钉讨论群,见:https://www.yuque.com/mnn/cn/feedback
+扫描二维码加入钉钉讨论群。
+
+一群(已满):
+
+
+二群(已满):
+
+
+三群:
+
 
 ## License
 Apache 2.0
diff --git a/express/CMakeLists.txt b/express/CMakeLists.txt
index 0b2c3ffd..190c18ca 100644
--- a/express/CMakeLists.txt
+++ b/express/CMakeLists.txt
@@ -18,6 +18,10 @@ IF(MNN_SEP_BUILD)
         add_library(MNN_Express SHARED ${MNN_EXPR_SRCS})
     endif()
     target_link_libraries(MNN_Express MNN)
+    install(TARGETS MNN_Express
+        LIBRARY DESTINATION lib
+        ARCHIVE DESTINATION lib
+    )
 ELSE()
     add_library(MNN_Express OBJECT ${MNN_EXPR_SRCS})
 ENDIF()
diff --git a/express/Executor.cpp b/express/Executor.cpp
index 572b9d10..cef13be7 100644
--- a/express/Executor.cpp
+++ b/express/Executor.cpp
@@ -536,6 +536,7 @@ ErrorCode Executor::ComputeCache::compute() {
     if (mShapeDirty) {
         auto code = resize();
         if (NO_ERROR != code) {
+            mShapeDirty = true;
             return code;
         }
     }
diff --git a/express/Expr.cpp b/express/Expr.cpp
index 201dda10..aca1e571 100644
--- a/express/Expr.cpp
+++ b/express/Expr.cpp
@@ -116,6 +116,9 @@ Variable::Info* Expr::outputInfo(int index) const {
 void Expr::_addLinkForInputs(EXPRP expr) {
     auto inputs = expr->inputs();
     for (int i=0; i<inputs.size(); ++i) {
+        if (inputs[i].get() == nullptr) {
+            continue;
+        }
         bool findEmpty = false;
         auto inputExpr = inputs[i]->mFrom;
         for (int j=0; j<inputExpr->mTo.size(); ++j) {
@@ -290,6 +293,10 @@ bool Expr::requireInfo() {
     }
     for (int i = 0; i < mInputs.size(); ++i) {
         auto& v = mInputs[i];
+        if (v->getInfo()->size == 0) {
+            // zero shape
+            continue;
+        }
         if (mInside->mReq.shapeNeedContent[i]) {
             // For shape need content, the content must not be nullptr
             auto ptr = v->readInternal(true);
@@ -338,6 +345,9 @@ void Expr::replace(EXPRP old, EXPRP from) {
         return;
     }
     for (auto input : old->inputs()) {
+        if (input.get() == nullptr) {
+            continue;
+        }
         for (int j=0; j<input->mFrom->mTo.size(); ++j) {
             auto ref = input->mFrom->mTo[j].lock();
             if (ref.get() == old.get()) {
@@ -346,6 +356,9 @@
         }
     }
     for (auto input : from->inputs()) {
+        if (input.get() == nullptr) {
+            continue;
+        }
         bool hasSet = false;
         for (int j=0; j<input->mFrom->mTo.size(); ++j) {
             auto ref = input->mFrom->mTo[j].lock();
@@ -567,6 +580,9 @@ void Expr::visit(EXPRP expr, const std::function<bool(EXPRP)>& before, const std
         return;
     }
     for (int i = 0; i < expr->inputs().size(); ++i) {
+        if (expr->inputs()[i].get() == nullptr) {
+            continue;
+        }
         visit(expr->inputs()[i]->mFrom, before, after);
     }
     after(expr);
@@ -721,6 +737,9 @@ void Expr::visitOutputs(const std::function<bool(EXPRP, int)>& visit) {
     bool recurse = false;
     auto inputs = expr->inputs();
     for (int i=0; i<inputs.size(); ++i) {
+        if (inputs[i].get() == nullptr) {
+            continue;
+        }
         if (inputs[i]->mFrom.get() == this) {
             recurse = recurse || visit(expr, i);
         }
@@ -924,6 +943,10 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
         op->name = expr->name();
         op->inputIndexes.resize(expr->inputs().size());
         for (int i = 0; i < op->inputIndexes.size(); ++i) {
+            if (expr->inputs()[i] == nullptr) {
+                op->inputIndexes[i] = -1;
+                continue;
+            }
             auto inputExpr = expr->inputs()[i]->expr();
             op->inputIndexes[i] = varIndexInfo[inputExpr.first] + inputExpr.second;
         }
     }
diff --git a/express/MathOp.cpp b/express/MathOp.cpp
index a6f83919..db97e0e9 100644
--- a/express/MathOp.cpp
+++ b/express/MathOp.cpp
@@ -1119,6 +1119,14 @@ VARP _ScatterNd(VARP indices, VARP updates, VARP shape) {
     return (Variable::create(Expr::create(std::move(op), {indices, updates, shape})));
 }
+VARP _ScatterNd(VARP indices, VARP updates, VARP shape, VARP input) {
+    std::unique_ptr<OpT> op(new OpT);
+    op->main.type = OpParameter_NONE;
+    op->type = OpType_ScatterNd;
+    op->main.value = nullptr;
+    return (Variable::create(Expr::create(std::move(op), {indices, updates, shape, input})));
+}
+
 VARP _OneHot(VARP indices, VARP depth, VARP onValue, VARP offValue, int axis) {
     std::unique_ptr<OpT> op(new OpT);
     op->type = OpType_OneHot;
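The four-input _ScatterNd overload above scatters `updates` into a copy of `input` rather than into a zero-initialized tensor of the given shape. A minimal usage sketch — values, shapes, and the ONNX-style ScatterND semantics are illustrative assumptions, not taken from the patch:

#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

int main() {
    int   idx[]  = {1, 3};                   // scatter into rows 1 and 3
    float upd[]  = {9.0f, 10.0f};            // values to write
    int   shp[]  = {4};                      // output shape: [4]
    float base[] = {1.0f, 2.0f, 3.0f, 4.0f}; // base tensor the rest is copied from
    auto indices = _Const(idx, {2, 1}, NCHW, halide_type_of<int>());
    auto updates = _Const(upd, {2}, NCHW);
    auto shape   = _Const(shp, {1}, NCHW, halide_type_of<int>());
    auto input   = _Const(base, {4}, NCHW);
    auto out     = _ScatterNd(indices, updates, shape, input);
    auto ptr     = out->readMap<float>();    // assumed result: {1, 9, 3, 10}
    return 0;
}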
diff --git a/express/NeuralNetWorkOp.cpp b/express/NeuralNetWorkOp.cpp
index d45c969c..01f38a97 100644
--- a/express/NeuralNetWorkOp.cpp
+++ b/express/NeuralNetWorkOp.cpp
@@ -581,6 +581,22 @@ VARP _StridedSlice(VARP input, VARP begin, VARP end, VARP strided, int32_t begin
     op->main.AsStridedSliceParam()->shrinkAxisMask = shrinkAxisMask;
     return (Variable::create(Expr::create(op.get(), {input, begin, end, strided})));
 }
+
+VARP _StridedSliceWrite(VARP input, VARP begin, VARP end, VARP strided, VARP write, int32_t beginMask,
+                        int32_t endMask, int32_t ellipsisMask, int32_t newAxisMask, int32_t shrinkAxisMask) {
+    std::unique_ptr<OpT> op(new OpT);
+    op->type = OpType_StridedSlice;
+    op->main.type = OpParameter_StridedSliceParam;
+    op->main.value = new StridedSliceParamT;
+
+    op->main.AsStridedSliceParam()->T = DataType_DT_FLOAT;
+    op->main.AsStridedSliceParam()->beginMask = beginMask;
+    op->main.AsStridedSliceParam()->endMask = endMask;
+    op->main.AsStridedSliceParam()->ellipsisMask = ellipsisMask;
+    op->main.AsStridedSliceParam()->newAxisMask = newAxisMask;
+    op->main.AsStridedSliceParam()->shrinkAxisMask = shrinkAxisMask;
+    return (Variable::create(Expr::create(op.get(), {input, begin, end, strided, write})));
+}
 /*Transposes x.
 Args:
 x: A variable.
@@ -1830,5 +1846,57 @@ VARP _Where(VARP x) {
     return (Variable::create(Expr::create(std::move(op), {x})));
 }
+VARP _Sort(VARP x, int axis, bool arg, bool descend) {
+    std::unique_ptr<OpT> op(new OpT);
+    op->type = OpType_TopKV2;
+    op->main.type = OpParameter_TopKV2;
+    auto topk = new TopKV2T;
+    topk->largest = descend;
+    op->main.value = topk;
+    auto shape = x->getInfo()->dim;
+    axis = axis < 0 ? shape.size() + axis : axis;
+    int k = x->getInfo()->dim[axis];
+    std::vector<VARP> inputs {x, _Scalar(k)};
+    if (axis + 1 != shape.size()) {
+        inputs.push_back(_Scalar(axis));
+    }
+    auto expr = Expr::create(op.get(), inputs, 2);
+    return Variable::create(expr, arg);
+}
+
+VARP _Raster(const std::vector<VARP>& vars, const std::vector<int>& region, const std::vector<int>& shape) {
+    std::unique_ptr<MNN::OpT> op(new MNN::OpT);
+    op->type = OpType_Raster;
+    auto extra = new ExtraT;
+    // set shape
+    std::unique_ptr<AttributeT> shapeAttr(new AttributeT);
+    shapeAttr->key = "shape";
+    shapeAttr->list.reset(new ListValueT);
+    shapeAttr->list->i = shape;
+    extra->attr.push_back(std::move(shapeAttr));
+    // set region
+    std::unique_ptr<AttributeT> regionAttr(new AttributeT);
+    regionAttr->key = "region";
+    regionAttr->list.reset(new ListValueT);
+    regionAttr->list->i = region;
+    extra->attr.push_back(std::move(regionAttr));
+    op->main.type = OpParameter_Extra;
+    op->main.value = extra;
+    return (Variable::create(Expr::create(std::move(op), vars)));
+}
+
+VARP _Nms(VARP boxes, VARP scores, int maxDetections, float iouThreshold, float scoreThreshold) {
+    std::unique_ptr<MNN::OpT> op(new MNN::OpT);
+    op->type = OpType_NonMaxSuppressionV2;
+    std::vector<VARP> vars {boxes, scores, _Scalar(maxDetections)};
+    if (iouThreshold >= 0) {
+        vars.push_back(_Scalar(iouThreshold));
+    }
+    if (scoreThreshold >= 0) {
+        vars.push_back(_Scalar(scoreThreshold));
+    }
+    return (Variable::create(Expr::create(std::move(op), vars)));
+}
+
 } // namespace Express
 } // namespace MNN
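A sketch of how the new _Sort and _Nms entry points are meant to be called, following the default arguments declared in NeuralNetWorkOp.hpp further below; the data values are made up for illustration:

#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

int main() {
    float vals[] = {3.0f, 1.0f, 2.0f, 4.0f};
    auto x = _Const(vals, {4}, NCHW);
    auto ascending  = _Sort(x);                  // values in ascending order
    auto argsorted  = _Sort(x, -1, true);        // arg=true selects the TopKV2 index output
    auto descending = _Sort(x, -1, false, true); // descend=true sets topk->largest

    // _Nms: boxes [N, 4], scores [N]; negative thresholds mean "not set",
    // matching the iouThreshold/scoreThreshold >= 0 checks above.
    float boxes[]  = {0, 0, 10, 10,  1, 1, 11, 11,  50, 50, 60, 60};
    float scores[] = {0.9f, 0.8f, 0.7f};
    auto boxVar    = _Const(boxes, {3, 4}, NCHW);
    auto scoreVar  = _Const(scores, {3}, NCHW);
    auto kept      = _Nms(boxVar, scoreVar, 2, 0.5f); // indices of kept boxes
    auto keptPtr   = kept->readMap<int>();
    return 0;
}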
diff --git a/express/module/Module.cpp b/express/module/Module.cpp
index 17f80b35..af547017 100644
--- a/express/module/Module.cpp
+++ b/express/module/Module.cpp
@@ -166,7 +166,8 @@ public:
         return mModule->onForward(inputs);
     }
     virtual Module* clone(CloneContext* ctx) const override {
-        NetModule* module(new NetModule(mModule, mInfo));
+        std::shared_ptr<Module> submodule(mModule->clone(ctx));
+        NetModule* module(new NetModule(submodule, mInfo));
         return this->cloneBaseTo(ctx, module);
     }
     const Module::Info* info() const {
@@ -223,9 +224,9 @@ static void _loadInputs(Module::Info* info, const std::vector<std::string>& inpu
     }
 }
-Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const std::shared_ptr<Executor::RuntimeManager> rtMgr, const Module::Config* config) {
+Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const std::shared_ptr<Executor::RuntimeManager> _rtMgr, const Module::Config* config) {
     // Check if runtime is valid
-    if (nullptr != rtMgr && rtMgr->getRuntimeInfo().first.empty()) {
+    if (nullptr != _rtMgr && _rtMgr->getRuntimeInfo().first.empty()) {
         MNN_ERROR("Invalid runtime\n");
         return nullptr;
     }
@@ -269,6 +270,17 @@ Module* Module::load(const std::vector<std
     std::shared_ptr<Info> info(new Info);
+    auto rtMgr = _rtMgr;
+    Module::Config defaultConfig;
+    if (nullptr == config) {
+        config = &defaultConfig;
+    }
+    if(nullptr == rtMgr && config->backend != nullptr) {
+        ScheduleConfig sche_config;
+        sche_config.type = config->backend->type;
+        sche_config.backendConfig = config->backend->config;
+        rtMgr.reset(Executor::RuntimeManager::createRuntimeManager(sche_config));
+    }
     if ((!inputs.empty()) && (!outputs.empty())) {
         _loadInputs(info.get(), inputs, net);
         info->runTimeManager = rtMgr;
diff --git a/express/module/NMSModule.hpp b/express/module/NMSModule.hpp
index 5d5cbf4b..7b0b92af 100644
--- a/express/module/NMSModule.hpp
+++ b/express/module/NMSModule.hpp
@@ -16,7 +16,7 @@ public:
         // Do nothing
     }
     virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
-    static NMSModule* create(const Op* op);
+    MNN_PUBLIC static NMSModule* create(const Op* op);
 private:
     NMSModule(){}
diff --git a/include/MNN/ImageProcess.hpp b/include/MNN/ImageProcess.hpp
index 03f3b6b0..4c0af907 100644
--- a/include/MNN/ImageProcess.hpp
+++ b/include/MNN/ImageProcess.hpp
@@ -61,6 +61,7 @@ public:
         /** edge wrapper */
         Wrap wrap = CLAMP_TO_EDGE;
+        bool draw = false;
     };
 
 public:
@@ -148,6 +149,18 @@ public:
     void setPadding(uint8_t value) {
         mPaddingValue = value;
     }
+    /**
+     * @brief draw color into regions of the image.
+     * @param img the image to draw on.
+     * @param w the image's width.
+     * @param h the image's height.
+     * @param c the image's channel count.
+     * @param regions the regions to draw; size is [num * 3], containing num triples of { y, xl, xr }.
+     * @param num the number of regions.
+     * @param color the color to draw.
+     * @return void.
+     */
+    void draw(uint8_t* img, int w, int h, int c, const int* regions, int num, const uint8_t* color);
 private:
     ImageProcess(const Config& config);
     Matrix mTransform;
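A sketch of the new ImageProcess::draw call declared above. The region layout follows the { y, xl, xr } triples from the doc comment; treating xr as inclusive, and calling draw on a default-configured ImageProcess, are assumptions here:

#include <MNN/ImageProcess.hpp>
#include <memory>
#include <vector>
using namespace MNN::CV;

int main() {
    const int w = 8, h = 8, c = 3;
    std::vector<uint8_t> img(w * h * c, 0);  // black RGB canvas
    ImageProcess::Config config;
    config.sourceFormat = RGB;
    config.destFormat   = RGB;
    std::shared_ptr<ImageProcess> process(ImageProcess::create(config));
    int regions[] = {
        2, 1, 5,   // row y=2, columns 1..5
        3, 0, 7,   // row y=3, columns 0..7
    };
    const uint8_t red[] = {255, 0, 0};
    process->draw(img.data(), w, h, c, regions, 2, red);
    return 0;
}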
diff --git a/include/MNN/Interpreter.hpp b/include/MNN/Interpreter.hpp
index 088ab8da..a30c9d06 100644
--- a/include/MNN/Interpreter.hpp
+++ b/include/MNN/Interpreter.hpp
@@ -154,7 +154,7 @@ public:
      * @param keySize deprecated, for future use.
      */
     void setCacheFile(const char* cacheFile, size_t keySize = 128);
-    
+
     /**
      * @brief The API should be called after the last resize session.
      * If the resize session generated new cache info, try to rewrite the cache file.
@@ -357,6 +357,12 @@ public:
      */
     const char* bizCode() const;
 
+    /**
+     * @brief get model UUID
+     * @return Model UUID.
+     */
+    const char* uuid() const;
+
 private:
     static Interpreter* createFromBufferInternal(Content* net);
diff --git a/include/MNN/expr/Executor.hpp b/include/MNN/expr/Executor.hpp
index f3be7ec7..34e731db 100644
--- a/include/MNN/expr/Executor.hpp
+++ b/include/MNN/expr/Executor.hpp
@@ -70,7 +70,7 @@ public:
         return mDebug.get();
     }
     struct Cache;
-    class RuntimeManager {
+    class MNN_PUBLIC RuntimeManager {
     public:
         ~RuntimeManager();
         /**
diff --git a/include/MNN/expr/MathOp.hpp b/include/MNN/expr/MathOp.hpp
index d4b4e93a..7cf9fc0f 100644
--- a/include/MNN/expr/MathOp.hpp
+++ b/include/MNN/expr/MathOp.hpp
@@ -124,6 +124,7 @@ MNN_PUBLIC VARP _ArgMin(VARP input, int axis = 0);
 MNN_PUBLIC VARP _BatchMatMul(VARP x, VARP y, bool adj_x = false, bool adj_y = false);
 MNN_PUBLIC VARP _UnravelIndex(VARP indices, VARP dims);
 MNN_PUBLIC VARP _ScatterNd(VARP indices, VARP updates, VARP shape);
+MNN_PUBLIC VARP _ScatterNd(VARP indices, VARP updates, VARP shape, VARP input);
 MNN_PUBLIC VARP _OneHot(VARP indices, VARP depth, VARP onValue, VARP offValue, int axis = -1);
 MNN_PUBLIC VARP _BroadcastTo(VARP a, VARP shape);
 MNN_PUBLIC VARP _LinSpace(VARP start, VARP stop, VARP num);
diff --git a/include/MNN/expr/NeuralNetWorkOp.hpp b/include/MNN/expr/NeuralNetWorkOp.hpp
index 567d3892..d019b851 100644
--- a/include/MNN/expr/NeuralNetWorkOp.hpp
+++ b/include/MNN/expr/NeuralNetWorkOp.hpp
@@ -63,8 +63,11 @@ MNN_PUBLIC VARP _Softsign(VARP features);
 MNN_PUBLIC std::vector<VARP> _Split(VARP value, INTS size_splits, int axis = 0);
 MNN_PUBLIC VARP _Slice(VARP x, VARP starts, VARP sizes);
 MNN_PUBLIC VARP _StridedSlice(VARP input, VARP begin, VARP end, VARP strided,
-                              int32_t beginMask, int32_t endMask, int32_t ellipsisMask,
-                              int32_t newAxisMask, int32_t shrinkAxisMask);
+                              int32_t beginMask, int32_t endMask, int32_t ellipsisMask,
+                              int32_t newAxisMask, int32_t shrinkAxisMask);
+MNN_PUBLIC VARP _StridedSliceWrite(VARP input, VARP begin, VARP end, VARP strided, VARP write,
+                                   int32_t beginMask, int32_t endMask, int32_t ellipsisMask,
+                                   int32_t newAxisMask, int32_t shrinkAxisMask);
 MNN_PUBLIC VARP _Concat(VARPS values, int axis);
 MNN_PUBLIC VARP _Convert(VARP input, Dimensionformat format);
 MNN_PUBLIC VARP _Transpose(VARP x, INTS perm);
@@ -155,6 +158,9 @@ MNN_PUBLIC VARP _Select(VARP select, VARP input0, VARP input1);
 MNN_PUBLIC std::vector<VARP> _TopKV2(VARP input0, VARP input1);
 MNN_PUBLIC VARP _ImageProcess(VARP input, CV::ImageProcess::Config config, CV::Matrix matrix, int oh, int ow, int oc, int dtype, uint8_t padVal = 0);
 MNN_PUBLIC VARP _Where(VARP x);
+MNN_PUBLIC VARP _Sort(VARP x, int axis = -1, bool arg = false, bool descend = false);
+MNN_PUBLIC VARP _Raster(const std::vector<VARP>& vars, const std::vector<int>& regions, const std::vector<int>& shape);
+MNN_PUBLIC VARP _Nms(VARP boxes, VARP scores, int maxDetections, float iouThreshold = -1, float scoreThreshold = -1);
 } // namespace Express
 } // namespace MNN
diff --git a/package_scripts/linux/build_whl.sh b/package_scripts/linux/build_whl.sh
index 1157cb48..ae3c04b7 100755
--- a/package_scripts/linux/build_whl.sh
+++ b/package_scripts/linux/build_whl.sh
@@ -21,13 +21,13 @@ done
 rm -rf $path && mkdir -p $path
 PACKAGE_PATH=$(realpath $path)
-CMAKE_ARGS="-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON"
+CMAKE_ARGS="-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON"
 if [ ! -z $opencl ]; then
     CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
 fi
 rm -rf pymnn_build && mkdir pymnn_build
 pushd pymnn_build
-cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j24
+cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert MNNOpenCV -j24
 popd
 
 pushd pymnn/pip_package
diff --git a/package_scripts/mac/build_whl.sh b/package_scripts/mac/build_whl.sh
index 82356967..a24552a1 100755
--- a/package_scripts/mac/build_whl.sh
+++ b/package_scripts/mac/build_whl.sh
@@ -19,25 +19,27 @@ while getopts "o:p:v:b" opt; do
     esac
 done
 
+export MACOSX_DEPLOYMENT_TARGET=10.11
+
 ./schema/generate.sh
 
 rm -rf $path && mkdir -p $path
 PACKAGE_PATH=$(realpath $path)
-CMAKE_ARGS="-DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_EXPR_SHAPE_EAGER=ON -DMNN_TRAIN_DEBUG=ON"
+CMAKE_ARGS="-DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_EXPR_SHAPE_EAGER=ON -DMNN_TRAIN_DEBUG=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON"
 if [ ! -z $opencl ]; then
     CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
 fi
 rm -rf pymnn_build && mkdir pymnn_build
 pushd pymnn_build
-cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j8
+cmake $CMAKE_ARGS ..
&& make MNN MNNTrain MNNConvert MNNOpenCV -j8 popd pushd pymnn/pip_package echo -e "__version__ = '$mnn_version'" > MNN/version.py rm -rf build && mkdir build rm -rf dist && mkdir dist -if [ -z $python_versions ]; then +if [ -z "$python_versions" ]; then python build_wheel.py --version $mnn_version else for env in $python_versions; do diff --git a/package_scripts/win/build_bridge.ps1 b/package_scripts/win/build_bridge.ps1 index ef32b7f7..465efc7f 100644 --- a/package_scripts/win/build_bridge.ps1 +++ b/package_scripts/win/build_bridge.ps1 @@ -1,66 +1,63 @@ # MNNPyBridge -# |-- Debug -# | |--- MD -# | |--- MT -# | |--- Static -# | -# |-- Release -# |--- MD -# |--- MT -# |--- Static +# |-- include +# |-- wrapper +# |-- test (Release + Dynamic + MD) +# |-- x64 +# |-- x86 +# |-- lib +# |-- x64 +# | |-- (Debug/Release x Dynamic/Static x MD/MT) +# | +# |-- x86 +# |-- (Debug/Release x Dynamic/Static x MD/MT) Param( [Parameter(Mandatory=$true)][String]$version, [Parameter(Mandatory=$true)][String]$pyc_env, [Parameter(Mandatory=$true)][String]$mnn_path, + [Parameter(Mandatory=$true)][String]$python_path, + [Parameter(Mandatory=$true)][String]$numpy_path, [Parameter(Mandatory=$true)][String]$path, + [Switch]$train_api, [Switch]$x86 ) -# build process may failed because of lnk1181, but be success when run again -# Run expr, return if success, otherwise try again until try_times -function Retry([String]$expr, [Int]$try_times) { - $cnt = 0 - do { - $cnt++ - try { - Invoke-Expression $expr - return - } catch { } - } while($cnt -lt $try_times) - throw "Failed: $expr" +# build it according to cmake_cmd, exit 1 when any error occur +function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja") { + Invoke-Expression $cmake_cmd + # build process may failed because of lnk1181, but be success when run again + $try_times = 2 + if ($LastExitCode -eq 0) { + For ($cnt = 0; $cnt -lt $try_times; $cnt++) { + try { + Invoke-Expression $ninja_cmd + if ($LastExitCode -eq 0) { + return + } + } catch {} + } + } + popd + exit 1 } $erroractionpreference = "stop" +mkdir -p $path -ErrorAction Ignore $PACKAGE_PATH = $(Resolve-Path $path).Path -$PACKAGE_LIB_PATH = "$PACKAGE_PATH\lib" -if ($x86) { - $PACKAGE_LIB_PATH = "$PACKAGE_LIB_PATH\x86" -} else { - $PACKAGE_LIB_PATH = "$PACKAGE_LIB_PATH\x64" -} -$MNN_PACKAGE_PATH = $(Resolve-Path $mnn_path).Path - -pushd pymnn\3rd_party -Remove-Item MNN -Recurse -ErrorAction Ignore -mkdir -p MNN\lib -cp -r $MNN_PACKAGE_PATH\* MNN\lib -cp -r ..\..\include MNN -popd +$arch = $(If($x86) {"x86"} Else {"x64"}) +$PACKAGE_LIB_PATH = "$PACKAGE_PATH/lib/$arch" +$TEST_TOOL_PATH = "$PACKAGE_PATH/test/$arch" #clear and create package directory powershell ./schema/generate.ps1 pushd $PACKAGE_PATH -Remove-Item include -Recurse -ErrorAction Ignore -Remove-Item wrapper -Recurse -ErrorAction Ignore -mkdir -p include -mkdir -p wrapper -mkdir -p $PACKAGE_LIB_PATH\Debug\MD -ErrorAction SilentlyContinue -mkdir -p $PACKAGE_LIB_PATH\Debug\MT -ErrorAction SilentlyContinue -mkdir -p $PACKAGE_LIB_PATH\Debug\Static -ErrorAction SilentlyContinue -mkdir -p $PACKAGE_LIB_PATH\Release\MD -ErrorAction SilentlyContinue -mkdir -p $PACKAGE_LIB_PATH\Release\MT -ErrorAction SilentlyContinue -mkdir -p $PACKAGE_LIB_PATH\Release\Static -ErrorAction SilentlyContinue +Remove-Item -Path include, wrapper -Recurse -ErrorAction Ignore +mkdir -p include, wrapper +popd +Remove-Item -Path $PACKAGE_LIB_PATH, $TEST_TOOL_PATH -Recurse -ErrorAction Ignore +mkdir -p $PACKAGE_LIB_PATH, $TEST_TOOL_PATH +pushd $PACKAGE_LIB_PATH +mkdir -p 
Debug\Dynamic\MD, Debug\Dynamic\MT, Debug\Static\MD, Debug\Static\MT, Release\Dynamic\MD, Release\Dynamic\MT, Release\Static\MD, Release\Static\MT popd # assume $PACKAGE_PATH exist @@ -71,8 +68,16 @@ cp -r pymnn\pip_package\MNN pymnn_pyc_tmp pushd pymnn_pyc_tmp Remove-Item MNN -Include __pycache__ -Recurse pushd MNN -rm -r -force tools -(Get-Content __init__.py).replace('from . import tools', '') | Set-Content __init__.py +function Remove([String]$module) { + rm -r -force $module + (Get-Content __init__.py).replace("from . import $module", "") | Set-Content __init__.py +} +Remove "tools" +if (!$train_api) { + Remove "data" + Remove "optim" +} + popd popd conda activate $pyc_env @@ -83,59 +88,108 @@ Set-Content -Path pymnn_pyc_tmp\version.py -Value "__version__ = '$version'" cp -r .\pymnn_pyc_tmp\* $PACKAGE_PATH\wrapper -Force rm -r -force pymnn_pyc_tmp -$CMAKE_ARGS = "-DPYMNN_USE_ALINNPYTHON=ON -DPYMNN_RUNTIME_CHECK_VM=ON -DPYMNN_EXPR_API=ON -DPYMNN_NUMPY_USABLE=ON -DPYMNN_TRAIN_API=ON" +$mnn_path = $(Resolve-Path $mnn_path).Path +$python_path = $(Resolve-Path $python_path).Path +$numpy_path = $(Resolve-Path $numpy_path).Path + +$CMAKE_ARGS = "-DPYMNN_USE_ALINNPYTHON=ON -DPYMNN_RUNTIME_CHECK_VM=ON -DPYMNN_EXPR_API=ON -DPYMNN_NUMPY_USABLE=ON -DPYMNN_BUILD_TEST=OFF" +if ($train_api) { + $CMAKE_ARGS = "$CMAKE_ARGS -DPYMNN_TRAIN_API=ON" +} +$CMAKE_ARGS = "$CMAKE_ARGS -Dmnn_path=$mnn_path -Dpython_path=$python_path -Dnumpy_path=$numpy_path" Remove-Item pymnn_build -Recurse -ErrorAction Ignore mkdir pymnn_build pushd pymnn_build -##### Debug/MT #### -#Remove-Item CMakeCache.txt -ErrorAction Ignore -#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON ../pymnn" -#Retry "ninja" 2 -#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\MT -#cp mnnpybridge.dll $PACKAGE_LIB_PATH\Debug\MT -#cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\MT -#rm mnnpybridge.* +function exist([String]$build_type, [String]$lib_type, [String]$crt_type) { + function _exist([String]$lib) { + $lib_dir = "$lib/lib/$arch/$build_type/$lib_type/$crt_type" + return $((Test-Path -Path $lib_dir) -and ((Get-ChildItem -Path "$lib_dir/*" -Include "*.lib").Count -ne 0)) + } + return $((_exist $mnn_path) -and (_exist $python_path) -and (_exist $numpy_path)) +} -##### Debug/MD #### -#Remove-Item CMakeCache.txt -ErrorAction Ignore -#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF ../pymnn" -#Retry "ninja" 2 -#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\MD -#cp mnnpybridge.dll $PACKAGE_LIB_PATH\Debug\MD -#cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\MD -#rm mnnpybridge.* +function log([String]$msg) { + echo "================================" + echo "Build MNNPyBridge $msg" + echo "================================" +} -##### Debug/Static #### -#Remove-Item CMakeCache.txt -ErrorAction Ignore -#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn" -#Retry "ninja" 2 -#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\Static -#rm mnnpybridge.* +##### Debug/Dynamic/MT #### +if (exist Debug Dynamic MT) { + log "Debug/Dynamic/MT" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON ../pymnn" + cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\MT + rm mnnpybridge.* +} -##### Release/MT #### -#Remove-Item CMakeCache.txt -ErrorAction Ignore -#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS 
-DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON ../pymnn" -#Retry "ninja" 2 -#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\MT -#cp mnnpybridge.dll $PACKAGE_LIB_PATH\Release\MT -#cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\MT -#rm mnnpybridge.* +##### Debug/Dynamic/MD #### +if (exist Debug Dynamic MD) { + log "Debug/Dynamic/MD" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug ../pymnn" + cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\MD + rm mnnpybridge.* +} -##### Release/MD #### -Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF ../pymnn" -Retry "ninja" 2 -cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\MD -cp mnnpybridge.dll $PACKAGE_LIB_PATH\Release\MD -cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\MD -rm mnnpybridge.* +##### Debug/Static/MT #### +if (exist Debug Static MT) { + log "Debug/Static/MT" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn" + cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\Static\MT + rm mnnpybridge.* +} -##### Release/Static #### -#Remove-Item CMakeCache.txt -ErrorAction Ignore -#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn" -#Retry "ninja" 2 -#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\Static +##### Debug/Static/MD #### +if (exist Debug Static MD) { + log "Debug/Static/MD" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn" + cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\Static\MD + rm mnnpybridge.* +} + +##### Release/Dynamic/MT #### +if (exist Release Dynamic MT) { + log "Release + MT" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON ../pymnn" + cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MT + rm mnnpybridge.* +} + +##### Release/Dynamic/MD #### +if (exist Release Dynamic MD) { + log "Release + MD" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release ../pymnn" + cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MD + #cp mnnpybridge_test.exe $TEST_TOOL_PATH + #cp $mnn_path/lib/$arch/Release/MD/MNN.dll $TEST_TOOL_PATH + #cp $python_path/lib/$arch/Release/MD/python.dll $TEST_TOOL_PATH + #cp $numpy_path/lib/$arch/Release/MD/numpy_python.dll $TEST_TOOL_PATH + rm mnnpybridge.* +} + +##### Release/Static/MT #### +if (exist Release Static MT) { + log "Release/Static/MT" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn" + cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\Static\MT + rm mnnpybridge.* +} + +##### Release/Static/MD #### +if (exist Release Static MD) { + log "Release/Static/MD" + Remove-Item CMakeCache.txt -ErrorAction Ignore + Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn" + cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\Static\MD + rm mnnpybridge.* +} popd \ No newline at end of file diff --git a/package_scripts/win/build_lib.ps1 b/package_scripts/win/build_lib.ps1 index a899ee1c..ae24da31 
100644 --- a/package_scripts/win/build_lib.ps1 +++ b/package_scripts/win/build_lib.ps1 @@ -1,49 +1,47 @@ # MNN -# |-- Debug -# | |--- MD -# | |--- MT -# | |--- Static -# | -# |-- Release -# |--- MD -# |--- MT -# |--- Static +# |-- include +# |-- lib +# |-- Debug +# | |--- Dynamic +# | | |--- MD +# | | |--- MT +# | | +# | |--- Static +# | |--- MD +# | |--- MT +# | +# |-- Release +# |--- Dynamic +# | |--- MD +# | |--- MT +# | +# |--- Static +# |--- MD +# |--- MT +# Param( [Parameter(Mandatory=$true)][String]$path, - [String]$backends + [String]$backends, + [Switch]$x86 ) -# build process may failed because of lnk1181, but be success when run again -# Run expr, return if success, otherwise try again until try_times -function Retry([String]$expr, [Int]$try_times) { - $cnt = 0 - do { - $cnt++ - try { - Invoke-Expression $expr - return - } catch { } - } while($cnt -lt $try_times) - throw "Failed: $expr" -} - $erroractionpreference = "stop" -Remove-Item $path -Recurse -ErrorAction Ignore -mkdir -p $path +New-Item -Path $path -ItemType Directory -ErrorAction Ignore $PACKAGE_PATH = $(Resolve-Path $path).Path +$PACKAGE_LIB_PATH = "$PACKAGE_PATH/lib/$(If ($x86) {"x86"} Else {"x64"})" +Remove-Item -Path $PACKAGE_LIB_PATH -Recurse -ErrorAction Ignore +mkdir -p $PACKAGE_LIB_PATH #clear and create package directory powershell ./schema/generate.ps1 -pushd $PACKAGE_PATH -mkdir -p Debug\MD -mkdir -p Debug\MT -mkdir -p Debug\Static -mkdir -p Release\MD -mkdir -p Release\MT -mkdir -p Release\Static +Remove-Item -Path $PACKAGE_PATH/include -Recurse -ErrorAction Ignore +cp -r include $PACKAGE_PATH +cp -r tools/cv/include/cv $PACKAGE_PATH/include +pushd $PACKAGE_LIB_PATH +mkdir -p Debug\Dynamic\MD, Debug\Dynamic\MT, Debug\Static\MD, Debug\Static\MT, Release\Dynamic\MD, Release\Dynamic\MT, Release\Static\MD, Release\Static\MT popd -$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON" +$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_AVX512=ON" if ($backends -ne $null) { Foreach ($backend in $backends.Split(",")) { if ($backend -eq "opencl") { @@ -58,53 +56,83 @@ Remove-Item build -Recurse -ErrorAction Ignore mkdir build pushd build -##### Debug/MT #### +function log([String]$msg) { + echo "================================" + echo "Build MNN (CPU $backends) $msg" + echo "================================" +} + +# build it according to cmake_cmd, exit 1 when any error occur +function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja MNN") { + Invoke-Expression $cmake_cmd + # build process may failed because of lnk1181, but be success when run again + $try_times = 2 + if ($LastExitCode -eq 0) { + For ($cnt = 0; $cnt -lt $try_times; $cnt++) { + try { + Invoke-Expression $ninja_cmd + if ($LastExitCode -eq 0) { + return + } + } catch {} + } + } + popd + exit 1 +} + +##### Debug/Dynamic/MT #### +log "Debug/Dynamic/MT" Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON .." -Retry "ninja" 2 -cp MNN.lib $PACKAGE_PATH\Debug\MT -cp MNN.dll $PACKAGE_PATH\Debug\MT -cp MNN.pdb $PACKAGE_PATH\Debug\MT +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON .." +cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Debug\Dynamic\MT rm MNN.* -##### Debug/MD #### +##### Debug/Dynamic/MD #### +log "Debug/Dynamic/MD" Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF .." 
-Retry "ninja" 2 -cp MNN.lib $PACKAGE_PATH\Debug\MD -cp MNN.dll $PACKAGE_PATH\Debug\MD -cp MNN.pdb $PACKAGE_PATH\Debug\MD +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF .." +cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Debug\Dynamic\MD rm MNN.* -##### Debug/Static #### +##### Debug/Static/MT #### +log "Debug/Static/MT" Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .." -Retry "ninja" 2 -cp MNN.lib $PACKAGE_PATH\Debug\Static +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF .." +cp MNN.lib $PACKAGE_LIB_PATH\Debug\Static\MT rm MNN.* -##### Release/MT #### +##### Debug/Static/MD #### +log "Debug/Static/MD" Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON .." -Retry "ninja" 2 -cp MNN.lib $PACKAGE_PATH\Release\MT -cp MNN.dll $PACKAGE_PATH\Release\MT -cp MNN.pdb $PACKAGE_PATH\Release\MT +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .." +cp MNN.lib $PACKAGE_LIB_PATH\Debug\Static\MD rm MNN.* -##### Release/MD #### +##### Release/Dynamic/MT #### +log "Release/Dynamic/MT" Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF .." -Retry "ninja" 2 -cp MNN.lib $PACKAGE_PATH\Release\MD -cp MNN.dll $PACKAGE_PATH\Release\MD -cp MNN.pdb $PACKAGE_PATH\Release\MD +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON .." +cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MT rm MNN.* -##### Release/Static #### +##### Release/Dynamic/MD #### +log "Release/Dynamic/MD" Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .." -Retry "ninja" 2 -cp MNN.lib $PACKAGE_PATH\Release\Static +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF .." +cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MD +rm MNN.* + +##### Release/Static/MT #### +log "Release/Static/MT" +Remove-Item CMakeCache.txt -ErrorAction Ignore +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF .." +cp MNN.lib $PACKAGE_LIB_PATH\Release\Static\MT + +##### Release/Static/MD #### +log "Release/Static/MD" +Remove-Item CMakeCache.txt -ErrorAction Ignore +Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .." 
+cp MNN.lib $PACKAGE_LIB_PATH\Release\Static\MD popd \ No newline at end of file diff --git a/package_scripts/win/build_tools.ps1 b/package_scripts/win/build_tools.ps1 index d480cf17..5e3cd38c 100644 --- a/package_scripts/win/build_tools.ps1 +++ b/package_scripts/win/build_tools.ps1 @@ -1,5 +1,6 @@ Param( [Parameter(Mandatory=$true)][String]$path, + [Switch]$dynamic_link, [String]$backends, [Switch]$build_all, [Switch]$build_train, # MNN_BUILD_TRAIN @@ -23,20 +24,6 @@ if ($build_all) { $build_demo = $true } -# build process may failed because of lnk1181, but be success when run again -# Run expr, return if success, otherwise try again until try_times -function Retry([String]$expr, [Int]$try_times) { - $cnt = 0 - do { - $cnt++ - try { - Invoke-Expression $expr - return - } catch { } - } while($cnt -lt $try_times) - throw "Failed: $expr" -} - $erroractionpreference = "stop" Remove-Item $path -Recurse -ErrorAction Ignore mkdir -p $path @@ -44,7 +31,12 @@ $TOOLS_PATH = $(Resolve-Path $path).Path powershell ./schema/generate.ps1 -$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF" +$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_AVX512=ON" +if ($dynamic_link) { + $CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_SHARED_LIBS=ON" +} else { + $CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON" +} if ($build_train) { $CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_TRAIN=ON" } @@ -59,6 +51,11 @@ if ($build_evaluation) { } if ($build_converter) { $CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_CONVERTER=ON" + if ($dynamic_link) { + $CMAKE_ARGS = "$CMAKE_ARGS -Dprotobuf_BUILD_SHARED_LIBS=ON" + } else { + $CMAKE_ARGS = "$CMAKE_ARGS -Dprotobuf_BUILD_SHARED_LIBS=OFF" + } } if ($build_benchmark) { $CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_BENCHMARK=ON" @@ -83,37 +80,37 @@ Remove-Item build -Recurse -ErrorAction Ignore mkdir build pushd build +# build it according to cmake_cmd, exit 1 when any error occur +function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja") { + Invoke-Expression $cmake_cmd + # build process may failed because of lnk1181, but be success when run again + $try_times = 2 + if ($LastExitCode -eq 0) { + For ($cnt = 0; $cnt -lt $try_times; $cnt++) { + try { + Invoke-Expression $ninja_cmd + if ($LastExitCode -eq 0) { + return + } + } catch {} + } + } + popd + exit 1 +} + Remove-Item CMakeCache.txt -ErrorAction Ignore -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS .." -Retry "ninja" 2 +Build "cmake -G Ninja $CMAKE_ARGS .." 
-$PRODUCTS = "" -if ($build_train) { - $PRODUCTS = "$PRODUCTS transformer.out.exe train.out.exe rawDataTransform.out.exe dataTransformer.out.exe runTrainDemo.out.exe" -} -if ($build_tools) { - $PRODUCTS = "$PRODUCTS MNNV2Basic.out.exe mobilenetTest.out.exe backendTest.out.exe testModel.out.exe testModelWithDescrisbe.out.exe getPerformance.out.exe checkInvalidValue.out.exe timeProfile.out.exe" -} -if ($build_quantools) { - $PRODUCTS = "$PRODUCTS quantized.out.exe quantized_model_optimize.out.exe" -} -if ($build_evaluation) { - $PRODUCTS = "$PRODUCTS classficationTopkEval.out.exe" -} -if ($build_converter) { - $PRODUCTS = "$PRODUCTS MNNDump2Json.exe MNNConvert.exe" -} -if ($build_benchmark) { - $PRODUCTS = "$PRODUCTS benchmark.out.exe benchmarkExprModels.out.exe" -} -if ($build_test) { - $PRODUCTS = "$PRODUCTS run_test.out.exe" -} -if ($build_demo) { - $PRODUCTS = "$PRODUCTS pictureRecognition.out.exe pictureRotate.out.exe multiPose.out.exe segment.out.exe expressDemo.out.exe transformerDemo.out.exe rasterDemo.out.exe" +$PRODUCTS = $(Get-ChildItem -Path . -Include "*.exe" -Name) +if ($dynamic_link) { + $PRODUCTS = "$PRODUCTS MNN.dll" + if ($build_converter) { + $PRODUCTS = "$PRODUCTS ./3rd_party/protobuf/cmake/libprotobuf.dll" + } } -Foreach ($PRODUCT in $PRODUCTS.Split(" ")) { +Foreach ($PRODUCT in $PRODUCTS.Trim().Split()) { Invoke-Expression "cp $PRODUCT $TOOLS_PATH" } diff --git a/package_scripts/win/build_whl.ps1 b/package_scripts/win/build_whl.ps1 index ec2f5fc3..4ed10229 100644 --- a/package_scripts/win/build_whl.ps1 +++ b/package_scripts/win/build_whl.ps1 @@ -6,25 +6,28 @@ Param( [Switch]$x86 ) -# build process may failed because of lnk1181, but be success when run again -# Run expr, return if success, otherwise try again until try_times -function Retry([String]$expr, [Int]$try_times) { - $cnt = 0 - do { - $cnt++ - try { - Invoke-Expression $expr - return - } catch { } - } while($cnt -lt $try_times) - throw "Failed: $expr" +# build it according to cmake_cmd, exit 1 when any error occur +function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja") { + Invoke-Expression $cmake_cmd + # build process may failed because of lnk1181, but be success when run again + $try_times = 2 + if ($LastExitCode -eq 0) { + For ($cnt = 0; $cnt -lt $try_times; $cnt++) { + try { + Invoke-Expression $ninja_cmd + if ($LastExitCode -eq 0) { + return + } + } catch {} + } + } + exit 1 } $erroractionpreference = "stop" $python_versions = $pyenvs.Split(",") -Remove-Item $path -Recurse -ErrorAction Ignore -mkdir -p $path +New-Item -Path $path -ItemType Directory -ErrorAction Ignore $PACKAGE_PATH = $(Resolve-Path $path).Path $ARGS = "--version $version" if ($x86) { @@ -37,7 +40,7 @@ powershell ./schema/generate.ps1 Remove-Item pymnn_build -Recurse -ErrorAction Ignore mkdir pymnn_build pushd pymnn_build -$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON " +$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_AVX512=ON" if ($backends -ne $null) { Foreach($backend in $backends.Split(",")) { if ($backend -eq "opencl") { @@ -47,8 +50,7 @@ if ($backends -ne $null) { } } } -Invoke-Expression "cmake -G Ninja $CMAKE_ARGS .." -Retry "ninja MNN MNNTrain MNNConvert" 2 +Build "cmake -G Ninja $CMAKE_ARGS .." 
"ninja MNN MNNTrain MNNConvert MNNOpenCV" popd pushd pymnn/pip_package @@ -59,12 +61,15 @@ mkdir dist mkdir build if ($pyenvs -eq $null) { - Retry "python build_wheel.py $ARGS" 2 + Invoke-Expression "python build_wheel.py $ARGS" } else { Foreach ($env in $pyenvs.Split(",")) { Invoke-Expression "conda activate $env" - Retry "python build_wheel.py $ARGS" 2 - Invoke-Expression "conda deactivate" + Invoke-Expression "python build_wheel.py $ARGS" + conda deactivate + if ($LastExitCode -ne 0) { + exit 1 + } } } diff --git a/project/ios/MNN.xcodeproj/project.pbxproj b/project/ios/MNN.xcodeproj/project.pbxproj index e7c6a477..69882388 100644 --- a/project/ios/MNN.xcodeproj/project.pbxproj +++ b/project/ios/MNN.xcodeproj/project.pbxproj @@ -748,6 +748,7 @@ EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38924643D310062C7A3 /* Arm82Backend.cpp */; }; EBECA3A124643D4E0062C7A3 /* MNNAsmGlobal.h in Headers */ = {isa = PBXBuildFile; fileRef = EBECA3A024643D4E0062C7A3 /* MNNAsmGlobal.h */; }; EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */ = {isa = PBXBuildFile; fileRef = EBECA3A324643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; + F41497D7278D8A21004A363A /* RuntimeAttr.hpp in Headers */ = {isa = PBXBuildFile; fileRef = F41497D6278D8A21004A363A /* RuntimeAttr.hpp */; }; F4FB5AD7274E6CC100EAF0C1 /* MNNAESCipher.h in Headers */ = {isa = PBXBuildFile; fileRef = F4FB5AAF274E6CC100EAF0C1 /* MNNAESCipher.h */; }; F4FB5AD8274E6CC100EAF0C1 /* ModelAuth.mm in Sources */ = {isa = PBXBuildFile; fileRef = F4FB5AB0274E6CC100EAF0C1 /* ModelAuth.mm */; }; F4FB5AD9274E6CC100EAF0C1 /* MNNAESCipher.m in Sources */ = {isa = PBXBuildFile; fileRef = F4FB5AB1274E6CC100EAF0C1 /* MNNAESCipher.m */; }; @@ -1542,6 +1543,7 @@ EBECA38924643D310062C7A3 /* Arm82Backend.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Backend.cpp; path = ../arm82/Arm82Backend.cpp; sourceTree = ""; }; EBECA3A024643D4E0062C7A3 /* MNNAsmGlobal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = MNNAsmGlobal.h; path = ../arm82/asm/MNNAsmGlobal.h; sourceTree = ""; }; EBECA3A324643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNQuantizeFP16_UNIT4.S; path = ../arm82/asm/arm64/MNNQuantizeFP16_UNIT4.S; sourceTree = ""; }; + F41497D6278D8A21004A363A /* RuntimeAttr.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = RuntimeAttr.hpp; sourceTree = ""; }; F4FB5AAF274E6CC100EAF0C1 /* MNNAESCipher.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MNNAESCipher.h; sourceTree = ""; }; F4FB5AB0274E6CC100EAF0C1 /* ModelAuth.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = ModelAuth.mm; sourceTree = ""; }; F4FB5AB1274E6CC100EAF0C1 /* MNNAESCipher.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = MNNAESCipher.m; sourceTree = ""; }; @@ -1679,6 +1681,7 @@ 48593FB423A89B2F0069452A /* express */ = { isa = PBXGroup; children = ( + F41497D6278D8A21004A363A /* RuntimeAttr.hpp */, 489D7AC42550FF9F00AD896A /* ExecutorScope.cpp */, 48C84B6F250F711600EE7666 /* module */, 48FA474C23AA136300172C3B /* MergeOptimizer.cpp */, @@ -2951,6 +2954,7 @@ 92FF04B423AA0BFB00AC97F6 /* MNNMemoryUtils.h in Headers */, 
4D9A937426255BDA00F9B43C /* CoreMLReduction.hpp in Headers */, 48C84B8B250F711700EE7666 /* PipelineModule.hpp in Headers */, + F41497D7278D8A21004A363A /* RuntimeAttr.hpp in Headers */, 92FF04C123AA0BFB00AC97F6 /* Backend.hpp in Headers */, 489D7A812550FDC900AD896A /* MetalPooling.hpp in Headers */, 92FF02A623AA0B5A00AC97F6 /* CPUQuantizedMaxPool.hpp in Headers */, @@ -2985,6 +2989,7 @@ buildConfigurationList = 0F1465BF1FA18D1000F9860A /* Build configuration list for PBXNativeTarget "MNN" */; buildPhases = ( 0F1465B41FA18D1000F9860A /* Headers */, + F48DED4627742886004B8DB0 /* ShellScript */, 0F1465B21FA18D1000F9860A /* Sources */, 0F1465B31FA18D1000F9860A /* Frameworks */, 0F1465B51FA18D1000F9860A /* Resources */, @@ -3091,6 +3096,23 @@ shellPath = /bin/sh; shellScript = "\necho \"==========\"\necho ${TARGET_NAME}\necho ${PROJECT_FILE_PATH}\necho ${TARGET_BUILD_DIR}\n\ntouch ${TARGET_BUILD_DIR}/MNN.framework/mnn.metallib\ncp ${TARGET_BUILD_DIR}/MNN.framework/mnn.metallib ${TARGET_BUILD_DIR}/Playground.app/\n"; }; + F48DED4627742886004B8DB0 /* ShellScript */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + inputFileListPaths = ( + ); + inputPaths = ( + ); + outputFileListPaths = ( + ); + outputPaths = ( + ); + runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; + shellScript = "# Type a script or drag a script file from your workspace to insert its path.\nMNN_REVISION=`git rev-parse HEAD`\necho \"#define MNN_REVISION \\\"${MNN_REVISION}\\\"\" > ${SRCROOT}/../../include/MNN/VCS.h\n"; + }; /* End PBXShellScriptBuildPhase section */ /* Begin PBXSourcesBuildPhase section */ @@ -3808,7 +3830,7 @@ CODE_SIGN_STYLE = Automatic; DEAD_CODE_STRIPPING = YES; DEFINES_MODULE = YES; - DEVELOPMENT_TEAM = 6T3QR3X696; + DEVELOPMENT_TEAM = UMNWSVYR5X; DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; @@ -3854,7 +3876,7 @@ METAL_LIBRARY_FILE_BASE = mnn; ONLY_ACTIVE_ARCH = YES; OTHER_CFLAGS = ""; - PRODUCT_BUNDLE_IDENTIFIER = com.zhaodewang.MNN.yyavdsavds; + PRODUCT_BUNDLE_IDENTIFIER = com.alibaba.MNN; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; PROVISIONING_PROFILE_SPECIFIER = ""; "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = ""; @@ -3875,7 +3897,7 @@ CODE_SIGN_STYLE = Automatic; DEAD_CODE_STRIPPING = YES; DEFINES_MODULE = YES; - DEVELOPMENT_TEAM = 6G7464HHUS; + DEVELOPMENT_TEAM = UMNWSVYR5X; DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; @@ -3919,7 +3941,7 @@ MACH_O_TYPE = staticlib; METAL_LIBRARY_FILE_BASE = mnn; OTHER_CFLAGS = ""; - PRODUCT_BUNDLE_IDENTIFIER = com.zhaodewang.MNN.yyavdsavds; + PRODUCT_BUNDLE_IDENTIFIER = com.alibaba.MNN; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; PROVISIONING_PROFILE_SPECIFIER = ""; "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = ""; @@ -3938,7 +3960,7 @@ ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage; CODE_SIGN_STYLE = Automatic; - DEVELOPMENT_TEAM = 6G7464HHUS; + DEVELOPMENT_TEAM = UMNWSVYR5X; GCC_ENABLE_CPP_EXCEPTIONS = NO; GCC_ENABLE_CPP_RTTI = NO; HEADER_SEARCH_PATHS = ( @@ -3963,7 +3985,7 @@ ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage; CODE_SIGN_STYLE = Automatic; - DEVELOPMENT_TEAM = 6G7464HHUS; + DEVELOPMENT_TEAM = UMNWSVYR5X; GCC_ENABLE_CPP_EXCEPTIONS = NO; GCC_ENABLE_CPP_RTTI = NO; HEADER_SEARCH_PATHS = ( diff --git a/project/ios/Playground/AppDelegate.mm b/project/ios/Playground/AppDelegate.mm index 
b0b37695..f01ffb6e 100644 --- a/project/ios/Playground/AppDelegate.mm +++ b/project/ios/Playground/AppDelegate.mm @@ -9,37 +9,50 @@ #import "AppDelegate.h" #import "MNNTestSuite.h" #include +#include #import #import "benchmark.h" @implementation AppDelegate - (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions { -#define UNITTEST -#ifdef UNITTEST - // unittest - { - MNN::BackendConfig config; - // If want to test metal, change MNN_FORWARD_CPU to MNN_FORWARD_METAL - MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 1); - int precisionInTestUtil = - getTestPrecision(MNN_FORWARD_CPU, config.precision, MNN::Express::Executor::getGlobalExecutor()->getCurrentRuntimeStatus(MNN::STATUS_SUPPORT_FP16)); - MNNTestSuite::runAll(precisionInTestUtil); - } -#endif -#ifdef BENCHMARK - // benchmark - { - auto bundle = CFBundleGetMainBundle(); - auto url = CFBundleCopyBundleURL(bundle); - auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle); - CFRelease(url); - auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8); - auto res = std::string(cstring) + "/models"; - CFRelease(string); - iosBenchAll(res.c_str()); - } -#endif +//#define UNITTEST +//#ifdef UNITTEST +// // unittest +// { +// MNN::BackendConfig config; +// // If want to test metal, change MNN_FORWARD_CPU to MNN_FORWARD_METAL +// MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 1); +// int precisionInTestUtil = +// getTestPrecision(MNN_FORWARD_CPU, config.precision, MNN::Express::Executor::getGlobalExecutor()->getCurrentRuntimeStatus(MNN::STATUS_SUPPORT_FP16)); +// MNNTestSuite::runAll(precisionInTestUtil); +// } +//#endif +//#ifdef BENCHMARK +// // benchmark +// { +// auto bundle = CFBundleGetMainBundle(); +// auto url = CFBundleCopyBundleURL(bundle); +// auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle); +// CFRelease(url); +// auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8); +// auto res = std::string(cstring) + "/models"; +// CFRelease(string); +// iosBenchAll(res.c_str()); +// } +//#endif + auto bundle = CFBundleGetMainBundle(); + auto url = CFBundleCopyBundleURL(bundle); + auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle); + CFRelease(url); + auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8); + auto res = std::string(cstring) + "/models/mobilenet_v2_auth.mnn"; + + + MNN::Interpreter* interpreter = MNN::Interpreter::createFromFile(res.c_str()); + MNN::ScheduleConfig config; + interpreter->createSession(config); + return YES; } diff --git a/pymnn/CMakeLists.txt b/pymnn/CMakeLists.txt index daf811c6..95bfbc1b 100644 --- a/pymnn/CMakeLists.txt +++ b/pymnn/CMakeLists.txt @@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.4.1) project(mnnpybridge) +# python_path / numpy_path / mnn_path option(DEPEND_AAPL_FMWK "use dependency library .framework instead of traditional .a/.dylib" OFF) option(MNN_BUILD_SHARED_LIBS "MNN build shared or static lib" ON) option(MNN_WIN_RUNTIME_MT "MNN use /MT on Windows dll" OFF) @@ -12,8 +13,17 @@ option(PYMNN_NEW_PYTHON "AliNNPython new version (when PYMNN_RUNTIME_CHECK_VM=OF option(PYMNN_EXPR_API "MNN expr API be exposed" ON) option(PYMNN_NUMPY_USABLE "Build based on numpy" ON) option(PYMNN_TRAIN_API "MNN train API be exposed" OFF) +option(PYMNN_INTERNAL_SERVING "Internal use only." 
OFF) + +if(PYMNN_INTERNAL_SERVING) + file(GLOB_RECURSE SRC ${CMAKE_CURRENT_LIST_DIR}/src/MNN.cc + ${CMAKE_CURRENT_LIST_DIR}/src/internal/monitor_service.cc + ${CMAKE_CURRENT_LIST_DIR}/src/internal/verify_service.cc + ${CMAKE_CURRENT_LIST_DIR}/src/internal/http_util.cc) +else() + file(GLOB_RECURSE SRC ${CMAKE_CURRENT_LIST_DIR}/src/MNN.cc) +endif() -file(GLOB_RECURSE SRC ${CMAKE_CURRENT_LIST_DIR}/src/MNN.cc) if (MNN_BUILD_SHARED_LIBS) add_library(mnnpybridge SHARED ${SRC}) else() @@ -39,6 +49,11 @@ if(PYMNN_TRAIN_API) target_compile_definitions(mnnpybridge PRIVATE PYMNN_TRAIN_API) endif() +if(PYMNN_INTERNAL_SERVING) + message(STATUS "mnnpybridge define PYMNN_INTERNAL_SERVING") + target_compile_definitions(mnnpybridge PRIVATE PYMNN_INTERNAL_SERVING) +endif() + if(CMAKE_SYSTEM_NAME MATCHES "^Android") add_definitions(-DMNN_USE_LOGCAT) endif() @@ -59,8 +74,8 @@ if(MSVC) endif() endif () endforeach() - set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4005 /wd4267") - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4005 /wd4267") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4005 /wd4267 /experimental:preprocessor") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4005 /wd4267 /experimental:preprocessor") SET(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF") SET(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF") SET(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") @@ -73,20 +88,24 @@ endif() if(PYMNN_TRAIN_API) set(MNN_DIR ${CMAKE_CURRENT_LIST_DIR}/..) target_include_directories(mnnpybridge PRIVATE - ${MNN_DIR}/tools/train/source/grad ${MNN_DIR}/tools/train/source/optimizer ${MNN_DIR}/tools/train/source/transformer - ${MNN_DIR}/tools/train/source/data ${MNN_DIR}/schema/current ${MNN_DIR}/3rd_party/flatbuffers/include) + ${MNN_DIR}/tools/train/source/grad ${MNN_DIR}/tools/train/source/optimizer ${MNN_DIR}/tools/train/source/transformer ${MNN_DIR}/tools/train/source/nn + ${MNN_DIR}/tools/train/source/data ${MNN_DIR}/schema/current ${MNN_DIR}/3rd_party/flatbuffers/include ${MNN_DIR}/tools/cv/include + ${MNN_DIR}/express ${MNN_DIR}/express/module ${MNN_DIR}/tools) endif() if(WIN32 OR APPLE OR CMAKE_SYSTEM_NAME MATCHES "^Linux") set(DEPEND_PATH "${CMAKE_CURRENT_LIST_DIR}/3rd_party") set(LIB_SUBPATH "") if(WIN32) - if(NOT MNN_BUILD_SHARED_LIBS) - set(LIB_SUBPATH "Static") - elseif(MNN_WIN_RUNTIME_MT) - set(LIB_SUBPATH "MT") + if (MNN_BUILD_SHARED_LIBS) + set(LIB_SUBPATH "Dynamic") else() - set(LIB_SUBPATH "MD") + set(LIB_SUBPATH "Static") + endif() + if (MNN_WIN_RUNTIME_MT) + set(LIB_SUBPATH "${LIB_SUBPATH}/MT") + else() + set(LIB_SUBPATH "${LIB_SUBPATH}/MD") endif() elseif(APPLE) if(MNN_BUILD_SHARED_LIBS) @@ -108,34 +127,23 @@ if(WIN32 OR APPLE OR CMAKE_SYSTEM_NAME MATCHES "^Linux") endif() endif() - target_include_directories(mnnpybridge PRIVATE ${CMAKE_CURRENT_LIST_DIR}/src ${DEPEND_PATH}/MNN/include) - target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/MNN/lib/${LIB_SUBPATH}) - if(APPLE AND DEPEND_AAPL_FMWK) - target_link_libraries(mnnpybridge PRIVATE "-framework MNN") - set_target_properties(mnnpybridge PROPERTIES LINK_FLAGS "-Wl,-F${DEPEND_PATH}/MNN/lib/${LIB_SUBPATH}") - else() - target_link_libraries(mnnpybridge PRIVATE MNN) + find_library(MNN NAMES MNN REQUIRED PATHS ${mnn_path}/lib/${LIB_SUBPATH}) + if(NOT DEPEND_AAPL_FMWK) + target_include_directories(mnnpybridge PUBLIC ${mnn_path}/include) endif() + target_link_libraries(mnnpybridge PUBLIC ${MNN}) - if(PYMNN_USE_ALINNPYTHON) - 
target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/include) - target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/lib/${LIB_SUBPATH}) - if(APPLE AND DEPEND_AAPL_FMWK) - target_link_libraries(mnnpybridge PRIVATE "-framework python") - set_target_properties(mnnpybridge PROPERTIES LINK_FLAGS "-Wl,-F${DEPEND_PATH}/AliNNPython/lib/${LIB_SUBPATH}") - else() - target_link_libraries(mnnpybridge PRIVATE python) - endif() + find_library(python NAMES python REQUIRED PATHS ${python_path}/lib/${LIB_SUBPATH}) + if(NOT DEPEND_AAPL_FMWK) + target_include_directories(mnnpybridge PUBLIC ${python_path}/include) endif() + target_link_libraries(mnnpybridge PUBLIC ${python}) if(PYMNN_NUMPY_USABLE) - target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/numpy/include) - target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/numpy/lib/${LIB_SUBPATH}) - if(APPLE AND DEPEND_AAPL_FMWK) - target_link_libraries(mnnpybridge PRIVATE "-framework numpy_python") - set_target_properties(mnnpybridge PROPERTIES LINK_FLAGS "-Wl,-F${DEPEND_PATH}/numpy/lib/${LIB_SUBPATH}") - else() - target_link_libraries(mnnpybridge PRIVATE numpy_python) + find_library(numpy NAMES numpy_python REQUIRED PATHS ${numpy_path}/lib/${LIB_SUBPATH}) + if(NOT DEPEND_AAPL_FMWK) + target_include_directories(mnnpybridge PUBLIC ${numpy_path}/include) endif() + target_link_libraries(mnnpybridge PUBLIC ${numpy}) endif() else() target_include_directories(mnnpybridge PRIVATE ${MNN_DIR}/pymnn/src ${MNN_DIR}/pymnn/android/src/main/c/include) diff --git a/pymnn/examples/MNNEngineDemo/mobilenet_demo.py b/pymnn/examples/MNNEngineDemo/mobilenet_demo.py index bd1e5acc..067cbca9 100644 --- a/pymnn/examples/MNNEngineDemo/mobilenet_demo.py +++ b/pymnn/examples/MNNEngineDemo/mobilenet_demo.py @@ -13,7 +13,7 @@ def inference(): config['precision'] = 'low' session = interpreter.createSession() input_tensor = interpreter.getSessionInput(session) - image = cv2.imread('ILSVRC2012_val_00049999.JPEG') + image = cv2.imread('0000.jpg') #cv2 read as bgr format image = image[..., ::-1] #change to rgb format diff --git a/pymnn/pip_package/MNN/cv/__init__.py b/pymnn/pip_package/MNN/cv/__init__.py index 219abdfd..0eb2a512 100644 --- a/pymnn/pip_package/MNN/cv/__init__.py +++ b/pymnn/pip_package/MNN/cv/__init__.py @@ -1,11 +1,22 @@ from _mnncengine.cv import * +import _mnncengine.cv as _F import MNN.numpy as _np +import MNN def __to_int(x): dtype = x.dtype if dtype == _np.int32: return x return x.astype(_np.int32) +def resize(src, dsize=None, fx=None, fy=None, interpolation=INTER_LINEAR, code = None, mean=[], norm=[]): + if dsize is None and fx is None and fy is None: + raise ValueError('resize must set dsize or fx,fy.') + if dsize is None: dsize = [0, 0] + if fx is None: fx = 0 + if fy is None: fy = 0 + if code is None: code = -1 + else: code = hash(code) + return _F.resize(src, dsize, fx, fy, interpolation, code, mean, norm) def copyTo(src, mask=None, dst=None): if mask is None: return src.copy() origin_dtype = src.dtype @@ -45,3 +56,33 @@ def hconcat(src): return _np.concatenate(src, 1) def vconcat(src): return _np.concatenate(src, 0) +def mean(src, mask=None): + if mask is not None: + src = copyTo(src, mask) + res = _np.mean(src, [0, 1]) + if res.ndim == 0: size = 0 + else: size = res.shape[0] + if size < 4: + res = _np.pad(res, [0, 4 - size]) + return res +def flip(src, flipCode): + h, w, c = src.shape + m = MNN.CVMatrix() + if flipCode < 0: + m.write([-1., 0., w-1., 0., -1., h-1.]) + elif flipCode == 0: + m.write([1., 0., 0.,
0., -1., h-1.]) + else: + m.write([-1., 0., w-1., 0., 1., 0.]) + return warpAffine(src, m, [w, h]) +ROTATE_90_CLOCKWISE = 0 +ROTATE_180 = 1 +ROTATE_90_COUNTERCLOCKWISE = 2 +def rotate(src, rotateMode): + if rotateMode == ROTATE_90_CLOCKWISE: + return flip(src.transpose([1, 0, 2]), 1) + if rotateMode == ROTATE_180: + return flip(src, -1) + if rotateMode == ROTATE_90_COUNTERCLOCKWISE: + return flip(src.transpose([1, 0, 2]), 0) + return src diff --git a/pymnn/pip_package/MNN/expr/__init__.py b/pymnn/pip_package/MNN/expr/__init__.py index 03fab6d2..b2bc702c 100644 --- a/pymnn/pip_package/MNN/expr/__init__.py +++ b/pymnn/pip_package/MNN/expr/__init__.py @@ -9,23 +9,26 @@ import _mnncengine._expr as _F _numpy_supported = False try: import numpy as np - _numpy_supported = True + _numpy_supported = (type(np.arange(10)) == np.ndarray) except Exception: print ("Numpy not found. Using MNN without numpy.") + def scalar(value, dtype=None): - if dtype == _F.int: - value = _Int(value) - elif dtype == _F.float: - value = _Float(value) + if dtype is not None: + if dtype == _F.int or dtype == _F.uint8: + value = _Int(value) + elif dtype == _F.float: + value = _Float(value) + return _F.const([value], [], _F.NCHW, dtype) if type(value) == type(1): - res = _F.const([value], [], _F.NCHW, _F.int) - return res + return _F.const([value], [], _F.NCHW, _F.int) elif type(value) == type(1.): - res = _F.const([value], [], _F.NCHW, _F.float) - return res + return _F.const([value], [], _F.NCHW, _F.float) else: raise NotImplementedError("not supported data type for creating scalar variable") def _list_shape_type(object, shape=()): + if isinstance(object, _Sequence) and len(object) == 0: + return [0], _F.float if not isinstance(object, _Sequence): if type(object) in (type(1), type(1<<64)): dst_type = _F.int @@ -54,6 +57,7 @@ def _can_broadcast(src_shape, dst_shape): return True def _match_dtype(x, y, dtype=None): def type_val(x): + if x is None: return -1 if x == _F.double: return 4 if x == _F.float: return 3 if x == _F.int64: return 2 @@ -76,15 +80,18 @@ def _to_var(x, dtype=None): return scalar(x, dtype) # 2. numpy if _numpy_supported: - if isinstance(x, np.ndarray): # convert numpy ndarray to MNN var - if x.dtype.kind == 'i': - x = x.astype(np.int32) - x = _F.const(x, x.shape, dtype=_F.int) - elif x.dtype.kine == 'f': - x = x.astype(np.float32) - x = _F.const(x, x.shape, dtype=_F.float) - else: - raise ValueError('Just support i/f dtype numpy.') + try: + if isinstance(x, np.ndarray): # convert numpy ndarray to MNN var + if x.dtype.kind == 'i': + x = x.astype(np.int32) + x = _F.const(x, x.shape, dtype=_F.int) + elif x.dtype.kind == 'f': + x = x.astype(np.float32) + x = _F.const(x, x.shape, dtype=_F.float) + else: + raise ValueError('Just support i/f dtype numpy.') + except: + pass # 3. 
Sequence if isinstance(x, _Sequence) and x: dst_shape, item_type = _list_shape_type(x) @@ -202,7 +209,7 @@ def floor(x): >>> expr.floor([-5.1, 4.5]) var([-6., 4.]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.floor(x) def round(x): ''' @@ -223,7 +230,7 @@ def round(x): >>> expr.round([-5.1, 4.5]) var([-5., 5.]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.round(x) def ceil(x): ''' @@ -243,7 +250,7 @@ def ceil(x): >>> expr.ceil([-4.9, 4.5]) var([-4., 5.]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.ceil(x) def square(x): ''' @@ -283,7 +290,7 @@ def sqrt(x): >>> expr.sqrt([9., 4.5]) var([3., 2.1213202]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.sqrt(x) def rsqrt(x): ''' @@ -303,7 +310,7 @@ def rsqrt(x): >>> expr.rsqrt([9., 4.5]) var([0.33333334, 0.47140455]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.rsqrt(x) def exp(x): ''' @@ -323,7 +330,7 @@ def exp(x): >>> expr.exp([9., 4.5]) var([8102.449, 90.01698]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.exp(x) def log(x): ''' @@ -343,7 +350,7 @@ def log(x): >>> expr.log([9., 4.5]) var([2.1972246, 1.5040774]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.log(x) def sin(x): ''' @@ -363,7 +370,7 @@ def sin(x): >>> expr.sin([9., 4.5]) var([0.4121185, -0.9775301]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.sin(x) def sinh(x): ''' @@ -384,7 +391,7 @@ def sinh(x): >>> expr.sinh([9., 4.5]) var([4051.542, 45.00301]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.sinh(x) def cos(x): ''' @@ -404,7 +411,7 @@ def cos(x): >>> expr.cos([9., 4.5]) var([-0.91113025, -0.2107958]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.cos(x) def cosh(x): ''' @@ -425,7 +432,7 @@ def cosh(x): >>> expr.cosh([9., 4.5]) var([4051.542, 45.014122]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.cosh(x) def tan(x): ''' @@ -445,7 +452,7 @@ def tan(x): >>> expr.tan([9., 4.5]) var([-0.45231566, 4.637332]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.tan(x) def tanh(x): ''' @@ -466,7 +473,7 @@ def tanh(x): >>> expr.tanh([9., 4.5]) var([1., 0.9997533]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.tanh(x) def asin(x): ''' @@ -487,7 +494,7 @@ def asin(x): >>> expr.asin([9., 0.5]) var([nan, 0.5235988]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.asin(x) def asinh(x): ''' @@ -508,7 +515,7 @@ def asinh(x): >>> expr.asinh([9., 0.5]) var([2.893444, 0.4812118]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.asinh(x) def acos(x): ''' @@ -529,7 +536,7 @@ def acos(x): >>> expr.asin([9., 0.5]) var([nan, 1.0471975]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.acos(x) def acosh(x): ''' @@ -550,7 +557,7 @@ def acosh(x): >>> expr.acosh([9., 0.5]) var([2.887271, nan]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.acosh(x) def atan(x): ''' @@ -571,7 +578,7 @@ def atan(x): >>> expr.atan([9., 0.5]) var([1.4601392, 0.4636476]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.atan(x) def atanh(x): ''' @@ -592,7 +599,7 @@ def atanh(x): >>> expr.atanh([9., 0.5]) var([1.4601392, 0.4636476]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.atanh(x) def reciprocal(x): ''' @@ -612,7 +619,7 @@ def reciprocal(x): >>> expr.reciprocal([9., 0.5]) var([0.11111111, 2.]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.reciprocal(x) def log1p(x): ''' @@ -632,7 +639,7 @@ def log1p(x): >>> expr.log1p([9., 0.5]) var([2.3025851, 0.4054651]) ''' - x = _to_var(x) + x = _to_var(x, 
_F.float) return _F.log1p(x) def gelu(x): ''' @@ -652,7 +659,7 @@ def gelu(x): >>> expr.gelu([9., 0.5]) var([9., 0.345714]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.gelu(x) def sigmoid(x): ''' @@ -672,16 +679,16 @@ def sigmoid(x): >>> expr.sigmoid([9., 0.5]) var([0.9998766, 0.62246716]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.sigmoid(x) def erf(x): - x = _to_var(x) + x = _to_var(x, _F.float) return _F.erf(x) def erfc(x): - x = _to_var(x) + x = _to_var(x, _F.float) return _F.erfc(x) def erfinv(x): - x = _to_var(x) + x = _to_var(x, _F.float) return _F.erfinv(x) def expm1(x): ''' @@ -701,7 +708,7 @@ def expm1(x): >>> expr.expm1([9., 0.5]) var([8.1014492e+03, 6.4869785e-01]) ''' - x = _to_var(x) + x = _to_var(x, _F.float) return _F.expm1(x) def add(x, y): ''' @@ -1479,8 +1486,8 @@ def matmul(a, b, transposeA=False, transposeB=False): var([[0., 1.], [0., 3.]], dtype=float32) ''' - a = _to_var(a, True) - b = _to_var(b, True) + a = _to_var(a, _F.float) + b = _to_var(b, _F.float) return _F.matmul(a, b, transposeA, transposeB) def normalize(x, acrossSpatial, channelShared, eps, scale): ''' @@ -3055,7 +3062,7 @@ def zeros_like(x): Example: ------- >>> expr.zeros_like([[1, 2], [3, 4]]) - array([[0, 0], + var([[0, 0], [0, 0]], dtype=int32) ''' x = _to_var(x) @@ -3078,14 +3085,72 @@ def range(start, limit, delta): Example: ------- >>> expr.range(1.0, 7.0, 2.0) - array([1., 3., 5.], dtype=float32) + var([1., 3., 5.], dtype=float32) ''' start = _to_var(start) limit = _to_var(limit) delta = _to_var(delta) if limit.dtype != start.dtype or delta.dtype != start.dtype: - print(start, limit, delta) raise RuntimeError("parameter start/limit/delta must use same data type, either all int or all float") return _F.range(start, limit, delta) +def sort(x, axis=-1, arg=False, descend=False): + ''' + sort(x, axis=-1, arg=False, descend=False) + Return the sorted array of ``x``. + + Parameters + ---------- + x : var_like, input value. + axis : int, sort by axis. + arg : is ArgSort or not, default is False. + descend : is descend or not, default is False. + + Returns + ------- + sorted_res : Var. + + Example: + ------- + >>> expr.sort([[5, 0], [1, 3]]) + var([[1, 0], + [5, 3]], dtype=int32) + ''' + x = _to_var(x) + # sort will change the x + x = clone(x, True) + return _F.sort(x, axis, arg, descend) +def nms(boxes, scores, max_detections, iou_threshold=-1.0, score_threshold=-1.0): + ''' + nms(boxes, scores, max_detections, iou_threshold=-1.0, score_threshold=-1.0) + Return the nms array of ``boxes``. + + Parameters + ---------- + boxes : var_like, input value, shape must be [num, 4]. + scores : var_like, input value, shape must be [num]. + max_detections : int. + iou_threshold : float, default is 0. + score_threshold : float, default is float_min. + + Returns + ------- + nms_res : Var. 
+ + Example: + ------- + >>> expr.nms([[1, 1, 4, 4], [0, 0, 3, 3], [5, 5, 7, 7]], [0.9, 0.5, 0.1], 3, 0.1) + var([0, 2], dtype=int32) + ''' + boxes = _to_var(boxes, _F.float) + scores = _to_var(scores, _F.float) + max_detections = _to_int(max_detections) + iou_threshold = _to_float(iou_threshold) + score_threshold = _to_float(score_threshold) + res = _F.nms(boxes, scores, max_detections, iou_threshold, score_threshold) + idx = res >= 0 + idx.fix_as_const() + if _F.reduce_any(idx).read_as_tuple()[0] == 0: + return _F.const([], [0], NCHW, _F.int) + return res[idx] # TODO: detection_post_process -# wrapper for builtin functions end \ No newline at end of file +# wrapper for builtin functions end diff --git a/pymnn/pip_package/MNN/numpy/__init__.py b/pymnn/pip_package/MNN/numpy/__init__.py index 2dc80afd..d7aa9979 100644 --- a/pymnn/pip_package/MNN/numpy/__init__.py +++ b/pymnn/pip_package/MNN/numpy/__init__.py @@ -19,6 +19,16 @@ inf = float('inf') # helper functions def __not_impl(*args): raise NotImplementedError('MNN.numpy not implemet this function now.') +def __get_arg(kargs, key, default=None): + if key in kargs: return kargs[key] + return default +def __get_shape(args): + if type(args) not in (tuple, list): + return [args] + elif len(args) == 1 and type(args[0]) in (tuple, list): + return args[0] + else: + return args def __order_assert(order): if order is not None and order not in 'CK': raise RuntimeError("MNN.numpy just support order=\"C|K\"") @@ -89,6 +99,7 @@ def identity(n, dtype=float32): return eye(n, dtype=dtype) def full(shape, fill_value, dtype=None, order='C'): __order_assert(order) + shape = __get_shape(shape) return _F.fill(_F._to_var(shape), _F.scalar(fill_value, dtype)) def full_like(a, fill_value, dtype=None, order='K', subok=True, shape=None): dst_dtype, dst_shape = __array_like_type(a, dtype, order, shape) @@ -165,10 +176,14 @@ def __arange_3(start, stop, step=1, dtype=None): def __arange_1(stop, dtype=None): return __arange_3(0, stop, 1, dtype) def arange(*args, **kargs): - if 'dtype' in kargs: dtype=kargs['dtype'] - else: dtype = None - if len(args) == 1: + dtype = __get_arg(kargs, 'dtype') + step = __get_arg(kargs, 'step') + stop = __get_arg(kargs, 'stop') + start = __get_arg(kargs, 'start') + if len(args) == 1 and stop is None and step is None: return __arange_1(args[0], dtype) + if len(args) == 2 and step is not None: + return __arange_3(*args, step=step, dtype=dtype) if len(args) == 4: return __arange_3(*args) return __arange_3(*args, dtype=dtype) @@ -189,7 +204,26 @@ def geomspace(start, stop, num=50, endpoint=True, dtype=None, axis=0): base = pow(stop / _F._Float(start), 1./ num) start = math.log(start, base) return logspace(start, _F._Float(num), num, endpoint, base, dtype, axis) -def meshgrid(xi, copy=True, sparse=False, indexing='xy'): __not_impl() +def meshgrid(*xi, **kwargs): + copy = __get_arg(kwargs, 'copy', True) + sparse = __get_arg(kwargs, 'sparse', False) + indexing = __get_arg(kwargs, 'indexing', 'xy') + ndim = len(xi) + if indexing not in ['xy', 'ij']: + raise ValueError("Valid values for `indexing` are 'xy' and 'ij'.") + + s0 = (1,) * ndim + output = [asanyarray(x).reshape(s0[:i] + (-1,) + s0[i + 1:]) for i, x in enumerate(xi)] + if indexing == 'xy' and ndim > 1: + # switch first and second axis + output[0] = swapaxes(output[0], 0, 1) + output[1] = swapaxes(output[1], 0, 1) + if not sparse: + # Return the full N-D matrix (not only the 1-D vector) + output = broadcast_arrays(*output) + if copy: + output = [x.copy() for x in output] + return output 
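The meshgrid wrapper added above mirrors numpy.meshgrid semantics ('xy'/'ij' indexing, sparse output, optional copy). A minimal usage sketch, not part of the patch, assuming MNN.numpy's arange/meshgrid behave as wrapped here:

    import MNN.numpy as np
    x = np.arange(3)                           # values [0, 1, 2]
    y = np.arange(2)                           # values [0, 1]
    xv, yv = np.meshgrid(x, y)                 # default 'xy' indexing: both shaped (2, 3)
    xi, yi = np.meshgrid(x, y, indexing='ij')  # matrix indexing: both shaped (3, 2)
    xs, ys = np.meshgrid(x, y, sparse=True)    # broadcastable grids: shapes (1, 3) and (2, 1)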
# 4. Building matrices def diag(v, k=0):__not_impl() def diagflat(v, k=0):__not_impl() @@ -212,11 +246,11 @@ def copyto(dst, src, casting='same_kind', where=True): def shape(a): return tuple(a.shape) # 2. Changing array shape -def reshape(a, newshape, order='C'): - __order_assert(order) +def reshape(a, *newshape): + newshape = __get_shape(newshape) return _F.reshape(a, newshape) def ravel(a, order='C'): - return reshape(a, [-1], order) + return reshape(a, [-1]) # 3. Transpose-like operations def moveaxis(a, source, destination): ndim = a.ndim @@ -431,7 +465,9 @@ right_shift = packbits = unpackbits = binary_repr = base_repr = __not_impl # String operations [Not Impl] # Indexing routines # 1. Generating index arrays -def where(condition, x, y): +def where(condition, x=None, y=None): + if x is None and y is None: + return nonzero(condition) return _F.select(condition, x, y) def indices(dimensions, dtype=int32, sparse=False):__not_impl() def ix_(*args):__not_impl() @@ -546,6 +582,7 @@ arccosh = _F.acosh arctanh = _F.atanh around = _F.round round_ = _F.round +round = _F.round rint = _F.round fix = _F.round floor = _F.floor @@ -685,9 +722,12 @@ def pad(array, pad_width, mode='constant'): return _F.pad(array, pad_width, mode) # Sorting, searching, and counting # 1. Sorting -def sort(a, axis=- 1, kind=None, order=None):__not_impl() -def lexsort(keys, axis=-1):__not_impl() -def argsort(a, axis=-1, kind=None, order=None): __not_impl() +def sort(a, axis=- 1, kind=None, order=None): + return _F.sort(a, axis) +def lexsort(keys, axis=-1): + return sort(keys, axis) +def argsort(a, axis=-1, kind=None, order=None): + return _F.sort(a, axis, True) def msort(a): return sort(a, axis=0) def sort_complex(a): __not_impl() def partition(a, kth, axis=- 1, kind='introselect', order=None): __not_impl() @@ -704,6 +744,7 @@ def argwhere(a): mask = not_equal(a, _F.scalar(0, a.dtype)) return _F.where(mask) def nonzero(a): + res = _F.where(a) res = argwhere(a) if a.ndim == 1: return (ravel(res),) @@ -762,6 +803,13 @@ corrcoef = correlate = cov = __not_impl histogram = histogram2d = histogramdd = bincount = histogram_bin_edges = digitize = __not_impl # numpy ndarray functions +def __item(self, idx): + if type(idx) == type(1): + return ravel(self)[idx] + elif type(idx) == tuple: + return self[idx] + else: + raise ValueError('item arg must be int or tuple.') __override_operator(_F.Var, "all", all) __override_operator(_F.Var, "any", any) __override_operator(_F.Var, "argmax", argmax) @@ -793,6 +841,7 @@ __override_operator(_F.Var, "sum", sum) __override_operator(_F.Var, "swapaxes", swapaxes) __override_operator(_F.Var, "transpose", transpose) __override_operator(_F.Var, "var", var) +__override_operator(_F.Var, "item", __item) from . import random -from . import linalg \ No newline at end of file +from . 
import linalg diff --git a/pymnn/pip_package/build_deps.py b/pymnn/pip_package/build_deps.py index 809dda51..ac2b9cbe 100644 --- a/pymnn/pip_package/build_deps.py +++ b/pymnn/pip_package/build_deps.py @@ -15,6 +15,10 @@ USE_TRT=False if len(sys.argv) > 1 and sys.argv[1] == '-trt': USE_TRT=True +IS_INTERNAL_BUILD = False +if os.path.isdir('../../schema/private'): + IS_INTERNAL_BUILD = True + def build_deps(): """ build depency """ root_dir = os.path.dirname(os.path.dirname(os.getcwd())) @@ -31,15 +35,16 @@ def build_deps(): elif IS_LINUX: extra_opts = '-DMNN_TENSORRT=ON \ -DCMAKE_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/ ' if USE_TRT else ' ' + extra_opts += ' -DMNN_INTERNAL=ON ' if IS_INTERNAL_BUILD else ' ' os.system('cmake ' + extra_opts + '-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release\ -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \ - -DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF .. && make MNN MNNTrain MNNConvert MNNOpenCV -j4') + -DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF .. && make MNN MNNTrain MNNConvert -j4') else: os.system('cmake -DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release\ -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_EXPR_SHAPE_EAGER=ON -DMNN_TRAIN_DEBUG=ON\ -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \ - .. && make MNN MNNTrain MNNConvert MNNOpenCV -j4') + .. && make MNN MNNTrain MNNConvert -j4') ################################################################################ # Building dependent libraries ################################################################################ diff --git a/pymnn/pip_package/build_wheel.py b/pymnn/pip_package/build_wheel.py index adc1596d..d028c811 100644 --- a/pymnn/pip_package/build_wheel.py +++ b/pymnn/pip_package/build_wheel.py @@ -8,6 +8,10 @@ parser.add_argument('--x86', dest='x86', action='store_true', default=False, help='build wheel for 32bit arch, only usable on windows') parser.add_argument('--version', dest='version', type=str, required=True, help='MNN dist version') +parser.add_argument('--serving', dest='serving', action='store_true', default=False, + help='build for internal serving, default False') +parser.add_argument('--env', dest='env', type=str, required=False, + help='build environment, e.g. :daily/pre/production') args = parser.parse_args() import os @@ -23,6 +27,8 @@ if __name__ == '__main__': comm_args = '--version ' + args.version if IS_LINUX: comm_args += ' --plat-name=manylinux1_x86_64' + comm_args += ' --env ' + args.env if args.env else '' + comm_args += ' --serving' if args.serving else '' if IS_WINDOWS: os.putenv('DISTUTILS_USE_SDK', '1') os.putenv('MSSdk', '1') diff --git a/pymnn/pip_package/setup.py b/pymnn/pip_package/setup.py index 0aadd7c5..20d3b071 100644 --- a/pymnn/pip_package/setup.py +++ b/pymnn/pip_package/setup.py @@ -10,6 +10,10 @@ parser.add_argument('--x86', dest='x86', action='store_true', default=False, help='build wheel for 32bit arch, only usable on windows') parser.add_argument('--version', dest='version', type=str, required=True, help='MNN dist version') +parser.add_argument('--serving', dest='serving', action='store_true', default=False, + help='build for internal serving, default False') +parser.add_argument('--env', dest='env', type=str, required=False, + help='build environment, e.g. 
:daily/pre/production') args, unknown = parser.parse_known_args() sys.argv = [sys.argv[0]] + unknown @@ -27,7 +31,7 @@ IS_WINDOWS = (platform.system() == 'Windows') IS_DARWIN = (platform.system() == 'Darwin') IS_LINUX = (platform.system() == 'Linux') BUILD_DIR = 'pymnn_build' -BUILD_TYPE = 'RELEASE' +BUILD_TYPE = 'REL_WITH_DEB_INFO' BUILD_ARCH = 'x64' if args.x86: BUILD_ARCH = '' @@ -42,10 +46,12 @@ def report(*args): package_name = 'MNN' USE_TRT=check_env_flag('USE_TRT') +IS_INTERNAL_BUILD = False print ("USE_TRT ", USE_TRT) if os.path.isdir('../../schema/private'): + IS_INTERNAL_BUILD = True if USE_TRT: print("Build Internal NNN with TRT") package_name = 'MNN_Internal_TRT' @@ -81,16 +87,19 @@ def configure_extension_build(): # extra_link_args = ['/NODEFAULTLIB:LIBCMT.LIB'] # /MD links against DLL runtime # and matches the flags set for protobuf and ONNX - # /Z7 turns on symbolic debugging information in .obj files + # /Zi turns on symbolic debugging information in separate .pdb (which is same as MNN.pdb) # /EHa is about native C++ catch support for asynchronous # structured exception handling (SEH) # /DNOMINMAX removes builtin min/max functions # /wdXXXX disables warning no. XXXX - extra_compile_args = ['/MT', '/Z7', + # Some macro (related with __VA_ARGS__) defined in pymnn/src/util.h can not be process correctly + # becase of MSVC bug, enable /experimental:preprocessor fix it (And Windows SDK >= 10.0.18362.1) + extra_compile_args = ['/MT', '/Zi', '/EHa', '/DNOMINMAX', '/wd4267', '/wd4251', '/wd4522', '/wd4522', '/wd4838', '/wd4305', '/wd4244', '/wd4190', '/wd4101', '/wd4996', - '/wd4275'] + '/wd4275', '/experimental:preprocessor'] + extra_link_args = [] else: extra_link_args = [] extra_compile_args = [ @@ -115,7 +124,11 @@ def configure_extension_build(): ] if check_env_flag('WERROR'): extra_compile_args.append('-Werror') - extra_compile_args += ['-DPYMNN_EXPR_API', '-DPYMNN_NUMPY_USABLE', '-DPYMNN_OPENCV_API', '-DPYMNN_IMGCODECS'] + extra_compile_args += ['-DPYMNN_EXPR_API', '-DPYMNN_NUMPY_USABLE', '-DPYMNN_OPENCV_API', '-DPYMNN_IMGCODECS'] + if IS_LINUX and IS_INTERNAL_BUILD and args.serving: + extra_compile_args += ['-DPYMNN_INTERNAL_SERVING'] + if args.env == 'daily': + extra_compile_args += ['-DPYMNN_INTERNAL_SERVING_DAILY'] root_dir = os.getenv('PROJECT_ROOT', os.path.dirname(os.path.dirname(os.getcwd()))) engine_compile_args = ['-DBUILD_OPTYPE', '-DPYMNN_TRAIN_API'] engine_libraries = [] @@ -123,13 +136,21 @@ def configure_extension_build(): engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "train")] engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "cv")] engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "tensorrt")] - print(engine_library_dirs) if USE_TRT: # Note: TensorRT-5.1.5.0/lib should be set in $LIBRARY_PATH of the build system. engine_library_dirs += ['/usr/local/cuda/lib64/'] + # Logging is enabled on Linux. Add the dependencies. 
+ if IS_LINUX and IS_INTERNAL_BUILD: + engine_library_dirs += ['/usr/include/curl/'] + + print(engine_library_dirs) engine_link_args = [] engine_sources = [os.path.join(root_dir, "pymnn", "src", "MNN.cc")] + if IS_LINUX and IS_INTERNAL_BUILD and args.serving: + engine_sources += [os.path.join(root_dir, "pymnn", "src", "internal", "monitor_service.cc")] + engine_sources += [os.path.join(root_dir, "pymnn", "src", "internal", "verify_service.cc")] + engine_sources += [os.path.join(root_dir, "pymnn", "src", "internal", "http_util.cc")] engine_include_dirs = [os.path.join(root_dir, "include")] engine_include_dirs += [os.path.join(root_dir, "express")] engine_include_dirs += [os.path.join(root_dir, "express", "module")] @@ -146,13 +167,19 @@ def configure_extension_build(): engine_include_dirs += [os.path.join(root_dir, "schema", "current")] engine_include_dirs += [os.path.join(root_dir, "3rd_party",\ "flatbuffers", "include")] + if IS_LINUX and IS_INTERNAL_BUILD and args.serving: + engine_include_dirs += [os.path.join(root_dir, "3rd_party", "rapidjson")] # cv include engine_include_dirs += [os.path.join(root_dir, "tools", "cv", "include")] engine_include_dirs += [np.get_include()] trt_depend = ['-lTRT_CUDA_PLUGIN', '-lnvinfer', '-lnvparsers', '-lnvinfer_plugin', '-lcudart'] engine_depend = ['-lMNN'] - engine_depend = ['-lMNN', '-lMNNOpenCV'] + + # enable logging & model authentication on linux. + if IS_LINUX and IS_INTERNAL_BUILD: + engine_depend += ['-lcurl', '-lssl', '-lcrypto'] + if USE_TRT: engine_depend += trt_depend @@ -167,6 +194,9 @@ def configure_extension_build(): # Note: TensorRT-5.1.5.0/lib should be set in $LIBRARY_PATH of the build system. tools_library_dirs += ['/usr/local/cuda/lib64/'] + if IS_LINUX and IS_INTERNAL_BUILD: + tools_library_dirs += ['/usr/include/curl/'] + tools_link_args = [] tools_sources = [os.path.join(root_dir, "pymnn", "src", "MNNTools.cc")] tools_sources += [os.path.join(root_dir, "tools", "quantization",\ @@ -195,61 +225,67 @@ def configure_extension_build(): tools_include_dirs += [os.path.join(root_dir, "source")] tools_include_dirs += [np.get_include()] + tools_depend = ['-lMNN', '-lMNNConvertDeps', '-lprotobuf'] + # enable logging and model authentication on linux. 
+ if IS_LINUX and IS_INTERNAL_BUILD: + tools_depend += ['-lcurl', '-lssl', '-lcrypto'] if USE_TRT: tools_depend += trt_depend - engine_extra_link_args = [] - tools_extra_link_args = [] if IS_DARWIN: - engine_extra_link_args += ['-Wl,-all_load'] - engine_extra_link_args += engine_depend - engine_extra_link_args += ['-Wl,-noall_load'] + engine_link_args += ['-Wl,-all_load'] + engine_link_args += engine_depend + engine_link_args += ['-Wl,-noall_load'] if IS_LINUX: - engine_extra_link_args += ['-Wl,--whole-archive'] - engine_extra_link_args += engine_depend - engine_extra_link_args += ['-fopenmp'] - engine_extra_link_args += ['-Wl,--no-whole-archive'] + engine_link_args += ['-Wl,--whole-archive'] + engine_link_args += engine_depend + engine_link_args += ['-fopenmp'] + engine_link_args += ['-Wl,--no-whole-archive'] if IS_WINDOWS: - engine_extra_link_args += ['/WHOLEARCHIVE:MNN.lib'] + engine_link_args += ['/WHOLEARCHIVE:MNN.lib'] if IS_DARWIN: - tools_extra_link_args += ['-Wl,-all_load'] - tools_extra_link_args += tools_depend - tools_extra_link_args += ['-Wl,-noall_load'] + tools_link_args += ['-Wl,-all_load'] + tools_link_args += tools_depend + tools_link_args += ['-Wl,-noall_load'] if IS_LINUX: - tools_extra_link_args += ['-Wl,--whole-archive'] - tools_extra_link_args += tools_depend - tools_extra_link_args += ['-fopenmp'] - tools_extra_link_args += ['-Wl,--no-whole-archive'] - tools_extra_link_args += ['-lz'] + tools_link_args += ['-Wl,--whole-archive'] + tools_link_args += tools_depend + tools_link_args += ['-fopenmp'] + tools_link_args += ['-Wl,--no-whole-archive'] + tools_link_args += ['-lz'] if IS_WINDOWS: - tools_extra_link_args += ['/WHOLEARCHIVE:MNN.lib'] - tools_extra_link_args += ['/WHOLEARCHIVE:MNNConvertDeps.lib'] + tools_link_args += ['/WHOLEARCHIVE:MNN.lib'] + tools_link_args += ['/WHOLEARCHIVE:MNNConvertDeps.lib'] + tools_link_args += ['libprotobuf.lib'] # use wholearchive will cause lnk1241 (version.rc specified) if BUILD_TYPE == 'DEBUG': + # Need pythonxx_d.lib, which seem not exist in miniconda ? 
if IS_WINDOWS: - extra_link_args.append('/DEBUG:FULL') + extra_compile_args += ['/DEBUG', '/UNDEBUG', '/DDEBUG', '/Od', '/Ob0', '/MTd'] + extra_link_args += ['/DEBUG', '/UNDEBUG', '/DDEBUG', '/Od', '/Ob0', '/MTd'] else: extra_compile_args += ['-O0', '-g'] extra_link_args += ['-O0', '-g'] if BUILD_TYPE == 'REL_WITH_DEB_INFO': if IS_WINDOWS: - extra_link_args.append('/DEBUG:FULL') + extra_compile_args += ['/DEBUG'] + extra_link_args += ['/DEBUG', '/OPT:REF', '/OPT:ICF'] else: extra_compile_args += ['-g'] extra_link_args += ['-g'] - +# compat with py39 def make_relative_rpath(path): """ make rpath """ if IS_DARWIN: - return '-Wl,-rpath,@loader_path/' + path + return ['-Wl,-rpath,@loader_path/' + path] elif IS_WINDOWS: - return '' + return [] else: - return '-Wl,-rpath,$ORIGIN/' + path + return ['-Wl,-rpath,$ORIGIN/' + path] ################################################################################ # Declare extensions and package @@ -263,8 +299,8 @@ def configure_extension_build(): extra_compile_args=engine_compile_args + extra_compile_args,\ include_dirs=engine_include_dirs,\ library_dirs=engine_library_dirs,\ - extra_link_args=engine_extra_link_args + engine_link_args\ - + [make_relative_rpath('lib')]) + extra_link_args=engine_link_args + extra_link_args\ + + make_relative_rpath('lib')) extensions.append(engine) tools = Extension("_tools",\ libraries=tools_libraries,\ @@ -273,8 +309,8 @@ def configure_extension_build(): extra_compile_args=tools_compile_args + extra_compile_args,\ include_dirs=tools_include_dirs,\ library_dirs=tools_library_dirs,\ - extra_link_args=tools_extra_link_args +tools_link_args\ - + [make_relative_rpath('lib')]) + extra_link_args=tools_link_args + extra_link_args\ + + make_relative_rpath('lib')) extensions.append(tools) # These extensions are built by cmake and copied manually in build_extensions() # inside the build_ext implementaiton diff --git a/pymnn/src/MNN.cc b/pymnn/src/MNN.cc index 06d91481..5805f52c 100644 --- a/pymnn/src/MNN.cc +++ b/pymnn/src/MNN.cc @@ -19,7 +19,9 @@ static int tls_key_2 = 0; #include #include using namespace MNN::Express; +#ifdef PYMNN_OPENCV_API #include "cv/cv.hpp" +#endif #endif // PYMNN_EXPR_API #ifdef BUILD_OPTYPE @@ -64,6 +66,12 @@ using RegularizationMethod = ParameterOptimizer::RegularizationMethod; #endif #endif +#ifdef PYMNN_INTERNAL_SERVING +#include +#include "internal/monitor_service.h" +#include "internal/verify_service.h" +#endif + struct MNN_TLSData { PyObject *PyMNNHalideTypeInt = NULL; PyObject *PyMNNHalideTypeInt64 = NULL; @@ -187,6 +195,10 @@ static PyObject* PyMNNInterpreter_new(struct _typeobject *type, PyObject *args, static int PyMNNInterpreter_init(PyMNNInterpreter *self, PyObject *args, PyObject *kwds); static void PyMNNInterpreter_dealloc(PyMNNInterpreter *); +#ifdef PYMNN_INTERNAL_SERVING +static PyObject* PyMNNInterpreter_createSessionWithToken(PyMNNInterpreter *self, PyObject *args); +#endif + static PyMethodDef PyMNNInterpreter_methods[] = { {"createRuntime", (PyCFunction)PyMNNInterpreter_createRuntime, METH_VARARGS | METH_STATIC, "create runtime"}, {"createSession", (PyCFunction)PyMNNInterpreter_createSession, METH_VARARGS, "create session"}, @@ -205,6 +217,9 @@ static PyMethodDef PyMNNInterpreter_methods[] = { {"cache", (PyCFunction)PyMNNInterpreter_cache, METH_VARARGS, "cache current net instance"}, {"removeCache", (PyCFunction)PyMNNInterpreter_removeCache, METH_VARARGS, "remove cache with given path"}, {"updateSessionToModel", (PyCFunction)PyMNNInterpreter_updateSessionToModel, METH_VARARGS, 
"updateSessionToModel"}, +#ifdef PYMNN_INTERNAL_SERVING + {"createSessionWithToken", (PyCFunction)PyMNNInterpreter_createSessionWithToken, METH_VARARGS, "create session with token"}, +#endif {NULL} /* Sentinel */ }; @@ -681,13 +696,7 @@ static PyObject* PyMNNInterpreter_createRuntime(PyObject* self, PyObject* args) return res; } -static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject *args) { - PyMNNInterpreter* instance = (PyMNNInterpreter *)self; - PyObject* dict = NULL, *rtinfo_py = NULL; - if (!PyArg_ParseTuple(args, "|OO", &dict, &rtinfo_py)) { - return NULL; - } - +static PyObject* createSession(PyMNNInterpreter *self, PyObject* dict, PyObject *rtinfo_py) { PyObject *f = importName("MNN", "Session"); if (!f || !PyCallable_Check(f)) { PyErr_SetString(PyExc_Exception, @@ -715,10 +724,10 @@ static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject } Session* s; if (rtinfo_py == NULL) { - s = instance->interpreter->createSession(config.second.first); + s = self->interpreter->createSession(config.second.first); } else { auto runtimeinfo = *(RuntimeInfo*)PyCapsule_GetPointer(rtinfo_py, NULL); - s = instance->interpreter->createSession(config.second.first, runtimeinfo); + s = self->interpreter->createSession(config.second.first, runtimeinfo); } if (!s) { PyErr_SetString(PyExc_Exception, @@ -727,11 +736,54 @@ static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject } session->session = s; - session->modelPath = instance->modelPath; + session->modelPath = self->modelPath; return (PyObject *)session; } +static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject *args) { +#ifdef PYMNN_INTERNAL_SERVING + PyErr_SetString(PyExc_Exception, + "PyMNNInterpreter_createSession: unsupported interface, should use createSessionWithToken."); + return NULL; +#endif + PyMNNInterpreter* instance = (PyMNNInterpreter *)self; + PyObject* dict = NULL, *rtinfo_py = NULL; + if (!PyArg_ParseTuple(args, "|OO", &dict, &rtinfo_py)) { + return NULL; + } + + return createSession(instance, dict, rtinfo_py); +} + +#ifdef PYMNN_INTERNAL_SERVING +static PyObject* PyMNNInterpreter_createSessionWithToken(PyMNNInterpreter *self, PyObject *args) { + PyMNNInterpreter* instance = (PyMNNInterpreter *)self; + PyObject* dict = NULL, *rtinfo_py = NULL; + char *token = NULL; + char *scene = NULL; + char *app_key = NULL; + if (!PyArg_ParseTuple(args, "sss|OO", &token, &scene, &app_key, &dict, &rtinfo_py)) { + return NULL; + } + + if (!token || !scene || !app_key) { + PyErr_SetString(PyExc_Exception, + "PyMNNInterpreter_createSessionWithToken: input invalid, token, scene or app_key is null."); + return NULL; + } + + bool ret = VerifyService::GetInstance().VerifyToken(std::string(token), std::string(scene), std::string(app_key)); + if (!ret) { + PyErr_SetString(PyExc_Exception, + "PyMNNNN_load_module_from_file_with_token: check token failed, return null session."); + return NULL; + } + + return createSession(instance, dict, rtinfo_py); +} +#endif + static PyObject* PyMNNInterpreter_resizeSession(PyMNNInterpreter *self, PyObject *args) { PyMNNSession* session = NULL; if (!PyArg_ParseTuple(args, "O", &session)) { @@ -826,12 +878,27 @@ static PyObject* PyMNNInterpreter_runSession(PyMNNInterpreter *self, PyObject *a } ErrorCode r = NO_ERROR; Py_BEGIN_ALLOW_THREADS + +#ifdef PYMNN_INTERNAL_SERVING + Timer timer; r = self->interpreter->runSession(session->session); + float cost_time = (float)timer.durationInUs() / (float)1000; + 
MNN::Interpreter::SessionInfoCode info_type = MNN::Interpreter::BACKENDS; + int backendType[MNN_FORWARD_ALL]; + self->interpreter->getSessionInfo(session->session, info_type, backendType); + std::string mBizCode = self->interpreter->bizCode() ? self->interpreter->bizCode() : ""; + std::string mUuid = self->interpreter->uuid() ? self->interpreter->uuid() : ""; + MonitorService::GetInstance().Track(cost_time, std::to_string(*backendType), "RUN_SESSION", + "PyMNNInterpreter_runSession", std::to_string(r), mBizCode, mUuid); +#else + r = self->interpreter->runSession(session->session); +#endif + Py_END_ALLOW_THREADS return PyLong_FromLong(r); } static PyMNNTensor* getTensor() { - PyMNNTensor *tensor = (PyMNNTensor *)PyObject_Call((PyObject*)&PyMNNTensorType, PyTuple_New(0), NULL); + PyMNNTensor *tensor = (PyMNNTensor *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNNTensorType), PyTuple_New(0), NULL); if (tensor) { tensor->tensor = nullptr; } @@ -1222,6 +1289,12 @@ static int PyMNNInterpreter_init(PyMNNInterpreter *self, PyObject *args, PyObjec return -1; } +#ifdef PYMNN_INTERNAL_SERVING + // initialize MonitorService + MonitorService::GetInstance().Start(); + VerifyService::GetInstance().Start(); +#endif + return 0; } @@ -1315,7 +1388,7 @@ static PyObject* PyMNNSession_removeCache(PyMNNSession *self, PyObject *args) { /// MNN Tensor implementation bool isTensor(PyObject* t) { - return PyObject_IsInstance(t, (PyObject*)&PyMNNTensorType); + return PyObject_IsInstance(t, (PyObject*)PyType_FindTLSType(&PyMNNTensorType)); } Tensor* toTensor(PyObject* t) { return ((PyMNNTensor*)t)->tensor; @@ -1337,17 +1410,32 @@ static void PyMNNTensor_dealloc(PyMNNTensor *self) { static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) { int argc = PyTuple_Size(args); - PyObject *shape, *dataType, *data = nullptr, *input_tensor = nullptr; - long dimensionType; + PyObject *shape, *dataType, *data = nullptr, *input_tensor = nullptr, *input_var = nullptr; + long dimensionType = -1; bool parse_res = false; switch (argc) { case 0: // just return, using in `PyMNNInterpreter_getSessionInputAll`; return 0; +#ifdef PYMNN_EXPR_API + case 1: + parse_res = PyArg_ParseTuple(args, "O", &input_var) + && isVar(input_var); + break; + case 2: + parse_res = PyArg_ParseTuple(args, "Ol", &input_tensor, &dimensionType) + && (isTensor(input_tensor) || isVar(input_tensor)); + if (isVar(input_tensor)) { + input_var = input_tensor; + input_tensor = nullptr; + } + break; +#else case 2: parse_res = PyArg_ParseTuple(args, "Ol", &input_tensor, &dimensionType) && isTensor(input_tensor); break; +#endif case 3: parse_res = PyArg_ParseTuple(args, "OOl", &shape, &dataType, &dimensionType) && isInts(shape); @@ -1361,11 +1449,35 @@ static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) { } if (!parse_res) { PyMNN_ERROR_LOG("Tensor init require args as belows:\n" - "\t1. (Tensor, DimensionType)\n" + "\t0. (Var)\n" + "\t1. (Tensor/Var, DimensionType)\n" "\t2. ([int], DataType, DimensionType)\n" "\t3. ([int], DataType, tuple/ndarray, DimensionType)\n"); + return -1; } - +#ifdef PYMNN_EXPR_API + // 0. 
create Tensor by Var + if (input_var) { + auto var = toVar(input_var); + auto info = var->getInfo(); + void* ptr = const_cast<void*>(var->readMap<void>()); + Tensor::DimensionType type = Tensor::TENSORFLOW; + if (dimensionType < 0) { + if (info->order == NCHW) type = Tensor::CAFFE; + else if (info->order == NC4HW4) type = Tensor::CAFFE_C4; + } else { + type = static_cast<Tensor::DimensionType>(dimensionType); + } + Tensor *tensor = Tensor::create(info->dim, info->type, ptr, type); + if (!tensor) { + PyMNN_ERROR_LOG("PyMNNTensor_create: Tensor create failed"); + return -1; + } + self->tensor = tensor; + self->owner = 2; + return 0; + } +#endif // 1. create Tensor by Tensor if (input_tensor) { Tensor *tensor = new Tensor(toTensor(input_tensor), (Tensor::DimensionType)dimensionType, true); @@ -1809,8 +1921,12 @@ static PyObject* PyMNNCVImageProcess_convert(PyMNNCVImageProcess *self, PyObject return NULL; } - if (PyLong_Check(source)) { - ErrorCode ret = self->imageProcess->convert(reinterpret_cast<const uint8_t*>(PyLong_AsLong(source)), + if (isInt(source)) { + auto ptr = PyLong_AsVoidPtr(source); + if (ptr == NULL) { + Py_RETURN_NONE; + } + ErrorCode ret = self->imageProcess->convert(reinterpret_cast<const uint8_t*>(ptr), iw, ih, stride, ((PyMNNTensor *)dest)->tensor); return PyLong_FromLong(ret); @@ -1949,46 +2065,70 @@ static PyObject* PyMNNCVImageProcess_setPadding(PyMNNCVImageProcess *self, PyObj /// MNN CVMatrix implementation bool isMatrix(PyObject* obj) { - return PyObject_IsInstance(obj, (PyObject*)&PyMNNCVMatrixType); + return PyObject_IsInstance(obj, (PyObject*)PyType_FindTLSType(&PyMNNCVMatrixType)); } CV::Matrix toMatrix(PyObject* obj) { return *(((PyMNNCVMatrix*)obj)->matrix); } PyObject* toPyObj(CV::Matrix m) { - PyMNNCVMatrix *ret = (PyMNNCVMatrix *)PyObject_Call((PyObject*)&PyMNNCVMatrixType, PyTuple_New(0), NULL); + PyMNNCVMatrix *ret = (PyMNNCVMatrix *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNNCVMatrixType), PyTuple_New(0), NULL); ret->matrix = new CV::Matrix(); *(ret->matrix) = m; return (PyObject*)ret; } -bool isSize(PyObject* obj) { - return (isInts(obj) && toInts(obj).size() == 2); -} -CV::Size toSize(PyObject* obj) { - auto vals = toInts(obj); - MNN_ASSERT(val.size() == 2); - return CV::Size(vals[0], vals[1]); -} + bool isPoint(PyObject* obj) { - return (isFloats(obj) && toFloats(obj).size() == 2); + return (isFloats(obj) && toFloats(obj).size() == 2) || + (isInts(obj) && toInts(obj).size() == 2); } CV::Point toPoint(PyObject* obj) { - auto vals = toFloats(obj); - MNN_ASSERT(val.size() == 2); CV::Point point; - point.set(vals[0], vals[1]); + if (isFloats(obj)) { + auto vals = toFloats(obj); + MNN_ASSERT(vals.size() == 2); + point.set(vals[0], vals[1]); + } else if (isInts(obj)) { + auto vals = toInts(obj); + MNN_ASSERT(vals.size() == 2); + point.set(vals[0], vals[1]); + } return point; } bool isPoints(PyObject* obj) { - return (isFloats(obj) && toFloats(obj).size() % 2 == 0); + return (isFloats(obj) && toFloats(obj).size() % 2 == 0) || + (isInts(obj) && toInts(obj).size() % 2 == 0) || isVar(obj); } std::vector<CV::Point> toPoints(PyObject* obj) { - auto vals = toFloats(obj); - MNN_ASSERT(val.size() % 2 == 0); - std::vector<CV::Point> points(vals.size() / 2); - for (int i = 0; i < points.size(); i++) { - points[i].set(vals[i*2], vals[i*2+1]); + if (isFloats(obj)) { + auto vals = toFloats(obj); + MNN_ASSERT(vals.size() % 2 == 0); + std::vector<CV::Point> points(vals.size() / 2); + for (int i = 0; i < points.size(); i++) { + points[i].set(vals[i*2], vals[i*2+1]); + } + return points; } - return points; + if (isInts(obj)) { + auto vals = toInts(obj);
MNN_ASSERT(vals.size() % 2 == 0); + std::vector points(vals.size() / 2); + for (int i = 0; i < points.size(); i++) { + points[i].set(vals[i*2], vals[i*2+1]); + } + return points; + } + if (isVar(obj)) { + auto vals = toVar(obj); + auto size = vals->getInfo()->size; + MNN_ASSERT(size % 2 == 0); + std::vector points(size / 2); + auto ptr = vals->readMap(); + for (int i = 0; i < points.size(); i++) { + points[i].set(ptr[i*2], ptr[i*2+1]); + } + return points; + } + return {}; } PyObject* toPyObj(std::vector _points) { std::vector points(_points.size() * 2); @@ -2494,7 +2634,7 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) { PyErr_SetString(PyExc_Exception, "initMNN.expr: PyType_Ready PyMNNVarType failed"); ERROR_RETURN } - PyModule_AddObject(expr_module, "Var", (PyObject *)&PyMNNVarType); + PyModule_AddObject(expr_module, "Var", (PyObject *)PyType_FindTLSType(&PyMNNVarType)); // def enum def_data_format(expr_module); def_dtype(expr_module); @@ -2547,6 +2687,7 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) { def_ThresholdTypes(cv_module); def_RetrievalModes(cv_module); def_ContourApproximationModes(cv_module); + def_LineTypes(cv_module); // add methods of cv constexpr int cv_method_num = sizeof(PyMNNCV_methods) / sizeof(PyMethodDef); for (int i = 0; i < cv_method_num; i++) { @@ -2571,6 +2712,10 @@ void loadMNN() { WeImport_AppendInittab(MOD_NAME, MOD_INIT_FUNC); }); } +void* memoryToVar(const void* ptr, int h, int w, int c, int type) { + auto var = Express::_Const(ptr, {h, w, c}, NHWC, dtype2htype(static_cast(type))); + return reinterpret_cast(toPyObj(var)); +} static auto registerMNN = []() { loadMNN(); return true; diff --git a/pymnn/src/MNNPyBridge.h b/pymnn/src/MNNPyBridge.h index 1027b2ea..9e702f61 100644 --- a/pymnn/src/MNNPyBridge.h +++ b/pymnn/src/MNNPyBridge.h @@ -17,4 +17,12 @@ #define PYMNN_PUBLIC #endif // WIN32 -extern "C" PYMNN_PUBLIC void loadMNN(); \ No newline at end of file +// memoryToVar's type define +#define TypeFloat 1 +#define TypeDouble 2 +#define TypeInt 3 +#define TypeUint8 4 +#define TypeInt8 6 +#define TypeInt64 9 +extern "C" PYMNN_PUBLIC void loadMNN(); +extern "C" PYMNN_PUBLIC void* memoryToVar(void* ptr, int h, int w, int c, int type); \ No newline at end of file diff --git a/pymnn/src/cv.h b/pymnn/src/cv.h index 14df2793..7e8fac4d 100644 --- a/pymnn/src/cv.h +++ b/pymnn/src/cv.h @@ -99,10 +99,22 @@ def_enum(ContourApproximationModes, CV::ContourApproximationModes, CV::CHAIN_APPROX_TC89_L1, "CHAIN_APPROX_TC89_L1", CV::CHAIN_APPROX_TC89_KCOS, "CHAIN_APPROX_TC89_KCOS" ) +def_enum(LineTypes, CV::LineTypes, + CV::FILLED, "FILLED", + CV::LINE_4, "LINE_4", + CV::LINE_8, "LINE_8", + CV::LINE_AA, "LINE_AA" + ) // helper functions INTS default_size = {0, 0}, default_param = {}; -bool isSize(PyObject* obj); -CV::Size toSize(PyObject* obj); +bool isSize(PyObject* obj) { + return (isInts(obj) && toInts(obj).size() == 2); +} +CV::Size toSize(PyObject* obj) { + auto vals = toInts(obj); + MNN_ASSERT(val.size() == 2); + return CV::Size(vals[0], vals[1]); +} bool isPoint(PyObject* obj); CV::Point toPoint(PyObject* obj); bool isPoints(PyObject* obj); @@ -378,24 +390,28 @@ static PyObject* PyMNNCV_invertAffineTransform(PyObject *self, PyObject *args) { } PyMNN_ERROR("invertAffineTransform require args: (Matrix)"); } +std::vector default_floats = {}; static PyObject* PyMNNCV_resize(PyObject *self, PyObject *args) { - PyObject *src, *dsize, *interpolation = toPyObj(CV::INTER_LINEAR); + PyObject *src, *dsize, *interpolation = toPyObj(CV::INTER_LINEAR), + *mean = toPyObj(default_floats), *norm = 
diff --git a/pymnn/src/cv.h b/pymnn/src/cv.h
index 14df2793..7e8fac4d 100644
--- a/pymnn/src/cv.h
+++ b/pymnn/src/cv.h
@@ -99,10 +99,22 @@ def_enum(ContourApproximationModes, CV::ContourApproximationModes,
          CV::CHAIN_APPROX_TC89_L1, "CHAIN_APPROX_TC89_L1",
          CV::CHAIN_APPROX_TC89_KCOS, "CHAIN_APPROX_TC89_KCOS"
          )
+def_enum(LineTypes, CV::LineTypes,
+         CV::FILLED, "FILLED",
+         CV::LINE_4, "LINE_4",
+         CV::LINE_8, "LINE_8",
+         CV::LINE_AA, "LINE_AA"
+         )
 // helper functions
 INTS default_size = {0, 0}, default_param = {};
-bool isSize(PyObject* obj);
-CV::Size toSize(PyObject* obj);
+bool isSize(PyObject* obj) {
+    return (isInts(obj) && toInts(obj).size() == 2);
+}
+CV::Size toSize(PyObject* obj) {
+    auto vals = toInts(obj);
+    MNN_ASSERT(vals.size() == 2);
+    return CV::Size(vals[0], vals[1]);
+}
 bool isPoint(PyObject* obj);
 CV::Point toPoint(PyObject* obj);
 bool isPoints(PyObject* obj);
@@ -378,24 +390,28 @@ static PyObject* PyMNNCV_invertAffineTransform(PyObject *self, PyObject *args) {
     }
     PyMNN_ERROR("invertAffineTransform require args: (Matrix)");
 }
+std::vector<float> default_floats = {};
 static PyObject* PyMNNCV_resize(PyObject *self, PyObject *args) {
-    PyObject *src, *dsize, *interpolation = toPyObj(CV::INTER_LINEAR);
+    PyObject *src, *dsize, *interpolation = toPyObj(CV::INTER_LINEAR),
+             *mean = toPyObj(default_floats), *norm = toPyObj(default_floats);
     float fx = 0, fy = 0;
-    if (PyArg_ParseTuple(args, "OO|ffO", &src, &dsize, &fx, &fy, &interpolation) &&
-        isVar(src) && isSize(dsize) && isInterpolationFlags(interpolation)) {
-        return toPyObj(CV::resize(toVar(src), toSize(dsize), fx, fy, toEnum<CV::InterpolationFlags>(interpolation)));
+    int code = -1;
+    if (PyArg_ParseTuple(args, "OO|ffOiOO", &src, &dsize, &fx, &fy, &interpolation, &code, &mean, &norm) &&
+        isVar(src) && isSize(dsize) && isInterpolationFlags(interpolation) && isFloats(mean) && isFloats(norm)) {
+        return toPyObj(CV::resize(toVar(src), toSize(dsize), fx, fy, toEnum<CV::InterpolationFlags>(interpolation),
+                                  code, toFloats(mean), toFloats(norm)));
     }
-    PyMNN_ERROR("resize require args: (Var, [int], |float, float, InterpolationFlags)");
+    PyMNN_ERROR("resize require args: (Var, [int], |float, float, InterpolationFlags, int, [float], [float])");
 }
 static PyObject* PyMNNCV_warpAffine(PyObject *self, PyObject *args) {
-    PyObject *src, *M, *dsize, *flag = toPyObj(CV::INTER_LINEAR), *borderMode = toPyObj(CV::BORDER_CONSTANT);
-    int borderValue = 0;
-    if (PyArg_ParseTuple(args, "OOO|OOi", &src, &M, &dsize, &flag, &borderMode, &borderValue) &&
-        isVar(src) && isMatrix(M) && isSize(dsize) && isInterpolationFlags(flag) && isBorderTypes(borderMode)) {
+    PyObject *src, *M, *dsize, *flag = toPyObj(CV::INTER_LINEAR), *borderMode = toPyObj(CV::BORDER_CONSTANT),
+             *mean = toPyObj(default_floats), *norm = toPyObj(default_floats);
+    int borderValue = 0, code = -1;
+    if (PyArg_ParseTuple(args, "OOO|OOiiOO", &src, &M, &dsize, &flag, &borderMode, &borderValue, &code, &mean, &norm) &&
+        isVar(src) && isMatrix(M) && isSize(dsize) && isInterpolationFlags(flag) && isBorderTypes(borderMode) &&
+        isFloats(mean) && isFloats(norm)) {
         return toPyObj(CV::warpAffine(toVar(src), toMatrix(M), toSize(dsize),
-                                      toEnum<CV::InterpolationFlags>(flag), toEnum<CV::BorderTypes>(borderMode), borderValue));
+                                      toEnum<CV::InterpolationFlags>(flag), toEnum<CV::BorderTypes>(borderMode),
+                                      borderValue, code, toFloats(mean), toFloats(norm)));
     }
-    PyMNN_ERROR("warpAffine require args: (Var, Matrix, [int], |InterpolationFlags, BorderTypes, int)");
+    PyMNN_ERROR("warpAffine require args: (Var, Matrix, [int], |InterpolationFlags, BorderTypes, int, int, [float], [float])");
 }
 static PyObject* PyMNNCV_warpPerspective(PyObject *self, PyObject *args) {
     PyObject *src, *M, *dsize, *flag = toPyObj(CV::INTER_LINEAR), *borderMode = toPyObj(CV::BORDER_CONSTANT);
@@ -433,7 +449,7 @@ static PyObject* PyMNNCV_findContours(PyObject *self, PyObject *args) {
         auto contours = CV::findContours(toVar(image), toEnum<CV::RetrievalModes>(mode), toEnum<CV::ContourApproximationModes>(method), toPoint(offset));
         PyObject* obj = PyTuple_New(2);
-        PyTuple_SetItem(obj, 0, toPyObj<std::vector<CV::Point>, toPyObj>(contours));
+        PyTuple_SetItem(obj, 0, toPyObj(contours));
         PyTuple_SetItem(obj, 1, toPyObj("no hierarchy"));
         return obj;
     }
@@ -442,24 +458,29 @@ static PyObject* PyMNNCV_findContours(PyObject *self, PyObject *args) {
 static PyObject* PyMNNCV_contourArea(PyObject *self, PyObject *args) {
     PyObject *points;
     int oriented = 0;
-    if (PyArg_ParseTuple(args, "O|i", &points, &oriented) && isPoints(points)) {
-        float area = CV::contourArea(toPoints(points), oriented);
-        return toPyObj(area);
+    if (PyArg_ParseTuple(args, "O|i", &points, &oriented) && isVar(points)) {
+        float res = CV::contourArea(toVar(points), oriented);
+        return toPyObj(res);
     }
-    PyMNN_ERROR("contourArea require args: ([float], |bool)");
+    PyMNN_ERROR("contourArea require args: (Var, |bool)");
 }
 static PyObject* PyMNNCV_convexHull(PyObject *self, PyObject *args) {
     PyObject *points;
     int clockwise = 0, returnPoints = 1;
-    if (PyArg_ParseTuple(args, "O|ii", &points, &clockwise, 
&returnPoints) && isPoints(points)) { - return toPyObj(CV::convexHull(toPoints(points), clockwise, returnPoints)); + if (PyArg_ParseTuple(args, "O|ii", &points, &clockwise, &returnPoints) && isVar(points)) { + auto res = CV::convexHull(toVar(points), clockwise, returnPoints); + if (returnPoints) { + int npoints = res.size() / 2; + return toPyObj(Express::_Const(res.data(), { npoints, 1, 2 }, NHWC, halide_type_of())); + } + return toPyObj(res); } - PyMNN_ERROR("convexHull require args: ([float], |bool, bool)"); + PyMNN_ERROR("convexHull require args: (Var, |bool, bool)"); } static PyObject* PyMNNCV_minAreaRect(PyObject *self, PyObject *args) { PyObject *points; - if (PyArg_ParseTuple(args, "O", &points) && isPoints(points)) { - auto rect = CV::minAreaRect(toPoints(points)); + if (PyArg_ParseTuple(args, "O", &points) && isVar(points)) { + auto rect = CV::minAreaRect(toVar(points)); PyObject* center = PyTuple_New(2); PyTuple_SetItem(center, 0, toPyObj(rect.center.x)); PyTuple_SetItem(center, 1, toPyObj(rect.center.y)); @@ -472,16 +493,16 @@ static PyObject* PyMNNCV_minAreaRect(PyObject *self, PyObject *args) { PyTuple_SetItem(obj, 2, toPyObj(rect.angle)); return obj; } - PyMNN_ERROR("minAreaRect require args: ([float])"); + PyMNN_ERROR("minAreaRect require args: (Var)"); } static PyObject* PyMNNCV_boundingRect(PyObject *self, PyObject *args) { PyObject *points; - if (PyArg_ParseTuple(args, "O", &points) && isPoints(points)) { - auto rect = CV::boundingRect(toPoints(points)); + if (PyArg_ParseTuple(args, "O", &points) && isVar(points)) { + auto rect = CV::boundingRect(toVar(points)); std::vector res { rect.x, rect.y, rect.width, rect.height }; return toPyObj(res); } - PyMNN_ERROR("boundingRect require args: ([float])"); + PyMNN_ERROR("boundingRect require args: (Var)"); } static PyObject* PyMNNCV_connectedComponentsWithStats(PyObject *self, PyObject *args) { PyObject *image; @@ -518,17 +539,106 @@ static PyObject* PyMNNCV_boxPoints(PyObject *self, PyObject *args) { error_: PyMNN_ERROR("boxPoints require args: [(float, (float, float), (float, float))])"); } +// draw +static bool isColor(PyObject* obj) { + return (isInts(obj) && (toInts(obj).size() == 3 || toInts(obj).size() == 4)); +} +CV::Scalar toColor(PyObject* obj) { + auto vals = toInts(obj); + if (vals.size() == 3) { + return CV::Scalar(vals[0], vals[1], vals[2]); + } + if (vals.size() == 4) { + return CV::Scalar(vals[0], vals[1], vals[2], vals[3]); + } + return CV::Scalar(255, 255, 255); +} +static PyObject* PyMNNCV_line(PyObject *self, PyObject *args) { + PyObject *img, *pt1, *pt2, *color, *linetype = toPyObj(CV::LINE_8); + int thickness = 1, shift = 0; + if (PyArg_ParseTuple(args, "OOOO|iOi", &img, &pt1, &pt2, &color, &thickness, &linetype, &shift) + && isVar(img) && isPoint(pt1) && isPoint(pt2) && isColor(color) && isLineTypes(linetype)) { + auto image = toVar(img); + CV::line(image, toPoint(pt1), toPoint(pt2), toColor(color), + thickness, toEnum(linetype), shift); + Py_RETURN_NONE; + } + PyMNN_ERROR("line require args: (Var, Point, Point, Color, |int, LineType, int)"); +} +static PyObject* PyMNNCV_arrowedLine(PyObject *self, PyObject *args) { + PyObject *img, *pt1, *pt2, *color, *linetype = toPyObj(CV::LINE_8); + int thickness = 1, shift = 0; + float tipLength = 0.1; + if (PyArg_ParseTuple(args, "OOOO|iOif", &img, &pt1, &pt2, &color, &thickness, &linetype, &shift, &tipLength) + && isVar(img) && isPoint(pt1) && isPoint(pt2) && isColor(color) && isLineTypes(linetype)) { + auto image = toVar(img); + CV::arrowedLine(image, toPoint(pt1), 
toPoint(pt2), toColor(color), + thickness, toEnum(linetype), shift, tipLength); + Py_RETURN_NONE; + } + PyMNN_ERROR("arrowedLine require args: (Var, Point, Point, Color, |int, LineType, int, float)"); +} +static PyObject* PyMNNCV_circle(PyObject *self, PyObject *args) { + PyObject *img, *center, *color, *linetype = toPyObj(CV::LINE_8); + int radius, thickness = 1, shift = 0; + if (PyArg_ParseTuple(args, "OOiO|iOi", &img, ¢er, &radius, &color, &thickness, &linetype, &shift) + && isVar(img) && isPoint(center) && isColor(color) && isLineTypes(linetype)) { + auto image = toVar(img); + CV::circle(image, toPoint(center), radius, toColor(color), + thickness, toEnum(linetype), shift); + Py_RETURN_NONE; + } + PyMNN_ERROR("circle require args: (Var, Point, int, Color, |int, LineType, int)"); +} +static PyObject* PyMNNCV_rectangle(PyObject *self, PyObject *args) { + PyObject *img, *pt1, *pt2, *color, *linetype = toPyObj(CV::LINE_8); + int thickness = 1, shift = 0; + if (PyArg_ParseTuple(args, "OOOO|iOi", &img, &pt1, &pt2, &color, &thickness, &linetype, &shift) + && isVar(img) && isPoint(pt1) && isPoint(pt2) && isColor(color) && isLineTypes(linetype)) { + auto image = toVar(img); + CV::rectangle(image, toPoint(pt1), toPoint(pt2), toColor(color), + thickness, toEnum(linetype), shift); + Py_RETURN_NONE; + } + PyMNN_ERROR("rectangle require args: (Var, Point, Point, Color, |int, LineType, int)"); +} +static PyObject* PyMNNCV_drawContours(PyObject *self, PyObject *args) { + PyObject *img, *contours, *color, *linetype = toPyObj(CV::LINE_8); + int contourIdx, thickness = 1; + if (PyArg_ParseTuple(args, "OOiO|iO", &img, &contours, &contourIdx, &color, &thickness, &linetype) + && isVar(img) && isVec(contours) && isColor(color) && isLineTypes(linetype)) { + auto image = toVar(img); + CV::drawContours(image, toVec, toPoints>(contours), contourIdx, toColor(color), + thickness, toEnum(linetype)); + Py_RETURN_NONE; + } + PyMNN_ERROR("drawContours require args: (Var, [Points], int, Color, |int, LineType)"); +} +static PyObject* PyMNNCV_fillPoly(PyObject *self, PyObject *args) { + PyObject *img, *contours, *color, *linetype = toPyObj(CV::LINE_8), *offset = toPyObj(std::vector{0, 0}); + int shift = 0; + if (PyArg_ParseTuple(args, "OOO|OiO", &img, &contours, &color, &linetype, &shift, &offset) + && isVar(img) && isVec(contours) && isColor(color) && isLineTypes(linetype) && isPoint(offset)) { + auto image = toVar(img); + CV::fillPoly(image, toVec, toPoints>(contours), toColor(color), + toEnum(linetype), shift, toPoint(offset)); + Py_RETURN_NONE; + } + PyMNN_ERROR("fillPoly require args: (Var, [Points], Color, |LineType, int, Point)"); +} static PyMethodDef PyMNNCV_methods[] = { - register_methods(CV, #ifdef PYMNN_IMGCODECS + register_methods(CV, // imgcodecs haveImageReader, "haveImageReader", haveImageWriter, "haveImageWriter", imdecode, "imdecode", imencode, "imencode", imread, "imread", - imwrite, "imwrite", + imwrite, "imwrite" + ) #endif + register_methods(CV, // color cvtColor, "cvtColor.", cvtColorTwoPlane, "cvtColorTwoPlane.", @@ -569,6 +679,13 @@ static PyMethodDef PyMNNCV_methods[] = { minAreaRect, "minAreaRect", boundingRect, "boundingRect", connectedComponentsWithStats, "connectedComponentsWithStats", - boxPoints, "boxPoints" + boxPoints, "boxPoints", + // draw + line, "line", + arrowedLine, "arrowedLine", + circle, "circle", + rectangle, "rectangle", + drawContours, "drawContours", + fillPoly, "fillPoly" ) }; diff --git a/pymnn/src/expr.h b/pymnn/src/expr.h index 638b12da..fe11e3c9 100644 --- 
a/pymnn/src/expr.h +++ b/pymnn/src/expr.h @@ -63,6 +63,7 @@ def_enum(PrecisionMode, PrecisionMode, typedef struct { PyObject_HEAD VARP* var; + int iter_index; } PyMNNVar; static PyObject* PyMNNVar_new(PyTypeObject *type, PyObject *args, PyObject *kwds); static void PyMNNVar_dealloc(PyMNNVar *self); @@ -137,6 +138,9 @@ static PyObject* PyMNNVar_negative(PyObject*); static PyObject* PyMNNVar_absolute(PyObject*); static Py_ssize_t PyMNNVar_length(PyObject*); static PyObject* PyMNNVar_subscript(PyObject*, PyObject*); +static int PyMNNVar_ass_subscript(PyObject*, PyObject*, PyObject*); +static PyObject* PyMNNVar_iter(PyObject*); +static PyObject* PyMNNVar_iternext(PyObject*); #if PY_MAJOR_VERSION >= 3 static PyNumberMethods PyMNNVar_as_number = { PyMNNVar_add, /*nb_add*/ @@ -220,9 +224,9 @@ static PyNumberMethods PyMNNVar_as_number = { }; #endif static PyMappingMethods PyMNNVar_as_mapping = { - PyMNNVar_length, /*mp_length*/ - PyMNNVar_subscript, /*mp_subscript*/ - 0, /*mp_ass_subscript*/ + PyMNNVar_length, /*mp_length*/ + PyMNNVar_subscript, /*mp_subscript*/ + PyMNNVar_ass_subscript, /*mp_ass_subscript*/ }; PyObject *PyMNNVar_richcompare(PyObject *self, PyObject *other, int op); static PyTypeObject PyMNNVarType = { @@ -256,8 +260,8 @@ static PyTypeObject PyMNNVarType = { 0, /*tp_clear*/ &PyMNNVar_richcompare, /*tp_richcompare*/ 0, /*tp_weaklistoffset*/ - 0, /*tp_iter*/ - 0, /*tp_iternext*/ + &PyMNNVar_iter, /*tp_iter*/ + &PyMNNVar_iternext, /*tp_iternext*/ PyMNNVar_methods, /*tp_methods*/ 0, /*tp_members*/ PyMNNVar_getsetters, /*tp_getset*/ @@ -272,7 +276,7 @@ static PyTypeObject PyMNNVarType = { }; // helper functions static PyMNNVar* getVar() { - PyMNNVar *var = (PyMNNVar *)PyObject_Call((PyObject*)&PyMNNVarType, PyTuple_New(0), NULL); + PyMNNVar *var = (PyMNNVar *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNNVarType), PyTuple_New(0), NULL); var->var = new VARP; return var; } @@ -284,7 +288,7 @@ static PyObject* toPyObj(VARP var) { static bool isVar(PyObject* var) { return isInt(var) || isInts(var) || isFloat(var) || isFloats(var) || - PyObject_IsInstance(var, (PyObject*)&PyMNNVarType); + Py_TYPE(var) == PyType_FindTLSType(&PyMNNVarType); } static bool isVars(PyObject* var) { return isVec(var); @@ -353,21 +357,30 @@ std::pair toVarPair(PyObject* l, PyObject* r, bool fp = false) { PyObject *PyMNNVar_richcompare(PyObject *l, PyObject *r, int op) { auto lr = toVarPair(l, r); auto vl = lr.first, vr = lr.second; + VARP res; switch (op) { case Py_LT: - return toPyObj(Express::_Less(vl, vr)); + res = Express::_Less(vl, vr); + break; case Py_LE: - return toPyObj(Express::_LessEqual(vl, vr)); + res = Express::_LessEqual(vl, vr); + break; case Py_EQ: - return toPyObj(Express::_Equal(vl, vr)); + res = Express::_Equal(vl, vr); + break; case Py_NE: - return toPyObj(Express::_NotEqual(vl, vr)); + res = Express::_NotEqual(vl, vr); + break; case Py_GT: - return toPyObj(Express::_Greater(vl, vr)); + res = Express::_Greater(vl, vr); + break; case Py_GE: - return toPyObj(Express::_GreaterEqual(vl, vr)); + res = Express::_GreaterEqual(vl, vr); + break; + default: + Py_RETURN_NONE; } - Py_RETURN_NONE; + return toPyObj(res); } static PyObject* PyMNNVar_add(PyObject* l, PyObject* r) { auto lr = toVarPair(l, r); @@ -413,11 +426,10 @@ static Py_ssize_t PyMNNVar_length(PyObject* x) { } return size; } -static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) { - std::vector begin, end, strides; - int new_axis_mask = 0, shrink_axis_mask = 0, - begin_mask = 0, end_mask = 0, - ellipsis_mask = 0, index = 0; 
+
+static void dealSlice(PyObject* slice, std::vector<int>& begin, std::vector<int>& end, std::vector<int>& strides,
+                      int& new_axis_mask, int& shrink_axis_mask, int& begin_mask, int& end_mask, int& ellipsis_mask) {
+    int index = 0;
     auto dealItem = [&](PyObject* item) {
         if (PySlice_Check(item)) {
             Py_ssize_t startl = 0, stopl = 0, stepl = 1;
@@ -437,7 +449,7 @@ static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) {
             if ((step == 1 && start == 0) || (step == -1 && start == -1)) {
                 begin_mask |= (1 << index);
             }
-            if ((step == 1 && stop == -1) || (step == -1 && stop == 0)) {
+            if ((step == 1 && stop == -1) || (step == -1 && stop == 0) || PY_SSIZE_T_MAX == stopl) {
                 end_mask |= (1 << index);
             }
         }
@@ -471,16 +483,136 @@ static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) {
     } else {
         dealItem(slice);
     }
+}
+static inline bool isIdx(PyObject* slice) {
+    return Py_TYPE(slice) == PyType_FindTLSType(&PyMNNVarType) || (PyList_Check(slice) && isInts(slice));
+}
+static bool isBoolIdx(VARP idx, int reqSize) {
+    auto size = idx->getInfo()->size;
+    bool isbool = (size == reqSize);
+    if (isbool) {
+        auto ptr = idx->readMap<int>();
+        for (int i = 0; i < size; i++) {
+            if (ptr[i] != 0 && ptr[i] != 1) {
+                return false;
+            }
+        }
+    }
+    return isbool;
+}
+static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) {
+    // gather: 1. 0-1 gather; 2. idx gather;
+    if (isIdx(slice)) {
+        auto val = toVar(x);
+        auto idx = toVar(slice);
+        if (val->getInfo()->size > 1 && isBoolIdx(idx, val->getInfo()->size)) {
+            // 0-1 gather -> idx gather
+            idx = Express::_Where(idx);
+            val = Express::_GatherND(val, idx);
+            val = Express::_Reshape(val, {-1});
+            return toPyObj(val);
+        }
+        auto r = Express::_Gather(val, idx);
+        r->readMap<void>();
+        return toPyObj(r);
+    }
+
+    std::vector<int> begin, end, strides;
+    int new_axis_mask = 0, shrink_axis_mask = 0, begin_mask = 0, end_mask = 0, ellipsis_mask = 0;
+    dealSlice(slice, begin, end, strides, new_axis_mask, shrink_axis_mask, begin_mask, end_mask, ellipsis_mask);
     int size_ = static_cast<int>(begin.size());
     auto begin_ = Express::_Const(begin.data(), {size_}, NHWC, halide_type_of<int>());
     auto end_ = Express::_Const(end.data(), {size_}, NHWC, halide_type_of<int>());
     auto strides_ = Express::_Const(strides.data(), {size_}, NHWC, halide_type_of<int>());
-    return toPyObj(Express::_StridedSlice(toVar(x), begin_, end_, strides_, begin_mask, end_mask,
-                                          ellipsis_mask, new_axis_mask, shrink_axis_mask));
+    auto res = Express::_StridedSlice(toVar(x), begin_, end_, strides_, begin_mask, end_mask,
+                                      ellipsis_mask, new_axis_mask, shrink_axis_mask);
+    auto info = res->getInfo();
+    if (!info) {
+        PyMNN_ERROR("subscript: unable to get variable info");
+    }
+    // to scalar
+    if (info->dim.empty()) {
+        auto dtype = info->type;
+        if (dtype == halide_type_of<int>()) {
+            return toPyObj(res->readMap<int>()[0]);
+        }
+        if (dtype == halide_type_of<float>()) {
+            return toPyObj(res->readMap<float>()[0]);
+        }
+        if (dtype == halide_type_of<uint8_t>()) {
+            return toPyObj(res->readMap<uint8_t>()[0]);
+        }
+        if (dtype == halide_type_of<double>()) {
+            return toPyObj((float)res->readMap<double>()[0]);
+        }
+    }
+    return toPyObj(res);
+}
+
+static int PyMNNVar_ass_subscript(PyObject* x, PyObject* slice, PyObject* y) {
+    if (!isVar(x) || !isVar(y)) {
+        PyMNN_ERROR_LOG("ass_subscript require args: (Var, int/Var, int/float/Var)");
+        return -1;
+    }
+    auto var = toVar(x);
+    auto val = toVar(y);
+    auto varInfo = var->getInfo();
+    if (isIdx(slice)) {
+        auto idx = toVar(slice);
+        if (isBoolIdx(idx, varInfo->size)) {
+            idx = Express::_Where(idx);
+        }
+        auto idxDim = idx->getInfo()->dim;
+        int scatterNum = idxDim[0], scatterDim = 1;
+        if (idxDim.size() < 2) {
+            idx = Express::_Unsqueeze(idx, {-1});
+        } else {
+            scatterDim = idxDim[1];
+        }
+        // val broadcast_to [scatterNum, (scatterDim < varDim.size() ? varDim[scatterDim:] : 1)]
+        auto varDim = varInfo->dim;
+        std::vector<int> valDim(1, scatterNum);
+        if (scatterDim >= varDim.size()) {
+            valDim.push_back(1);
+        } else {
+            for (int i = scatterDim; i < varDim.size(); i++) {
+                valDim.push_back(varDim[i]);
+            }
+        }
+        val = Express::_BroadcastTo(val, _Const(valDim.data(), {static_cast<int>(valDim.size())}, NCHW, halide_type_of<int>()));
+        *(((PyMNNVar*)x)->var) = Express::_ScatterNd(idx, val, Express::_Shape(var), var);
+        return 0;
+    }
+    std::vector<int> begin, end, strides;
+    int new_axis_mask = 0, shrink_axis_mask = 0, begin_mask = 0, end_mask = 0, ellipsis_mask = 0;
+    dealSlice(slice, begin, end, strides, new_axis_mask, shrink_axis_mask, begin_mask, end_mask, ellipsis_mask);
+    int size_ = static_cast<int>(begin.size());
+    auto begin_ = Express::_Const(begin.data(), {size_}, NHWC, halide_type_of<int>());
+    auto end_ = Express::_Const(end.data(), {size_}, NHWC, halide_type_of<int>());
+    auto strides_ = Express::_Const(strides.data(), {size_}, NHWC, halide_type_of<int>());
+    *(((PyMNNVar*)x)->var) = Express::_StridedSliceWrite(var, begin_, end_, strides_, val, begin_mask, end_mask,
+                                                         ellipsis_mask, new_axis_mask, shrink_axis_mask);
+    return 0;
+}
+static PyObject* PyMNNVar_iter(PyObject *self) {
+    auto var = toVar(self);
+    if (var->getInfo()->dim.empty()) {
+        PyMNN_ERROR("iteration over a 0-d array");
+    }
+    Py_INCREF(self);
+    return self;
+}
+static PyObject* PyMNNVar_iternext(PyObject *self) {
+    auto idx = ((PyMNNVar*)self)->iter_index++;
+    auto var = toVar(self);
+    auto count = var->getInfo()->dim[0];
+    if (idx >= count) return NULL;
+    return toPyObj(Express::_Gather(var, Express::_Scalar<int>(idx)));
+}
 // PyMNNVar basic functions impl
 static PyObject* PyMNNVar_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
     PyMNNVar* self = (PyMNNVar *)type->tp_alloc(type, 0);
+    self->iter_index = 0;
     self->var = nullptr;
     return (PyObject*)self;
 }
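Taken together, PyMNNVar_subscript, PyMNNVar_ass_subscript, and the iterator hooks above give Var numpy-style indexing from Python. A short sketch of the new behavior, mirroring the unit tests later in this patch:

    import MNN.numpy as mp

    x = mp.array([[1, 0, 3], [0, 6, 5]])
    v = x[0, 1]            # scalar indexing now returns a plain Python number
    m = x[x > 2]           # boolean-mask gather (the 0-1 gather path)
    g = x[mp.array([1])]   # integer index gather
    x[x > 2] = 0           # masked write lowers to Where + ScatterNd
    for row in x:          # iteration gathers along the first axis
        print(row)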
@@ -505,7 +637,7 @@ static PyObject* PyMNNVar_getshape(PyMNNVar *self, void *closure) {
     if (self->var) {
         auto info = (*(self->var))->getInfo();
         if(nullptr == info) {
-            PyMNN_ERROR("unable to get variable info");
+            PyMNN_ERROR("getshape: unable to get variable info");
         }
         shape = toPyObj(info->dim);
     }
@@ -524,7 +656,7 @@ static PyObject* PyMNNVar_getdata_format(PyMNNVar *self, void *closure) {
     if (self->var) {
         auto info = (*(self->var))->getInfo();
         if(nullptr == info) {
-            PyMNN_ERROR("unable to get variable info");
+            PyMNN_ERROR("getdata_format: unable to get variable info");
         }
         return toPyObj(info->order);
     }
@@ -534,7 +666,7 @@ static PyObject* PyMNNVar_getdtype(PyMNNVar *self, void *closure) {
     if (self->var) {
         auto info = (*(self->var))->getInfo();
         if(nullptr == info) {
-            PyMNN_ERROR("unable to get variable info");
+            PyMNN_ERROR("getdtype: unable to get variable info");
         }
         return toPyObj(htype2dtype(info->type));
     }
@@ -544,7 +676,7 @@ static PyObject* PyMNNVar_getsize(PyMNNVar *self, void *closure) {
     if (self->var) {
         auto info = (*(self->var))->getInfo();
         if(nullptr == info) {
-            PyMNN_ERROR("unable to get variable info");
+            PyMNN_ERROR("getsize: unable to get variable info");
         }
         return toPyObj(info->size);
     }
@@ -564,7 +696,7 @@ static PyObject* PyMNNVar_getndim(PyMNNVar *self, void *closure) {
     PyObject *ndim = NULL;
     if (self->var) {
         auto info = (*(self->var))->getInfo();
         if(nullptr == info) {
-            PyMNN_ERROR("unable to get variable info");
+            PyMNN_ERROR("getndim: unable to get variable info");
         }
         ndim = toPyObj((int)info->dim.size());
     }
@@ -685,13 +817,16 @@ static PyObject* PyMNNVar_resize(PyMNNVar *self, PyObject *args) {
 static PyObject* PyMNNVar_read(PyMNNVar *self, PyObject *args) {
     auto info = (*(self->var))->getInfo();
     if(nullptr == info) {
-        PyMNN_ERROR("unable to get variable info");
+        PyMNN_ERROR("read: unable to get variable info");
     }
     auto dtype = htype2dtype(info->type);
     auto shape = info->dim;
     int64_t total_length = info->size;
     auto readptr = [self](DType dtype, INTS shape, int64_t total_length) {
         void *dataPtr = (void *) (*(self->var))->readMap<void>();
+        if (nullptr == dataPtr) {
+            PyMNN_ERROR("call to readMap failed");
+        }
         std::vector<npy_intp> npy_dims;
         for(const auto dim : shape) {
             npy_dims.push_back(dim);
@@ -710,9 +845,6 @@ static PyObject* PyMNNVar_read(PyMNNVar *self, PyObject *args) {
             default:
                 PyMNN_ERROR("does not support this dtype");
         }
-        if (nullptr == dataPtr) {
-            PyMNN_ERROR("call to readMap meet a error");
-        }
     };
     auto data = readptr(dtype, shape, total_length);
     (*(self->var))->unMap();
@@ -722,13 +854,16 @@ static PyObject* PyMNNVar_read_as_tuple(PyMNNVar *self, PyObject *args) {
     auto info = (*(self->var))->getInfo();
     if(nullptr == info) {
-        PyMNN_ERROR("unable to get variable info");
+        PyMNN_ERROR("read_as_tuple: unable to get variable info");
     }
     auto dtype = htype2dtype(info->type);
     auto shape = info->dim;
     size_t total_length = info->size;
     auto readptr = [self](DType dtype, INTS shape, size_t total_length) {
         void *dataPtr = (void *) (*(self->var))->readMap<void>();
+        if (nullptr == dataPtr) {
+            PyMNN_ERROR("call to readMap failed");
+        }
         auto obj = PyTuple_New(total_length);
         if(DType_FLOAT == dtype) {
             auto data = (float*)dataPtr;
@@ -766,7 +901,7 @@ static PyObject* PyMNNVar_write(PyMNNVar *self, PyObject *args) {
     }
     auto info = (*(self->var))->getInfo();
     if(nullptr == info) {
-        PyMNN_ERROR("unable to get variable info");
+        PyMNN_ERROR("write: unable to get variable info");
     }
     auto dtype = htype2dtype(info->type);
     int64_t total_length = info->size;
@@ -1042,11 +1177,15 @@ static PyObject* PyMNNExpr_const(PyObject *self, PyObject *args, PyObject *kwarg
             total_length *= shape[i];
         }
     }
-    auto data = toPtr(value, dtype, total_length);
     auto ret = getVar();
-    if(data) {
-        *(ret->var) = _Const((const void*)data, shape, data_format, dtype2htype(dtype));
-        free(data);
+    if (total_length > 0) {
+        auto data = toPtr(value, dtype, total_length);
+        if(data) {
+            *(ret->var) = _Const((const void*)data, shape, data_format, dtype2htype(dtype));
+            free(data);
+        }
+    } else {
+        *(ret->var) = _Const(nullptr, shape, data_format, dtype2htype(dtype));
     }
     return (PyObject *)ret;
 }
@@ -1332,6 +1471,32 @@ static PyObject* PyMNNExpr_randomuniform(PyObject *self, PyObject *args) {
     }
     PyMNN_ERROR("randomuniform require args: (Var, dtype, |float, float, int, int)");
 }
+static PyObject* PyMNNExpr_sort(PyObject *self, PyObject *args) {
+    PyObject *x;
+    int axis = -1, arg = 0, descend = 0;
+    if (PyArg_ParseTuple(args, "O|iii", &x, &axis, &arg, &descend) && isVar(x)) {
+        return toPyObj(Express::_Sort(toVar(x), axis, arg, descend));
+    }
+    PyMNN_ERROR("sort require args: (Var, |int, bool, bool)");
+}
+static PyObject* PyMNNExpr_raster(PyObject *self, PyObject *args) {
+    PyObject *var, *region, *shape;
+    if (PyArg_ParseTuple(args, "OOO", &var, &region, &shape) &&
+        isVars(var) && isInts(region) && isInts(shape)) {
+        return toPyObj(Express::_Raster(toVars(var), toInts(region), toInts(shape)));
+    }
+    PyMNN_ERROR("raster require args: ([Var], [int], [int])");
+}
+static PyObject* PyMNNExpr_nms(PyObject *self, PyObject *args) {
+    PyObject *boxes, *scores;
+    int max_detections;
+    float iou_threshold = -1.0, score_threshold = -1.0;
+    if (PyArg_ParseTuple(args, "OOi|ff", &boxes, &scores, &max_detections, &iou_threshold, &score_threshold) &&
+        isVar(boxes) && isVar(scores)) {
+        return toPyObj(Express::_Nms(toVar(boxes), toVar(scores), max_detections, iou_threshold, score_threshold));
+    }
+    PyMNN_ERROR("nms require args: (Var, Var, int, |float, float)");
+}
 static PyObject* PyMNNExpr_detection_post_process(PyObject *self, PyObject *args) {
     PyObject *encode_boxes, *class_predictions, *anchors, *centersize_encoding;
     int num_classes, max_detections, max_class_per_detection, detections_per_class;
@@ -1508,6 +1673,9 @@ static PyMethodDef PyMNNExpr_methods[] = {
     zeros_like, "build zeros_like expr",
     unstack, "build unstack expr",
     range, "build range expr",
+    sort, "build sort expr",
+    raster, "build raster expr",
+    nms, "build nms expr",
     detection_post_process, "build detection_post_process expr"
     )
 };
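Before the nn.h diff, a quick sketch of the expression-level ops the hunks above register. sort defaults to an ascending sort (axis=-1, arg=False, descend=False), and nms takes boxes plus scores and, per the Express::_Nms call, yields the indices kept after suppression (box and score values here are made up for illustration):

    import MNN.expr as F
    import MNN.numpy as mp

    print(F.sort(mp.array([5, -1, 2, 0])))      # -> [-1, 0, 2, 5]

    boxes  = mp.array([[0., 0., 10., 10.],
                       [1., 1., 11., 11.],
                       [20., 20., 30., 30.]])
    scores = mp.array([0.9, 0.8, 0.7])
    keep = F.nms(boxes, scores, 2, 0.5)         # max_detections=2, iou_threshold=0.5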
[int], [int])"); +} +static PyObject* PyMNNExpr_nms(PyObject *self, PyObject *args) { + PyObject *boxes, *scores; + int max_detections; + float iou_threshold = -1.0, score_threshold = -1.0; + if (PyArg_ParseTuple(args, "OOi|ff", &boxes, &scores, &max_detections, &iou_threshold, &score_threshold) && + isVar(boxes) && isVar(scores)) { + return toPyObj(Express::_Nms(toVar(boxes), toVar(scores), max_detections, iou_threshold, score_threshold)); + } + PyMNN_ERROR("nms require args: (Var, Var, |float, float)"); +} static PyObject* PyMNNExpr_detection_post_process(PyObject *self, PyObject *args) { PyObject *encode_boxes, *class_predictions, *anchors, *centersize_encoding; int num_classes, max_detections, max_class_per_detection, detections_per_class; @@ -1508,6 +1673,9 @@ static PyMethodDef PyMNNExpr_methods[] = { zeros_like, "build zeros_like expr", unstack, "build unstack expr", range, "build range expr", + sort, "build sort expr", + raster, "build raster expr", + nms, "build nms expr", detection_post_process, "build detection_post_process expr" ) }; diff --git a/pymnn/src/nn.h b/pymnn/src/nn.h index 8efb45e4..1248c8d3 100644 --- a/pymnn/src/nn.h +++ b/pymnn/src/nn.h @@ -1,4 +1,10 @@ #include "util.h" +#ifdef PYMNN_INTERNAL_SERVING +#include +#include +#include "internal/monitor_service.h" +#include "internal/verify_service.h" +#endif // NN Module Start def_class_start(_Module, Module) @@ -19,6 +25,37 @@ def_class_methods(_Module, _add_parameter, "add parameter" ) def_class_end(_Module, Module) + +static PyObject* load_module(PyObject *inputs, PyObject *outputs, PyObject *backend, PyObject *memory_mode, + PyObject *power_mode, PyObject *precision_mode, const char* file_name, int dynamic, + int shape_mutable, int rearrange, int thread_num) { + + BackendConfig backend_config; + backend_config.memory = toEnum(memory_mode); + backend_config.power = toEnum(power_mode); + backend_config.precision = toEnum(precision_mode); + + Module::BackendInfo backend_info; + backend_info.type = toEnum(backend); + backend_info.config = &backend_config; + + Module::Config config; + config.dynamic = dynamic; + config.shapeMutable = shape_mutable; + config.rearrange = rearrange; + config.backend = &backend_info; + + auto converted_file_name = convertBytesEncodeIfNeed(file_name); + auto m_ptr = Module::load(toStrings(inputs), toStrings(outputs), converted_file_name.data(), &config); + if (m_ptr == nullptr) { + std::string mnn_errno = "load_module_from_file failed "; + mnn_errno = mnn_errno + std::string(file_name); + PyErr_SetString(PyExc_Exception, mnn_errno.c_str()); + } + + return toPyObj(m_ptr); +} + static PyObject* PyMNN_Module_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { PyMNN_Module *self = (PyMNN_Module *)type->tp_alloc(type, 0); self->ptr = Module::createEmpty({}); @@ -50,10 +87,31 @@ static PyObject* PyMNN_Module_forward(PyMNN_Module *self, PyObject *args) { Py_RETURN_NONE; } if (isVars(input)) { +#ifdef PYMNN_INTERNAL_SERVING + int status = 0; + Timer timer; + auto vars = self->ptr->onForward(toVars(input)); + if (vars.empty()) { + PyMNN_ERROR("module onForward occur error."); + status = -1; + } + + (void) MonitorService::GetInstance().EventTrack(self->ptr, timer, status, "PyMNN_Module_forward"); + return toPyObj(vars); +#else return toPyObj(self->ptr->onForward(toVars(input))); +#endif } if (isVar(input)) { +#ifdef PYMNN_INTERNAL_SERVING + int status = 0; + Timer timer; + auto var = self->ptr->forward(toVar(input)); + (void) MonitorService::GetInstance().EventTrack(self->ptr, timer, status, 
"PyMNN_Module_forward"); + return toPyObj(var); +#else return toPyObj(self->ptr->forward(toVar(input))); +#endif } PyMNN_ERROR("PyMNN_Module_forward: args must be Var/[Var]."); } @@ -62,8 +120,22 @@ static PyObject* PyMNN_Module_onForward(PyMNN_Module *self, PyObject *args) { if (!PyArg_ParseTuple(args, "O", &inputs)) { Py_RETURN_NONE; } +#ifdef PYMNN_INTERNAL_SERVING + int status = 0; + Timer timer; + auto vars = self->ptr->onForward(toVars(inputs)); + if (vars.empty()) { + PyMNN_ERROR("module onForward occur error."); + status = -1; + } + + (void) MonitorService::GetInstance().EventTrack(self->ptr, timer, status, "PyMNN_Module_onForward"); + return toPyObj(vars); +#else return toPyObj(self->ptr->onForward(toVars(inputs))); +#endif } + static PyObject* PyMNN_Module_set_name(PyMNN_Module *self, PyObject *args) { const char* name; if (!PyArg_ParseTuple(args, "s", &name)) { @@ -125,6 +197,11 @@ static PyObject* PyMNNNN_load_module(PyObject *self, PyObject *args) { return toPyObj(m); } static PyObject* PyMNNNN_load_module_from_file(PyObject *self, PyObject *args) { +#ifdef PYMNN_INTERNAL_SERVING + PyErr_SetString(PyExc_Exception, + "PyMNNNN_load_module_from_file: unsupported interface, should use load_module_from_file_with_token."); + return NULL; +#endif PyObject *inputs, *outputs, *backend, *memory_mode, *power_mode, *precision_mode; const char* file_name; int dynamic, shape_mutable, rearrange; @@ -135,30 +212,54 @@ static PyObject* PyMNNNN_load_module_from_file(PyObject *self, PyObject *args) { printf("PyArg_ParseTuple Error\n"); return NULL; } - BackendConfig backend_config; - backend_config.memory = toEnum(memory_mode); - backend_config.power = toEnum(power_mode); - backend_config.precision = toEnum(precision_mode); - Module::BackendInfo backend_info; - backend_info.type = toEnum(backend); - backend_info.config = &backend_config; - - Module::Config config; - config.dynamic = dynamic; - config.shapeMutable = shape_mutable; - config.rearrange = rearrange; - config.backend = &backend_info; - - auto converted_file_name = convertBytesEncodeIfNeed(file_name); - auto m_ptr = Module::load(toStrings(inputs), toStrings(outputs), converted_file_name.data(), &config); - if (m_ptr == nullptr) { - std::string mnn_errno = "load_module_from_file failed "; - mnn_errno = mnn_errno + std::string(file_name); - PyErr_SetString(PyExc_Exception, mnn_errno.c_str()); - } - return toPyObj(m_ptr); + return load_module(inputs, outputs, backend, memory_mode, power_mode, precision_mode, file_name, dynamic, + shape_mutable, rearrange, thread_num); } + +#ifdef PYMNN_INTERNAL_SERVING +static PyObject* PyMNNNN_load_module_from_file_with_token(PyObject *self, PyObject *args) { + PyObject *inputs, *outputs; + const char* file_name; + PyObject *backend = toPyObj(MNN_FORWARD_CPU); + PyObject *memory_mode = toPyObj(MemoryMode::Memory_Normal); + PyObject *power_mode = toPyObj(PowerMode::Power_Normal);; + PyObject *precision_mode = toPyObj(PrecisionMode::Precision_Normal);; + int dynamic = 0; + int shape_mutable = 0; + int rearrange = 0; + char *token = NULL; + char *scene = NULL; + char *app_key = NULL; + int thread_num = 1; + if (!PyArg_ParseTuple(args, "OOssss|iiiOOOOi", &inputs, &outputs, &file_name, &token, &scene, &app_key, &dynamic, + &shape_mutable, &rearrange, &backend, &memory_mode, &power_mode, &precision_mode, + &thread_num)) { + printf("PyArg_ParseTuple Error\n"); + return NULL; + } + + if (!token || !scene || !app_key) { + PyErr_SetString(PyExc_Exception, + "PyMNNNN_load_module_from_file_with_token: input invalid, 
token, scene or app_key is null."); + return NULL; + } + + MonitorService::GetInstance().Start(); + VerifyService::GetInstance().Start(); + bool ret = VerifyService::GetInstance().VerifyToken(std::string(token), std::string(scene), std::string(app_key)); + if (!ret) { + PyErr_SetString(PyExc_Exception, + "PyMNNNN_load_module_from_file_with_token: check token failed, return null module."); + return NULL; + } + + return load_module(inputs, outputs, backend, memory_mode, power_mode, precision_mode, file_name, dynamic, + shape_mutable, rearrange, thread_num); + +} +#endif + #ifdef PYMNN_TRAIN_API static PyObject* PyMNNNN_conv(PyObject *self, PyObject *args) { INTS default_1 = {1, 1}, default_0 = {0, 0}; @@ -221,10 +322,18 @@ static PyObject* PyMNNNN_dropout(PyObject *self, PyObject *args) { } #endif static PyMethodDef PyMNNNN_methods[] = { +#ifdef PYMNN_INTERNAL_SERVING + register_methods(NN, + load_module, "load_module([Var], [Var], bool)", + load_module_from_file_with_token, "load_module_from_file_with_token([string], [string], filename, bool, ...)", + load_module_from_file, "load_module_from_file([string], [string], filename, bool, ...)" + ) +#else register_methods(NN, load_module, "load_module([Var], [Var], bool)", load_module_from_file, "load_module_from_file([string], [string], filename, bool, ...)" ) +#endif #ifdef PYMNN_TRAIN_API register_methods(NN, conv, "conv Module", @@ -234,4 +343,4 @@ static PyMethodDef PyMNNNN_methods[] = { ) #endif }; -// NN Module End \ No newline at end of file +// NN Module End diff --git a/pymnn/src/util.h b/pymnn/src/util.h index c79db6db..5e3594ff 100644 --- a/pymnn/src/util.h +++ b/pymnn/src/util.h @@ -225,13 +225,16 @@ inline int getnpysize(int npy_type) { return 4; case NPY_DOUBLE: return 8; - case NPY_INT: - return 4; case NPY_INT64: return 8; case NPY_UINT8: return 1; default: + // NPY_INT(np.int) and NPY_INT32(np.int32) may be different enum on some platform + // use `if` instead of `switch case`(when NPY_INT is same as NPY_INT32, two same case value is not support) + if (npy_type == NPY_INT || npy_type == NPY_INT32) { + return 4; + } PyMNN_ERROR_LOG("does not support this npy_type"); return 0; } @@ -249,7 +252,7 @@ inline int getitemsize(int dtype, int npy_type) { } return 8; case DType_INT32: - if(npy_type != NPY_INT) { + if(npy_type != NPY_INT && npy_type != NPY_INT32) { PyMNN_ERROR_LOG("numpy type does not match"); } return 4; @@ -383,7 +386,7 @@ static bool isVec(PyObject* obj) { return Func(PyList_GetItem(obj, 0)); } else return true; } - return false; + return Func(obj); } static inline bool isInts(PyObject* obj) { return isInt(obj) || isVec(obj); @@ -438,6 +441,7 @@ static vector toVec(PyObject* obj) { } return values; } + values.push_back(Func(obj)); return values; } static inline std::vector toInts(PyObject* obj) { @@ -586,188 +590,185 @@ static void* toPtr(PyObject *obj, DType dtype, int64_t& total_length, void* data // just support COND = 0 or 1 #define arg_if(COND, THEN, ELSE) arg_concat(arg_if_, COND)(THEN, ELSE) #define expand_item_0(...) -#define expand_item_1(macro, context, key, value, ITEMS...) \ +#define expand_item_1(macro, context, key, value, ...) \ macro(context, key, value) -#define expand_item_2(macro, context, key, value, ITEMS...) \ +#define expand_item_2(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_1(macro, context, ITEMS) -#define expand_item_3(macro, context, key, value, ITEMS...) \ + expand_item_1(macro, context, __VA_ARGS__) +#define expand_item_3(macro, context, key, value, ...) 
\ macro(context, key, value) \ - expand_item_2(macro, context, ITEMS) -#define expand_item_4(macro, context, key, value, ITEMS...) \ + expand_item_2(macro, context, __VA_ARGS__) +#define expand_item_4(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_3(macro, context, ITEMS) -#define expand_item_5(macro, context, key, value, ITEMS...) \ + expand_item_3(macro, context, __VA_ARGS__) +#define expand_item_5(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_4(macro, context, ITEMS) -#define expand_item_6(macro, context, key, value, ITEMS...) \ + expand_item_4(macro, context, __VA_ARGS__) +#define expand_item_6(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_5(macro, context, ITEMS) -#define expand_item_7(macro, context, key, value, ITEMS...) \ + expand_item_5(macro, context, __VA_ARGS__) +#define expand_item_7(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_6(macro, context, ITEMS) -#define expand_item_8(macro, context, key, value, ITEMS...) \ + expand_item_6(macro, context, __VA_ARGS__) +#define expand_item_8(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_7(macro, context, ITEMS) -#define expand_item_9(macro, context, key, value, ITEMS...) \ + expand_item_7(macro, context, __VA_ARGS__) +#define expand_item_9(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_8(macro, context, ITEMS) -#define expand_item_10(macro, context, key, value, ITEMS...) \ + expand_item_8(macro, context, __VA_ARGS__) +#define expand_item_10(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_9(macro, context, ITEMS) -#define expand_item_11(macro, context, key, value, ITEMS...) \ + expand_item_9(macro, context, __VA_ARGS__) +#define expand_item_11(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_10(macro, context, ITEMS) -#define expand_item_12(macro, context, key, value, ITEMS...) \ + expand_item_10(macro, context, __VA_ARGS__) +#define expand_item_12(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_11(macro, context, ITEMS) -#define expand_item_13(macro, context, key, value, ITEMS...) \ + expand_item_11(macro, context, __VA_ARGS__) +#define expand_item_13(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_12(macro, context, ITEMS) -#define expand_item_14(macro, context, key, value, ITEMS...) \ + expand_item_12(macro, context, __VA_ARGS__) +#define expand_item_14(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_13(macro, context, ITEMS) -#define expand_item_15(macro, context, key, value, ITEMS...) \ + expand_item_13(macro, context, __VA_ARGS__) +#define expand_item_15(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_14(macro, context, ITEMS) -#define expand_item_16(macro, context, key, value, ITEMS...) \ + expand_item_14(macro, context, __VA_ARGS__) +#define expand_item_16(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_15(macro, context, ITEMS) -#define expand_item_17(macro, context, key, value, ITEMS...) \ + expand_item_15(macro, context, __VA_ARGS__) +#define expand_item_17(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_16(macro, context, ITEMS) -#define expand_item_18(macro, context, key, value, ITEMS...) \ + expand_item_16(macro, context, __VA_ARGS__) +#define expand_item_18(macro, context, key, value, ...) 
\ macro(context, key, value) \ - expand_item_17(macro, context, ITEMS) -#define expand_item_19(macro, context, key, value, ITEMS...) \ + expand_item_17(macro, context, __VA_ARGS__) +#define expand_item_19(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_18(macro, context, ITEMS) -#define expand_item_20(macro, context, key, value, ITEMS...) \ + expand_item_18(macro, context, __VA_ARGS__) +#define expand_item_20(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_19(macro, context, ITEMS) -#define expand_item_21(macro, context, key, value, ITEMS...) \ + expand_item_19(macro, context, __VA_ARGS__) +#define expand_item_21(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_20(macro, context, ITEMS) -#define expand_item_22(macro, context, key, value, ITEMS...) \ + expand_item_20(macro, context, __VA_ARGS__) +#define expand_item_22(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_21(macro, context, ITEMS) -#define expand_item_23(macro, context, key, value, ITEMS...) \ + expand_item_21(macro, context, __VA_ARGS__) +#define expand_item_23(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_22(macro, context, ITEMS) -#define expand_item_24(macro, context, key, value, ITEMS...) \ + expand_item_22(macro, context, __VA_ARGS__) +#define expand_item_24(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_23(macro, context, ITEMS) -#define expand_item_24(macro, context, key, value, ITEMS...) \ + expand_item_23(macro, context, __VA_ARGS__) +#define expand_item_25(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_23(macro, context, ITEMS) -#define expand_item_25(macro, context, key, value, ITEMS...) \ + expand_item_24(macro, context, __VA_ARGS__) +#define expand_item_26(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_24(macro, context, ITEMS) -#define expand_item_26(macro, context, key, value, ITEMS...) \ + expand_item_25(macro, context, __VA_ARGS__) +#define expand_item_27(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_25(macro, context, ITEMS) -#define expand_item_27(macro, context, key, value, ITEMS...) \ + expand_item_26(macro, context, __VA_ARGS__) +#define expand_item_28(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_26(macro, context, ITEMS) -#define expand_item_28(macro, context, key, value, ITEMS...) \ + expand_item_27(macro, context, __VA_ARGS__) +#define expand_item_29(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_27(macro, context, ITEMS) -#define expand_item_29(macro, context, key, value, ITEMS...) \ + expand_item_28(macro, context, __VA_ARGS__) +#define expand_item_30(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_28(macro, context, ITEMS) -#define expand_item_30(macro, context, key, value, ITEMS...) \ + expand_item_29(macro, context, __VA_ARGS__) +#define expand_item_31(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_29(macro, context, ITEMS) -#define expand_item_31(macro, context, key, value, ITEMS...) \ + expand_item_30(macro, context, __VA_ARGS__) +#define expand_item_32(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_30(macro, context, ITEMS) -#define expand_item_32(macro, context, key, value, ITEMS...) 
\ + expand_item_31(macro, context, __VA_ARGS__) +#define expand_item_33(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_31(macro, context, ITEMS) -#define expand_item_33(macro, context, key, value, ITEMS...) \ + expand_item_32(macro, context, __VA_ARGS__) +#define expand_item_34(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_32(macro, context, ITEMS) -#define expand_item_34(macro, context, key, value, ITEMS...) \ + expand_item_33(macro, context, __VA_ARGS__) +#define expand_item_35(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_33(macro, context, ITEMS) -#define expand_item_35(macro, context, key, value, ITEMS...) \ + expand_item_34(macro, context, __VA_ARGS__) +#define expand_item_36(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_34(macro, context, ITEMS) -#define expand_item_36(macro, context, key, value, ITEMS...) \ + expand_item_35(macro, context, __VA_ARGS__) +#define expand_item_37(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_35(macro, context, ITEMS) -#define expand_item_37(macro, context, key, value, ITEMS...) \ + expand_item_36(macro, context, __VA_ARGS__) +#define expand_item_38(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_36(macro, context, ITEMS) -#define expand_item_38(macro, context, key, value, ITEMS...) \ + expand_item_37(macro, context, __VA_ARGS__) +#define expand_item_39(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_37(macro, context, ITEMS) -#define expand_item_39(macro, context, key, value, ITEMS...) \ + expand_item_38(macro, context, __VA_ARGS__) +#define expand_item_40(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_38(macro, context, ITEMS) -#define expand_item_40(macro, context, key, value, ITEMS...) \ + expand_item_39(macro, context, __VA_ARGS__) +#define expand_item_41(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_39(macro, context, ITEMS) -#define expand_item_41(macro, context, key, value, ITEMS...) \ + expand_item_40(macro, context, __VA_ARGS__) +#define expand_item_42(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_40(macro, context, ITEMS) -#define expand_item_42(macro, context, key, value, ITEMS...) \ + expand_item_41(macro, context, __VA_ARGS__) +#define expand_item_43(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_41(macro, context, ITEMS) -#define expand_item_43(macro, context, key, value, ITEMS...) \ + expand_item_42(macro, context, __VA_ARGS__) +#define expand_item_44(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_42(macro, context, ITEMS) -#define expand_item_44(macro, context, key, value, ITEMS...) \ + expand_item_43(macro, context, __VA_ARGS__) +#define expand_item_45(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_43(macro, context, ITEMS) -#define expand_item_45(macro, context, key, value, ITEMS...) \ + expand_item_44(macro, context, __VA_ARGS__) +#define expand_item_46(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_44(macro, context, ITEMS) -#define expand_item_46(macro, context, key, value, ITEMS...) \ + expand_item_45(macro, context, __VA_ARGS__) +#define expand_item_47(macro, context, key, value, ...) 
\ macro(context, key, value) \ - expand_item_45(macro, context, ITEMS) -#define expand_item_47(macro, context, key, value, ITEMS...) \ + expand_item_46(macro, context, __VA_ARGS__) +#define expand_item_48(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_46(macro, context, ITEMS) -#define expand_item_48(macro, context, key, value, ITEMS...) \ + expand_item_47(macro, context, __VA_ARGS__) +#define expand_item_49(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_47(macro, context, ITEMS) -#define expand_item_49(macro, context, key, value, ITEMS...) \ + expand_item_48(macro, context, __VA_ARGS__) +#define expand_item_50(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_48(macro, context, ITEMS) -#define expand_item_50(macro, context, key, value, ITEMS...) \ + expand_item_49(macro, context, __VA_ARGS__) +#define expand_item_51(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_49(macro, context, ITEMS) -#define expand_item_51(macro, context, key, value, ITEMS...) \ + expand_item_50(macro, context, __VA_ARGS__) +#define expand_item_52(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_50(macro, context, ITEMS) -#define expand_item_52(macro, context, key, value, ITEMS...) \ + expand_item_51(macro, context, __VA_ARGS__) +#define expand_item_53(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_51(macro, context, ITEMS) -#define expand_item_53(macro, context, key, value, ITEMS...) \ + expand_item_52(macro, context, __VA_ARGS__) +#define expand_item_54(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_52(macro, context, ITEMS) -#define expand_item_54(macro, context, key, value, ITEMS...) \ + expand_item_53(macro, context, __VA_ARGS__) +#define expand_item_55(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_53(macro, context, ITEMS) -#define expand_item_55(macro, context, key, value, ITEMS...) \ + expand_item_54(macro, context, __VA_ARGS__) +#define expand_item_56(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_54(macro, context, ITEMS) -#define expand_item_56(macro, context, key, value, ITEMS...) \ + expand_item_55(macro, context, __VA_ARGS__) +#define expand_item_57(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_55(macro, context, ITEMS) -#define expand_item_57(macro, context, key, value, ITEMS...) \ + expand_item_56(macro, context, __VA_ARGS__) +#define expand_item_58(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_56(macro, context, ITEMS) -#define expand_item_58(macro, context, key, value, ITEMS...) \ + expand_item_57(macro, context, __VA_ARGS__) +#define expand_item_59(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_57(macro, context, ITEMS) -#define expand_item_59(macro, context, key, value, ITEMS...) \ + expand_item_58(macro, context, __VA_ARGS__) +#define expand_item_60(macro, context, key, value, ...) \ macro(context, key, value) \ - expand_item_58(macro, context, ITEMS) -#define expand_item_60(macro, context, key, value, ITEMS...) \ - macro(context, key, value) \ - expand_item_59(macro, context, ITEMS) + expand_item_59(macro, context, __VA_ARGS__) #define expand_items(macro, context, ...) 
\ arg_concat(expand_item_, arg_half_size(__VA_ARGS__))(macro, context, __VA_ARGS__) //------------------------ macro_utils end ------------------------- @@ -790,18 +791,6 @@ static PyObject* PyEnum_new(struct _typeobject *type, PyObject *args, PyObject * Py_hash_t PyEnum_hash(PyObject* x) { return static_cast(((PyMNNEnum*)x)->value); } -PyObject *PyEnum_richcompare(PyObject *self, PyObject *other, int op) { - int l = ((PyMNNEnum*)self)->value, r = ((PyMNNEnum*)other)->value; - switch (op) { - case Py_LT: return toPyObj(l < r); - case Py_LE: return toPyObj(l <= r); - case Py_EQ: return toPyObj(l == r); - case Py_NE: return toPyObj(l != r); - case Py_GT: return toPyObj(l > r); - case Py_GE: return toPyObj(l >= r); - } - Py_RETURN_NONE; -} static PyObject* toPyEnum(PyObject* type, int val) { auto args = PyTuple_New(1); PyTuple_SetItem((PyObject*)args, 0, PyLong_FromLong((long)val)); @@ -825,11 +814,11 @@ static T toEnum(PyObject* e) { PyObject_SetAttrString(scope, value, toPyObj(key)); \ PyDict_SetItemString(dict, value, toPyObj(key)); -#define def_enum_repr(NAME, ITEMS...) \ +#define def_enum_repr(NAME, ...) \ static PyObject* PyEnum_##NAME##_repr(PyObject *self) { \ std::string str = #NAME "."; \ std::map items = { \ - expand_items(declare_map_item, _, ITEMS) \ + expand_items(declare_map_item, _, __VA_ARGS__) \ }; \ int key = ((PyMNNEnum*)self)->value; \ auto iter = items.find(key); \ @@ -839,22 +828,23 @@ static PyObject* PyEnum_##NAME##_repr(PyObject *self) { \ #define def_enum_to(NAME, TYPE) \ static PyObject* toPyObj(TYPE value) { \ - return toPyEnum((PyObject*)&PyEnum_##NAME, static_cast(value)); \ + return toPyEnum((PyObject*)PyType_FindTLSType(&PyEnum_##NAME), static_cast(value)); \ } -#define def_enum_register(NAME, ITEMS...) \ +#define def_enum_register(NAME, ...) \ static void def_##NAME(PyObject *scope) { \ - if (PyType_Ready(&PyEnum_##NAME) < 0) { \ + if (PyType_Ready(PyType_FindTLSType(&PyEnum_##NAME)) < 0) { \ PyErr_SetString(PyExc_Exception, "init " #NAME ": PyType_Ready failed"); \ } \ - PyObject* self = (PyObject *)&PyEnum_##NAME; \ + PyObject* self = (PyObject *)PyType_FindTLSType(&PyEnum_##NAME); \ PyObject* dict = PyEnum_##NAME.tp_dict; \ PyModule_AddObject(scope, #NAME, self); \ - expand_items(register_item, NAME, ITEMS) \ + expand_items(register_item, NAME, __VA_ARGS__) \ } -#define def_enum(NAME, TYPE, ITEMS...) \ -def_enum_repr(NAME, ITEMS) \ +#define def_enum(NAME, TYPE, ...) 
\ +def_enum_repr(NAME, __VA_ARGS__) \ +PyObject *PyEnum_##NAME##richcompare(PyObject *self, PyObject *other, int op); \ static PyTypeObject PyEnum_##NAME = { \ PyVarObject_HEAD_INIT(NULL, 0) \ #NAME, /*tp_name*/\ @@ -879,7 +869,7 @@ static PyTypeObject PyEnum_##NAME = { \ "PyMNNEnum", /*tp_doc*/\ 0, /*tp_traverse*/\ 0, /*tp_clear*/\ - &PyEnum_richcompare, /*tp_richcompare*/\ + &PyEnum_##NAME##richcompare, /*tp_richcompare*/\ 0, /*tp_weaklistoffset*/\ 0, /*tp_iter*/\ 0, /*tp_iternext*/\ @@ -895,9 +885,22 @@ static PyTypeObject PyEnum_##NAME = { \ 0, /*tp_alloc*/\ PyEnum_new /*tp_new*/\ };\ -static inline bool is##NAME(PyObject* obj) { return PyObject_IsInstance(obj, (PyObject*)&PyEnum_##NAME); } \ +static inline bool is##NAME(PyObject* obj) { return Py_TYPE(obj) == PyType_FindTLSType(&PyEnum_##NAME); } \ +PyObject *PyEnum_##NAME##richcompare(PyObject *self, PyObject *other, int op) { \ + if (!is##NAME(other)) Py_RETURN_FALSE; \ + int l = ((PyMNNEnum*)self)->value, r = ((PyMNNEnum*)other)->value; \ + switch (op) { \ + case Py_LT: return toPyObj(l < r); \ + case Py_LE: return toPyObj(l <= r); \ + case Py_EQ: return toPyObj(l == r); \ + case Py_NE: return toPyObj(l != r); \ + case Py_GT: return toPyObj(l > r); \ + case Py_GE: return toPyObj(l >= r); \ + } \ + Py_RETURN_FALSE; \ +} \ def_enum_to(NAME, TYPE) \ -def_enum_register(NAME, ITEMS) +def_enum_register(NAME, __VA_ARGS__) // ------------------------ enum end -------------------------- // ------------------------ func start ------------------------ #define def_methods(MODULE, NAME) \ @@ -996,10 +999,10 @@ static PyObject* PyMNN##SCOPE##_##NAME(PyObject *self, PyObject *args) { \ #define def_class_register(NAME) \ static void def_##NAME(PyObject *scope) { \ - if (PyType_Ready(&PyMNN##NAME##Type) < 0) { \ + if (PyType_Ready(PyType_FindTLSType(&PyMNN##NAME##Type)) < 0) { \ PyErr_SetString(PyExc_Exception, "init" #NAME ": PyType_Ready PyMNN" #NAME "Type failed"); \ } \ - PyObject* self = (PyObject *)&PyMNN##NAME##Type; \ + PyObject* self = (PyObject *)PyType_FindTLSType(&PyMNN##NAME##Type); \ PyModule_AddObject(scope, #NAME, self); \ } @@ -1071,7 +1074,7 @@ static PyTypeObject PyMNN##NAME##Type = { \ };\ def_class_register(NAME) \ static PyMNN##NAME* get##NAME() { \ - return (PyMNN##NAME *)PyObject_Call((PyObject*)&PyMNN##NAME##Type, PyTuple_New(0), NULL); \ + return (PyMNN##NAME *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNN##NAME##Type), PyTuple_New(0), NULL); \ } \ static PyObject* toPyObj(TYPE* x) { \ auto ret = get##NAME(); \ diff --git a/pymnn/test/model_test.py b/pymnn/test/model_test.py index 78bd5e51..df939b8c 100644 --- a/pymnn/test/model_test.py +++ b/pymnn/test/model_test.py @@ -1,3 +1,4 @@ +# -*- coding: UTF-8 -*- import os import sys import MNN @@ -10,7 +11,11 @@ def parseConfig(root_dir): configName = os.path.join(root_dir, 'config.txt') if not os.path.exists(configName): return False - config = open(configName, 'rt') + try: + config = open(configName, 'rt', encoding='utf-8') + except: + import io + config = io.open(configName, 'rt', encoding='utf-8') res = {} res['model_name'] = os.path.join(root_dir, 'temp.bin') for line in config.readlines(): diff --git a/pymnn/test/unit_test.py b/pymnn/test/unit_test.py index 4d88657f..ac94db91 100644 --- a/pymnn/test/unit_test.py +++ b/pymnn/test/unit_test.py @@ -465,6 +465,14 @@ class UnitTest(unittest.TestCase): self.assertEqualVar(expr.range(start, limit, delta), np.arange(0.0, 2.0, 0.3)) def test_depth_to_space(self): self.assertEqualVar(expr.depth_to_space(self.x, 2), 
torch.pixel_shuffle(self._x, 2)) + def test_sort(self): + x = mp.array([5, -1, 2, 0]) + x_ = np.array([5, -1, 2, 0]) + self.assertEqualVar(expr.sort(x), np.sort(x_)) + def test_raster(self): + x = mp.array([[1, 2], [3, 4]]) + x_ = np.array([[1, 2], [3, 4]]) + self.assertEqualVar(expr.raster([x], [0, 1, 1, 2, 0, 1, 2, 1, 1, 2, 2], [2, 2]), x_.transpose()) def test_detection_post_process(self): pass # test cv @@ -643,6 +651,40 @@ class UnitTest(unittest.TestCase): x = cv.threshold(self.imgf, 50, 20, cv.THRESH_BINARY) y = cv2.threshold(self.imgf_, 50, 20, cv2.THRESH_BINARY)[1] self.assertEqualImg(x, y) + # draw + def test_Draw(self): + x = self.img.copy() + y = self.img_.copy() + # 1. arrowedLine + cv.arrowedLine(x, [10, 10], [40, 40], [255, 0, 0]) + cv2.arrowedLine(y, [10, 10], [40, 40], [255, 0, 0]) + # 2. line + cv.line(x, [20, 30], [50, 60], [0, 0, 255]) + cv2.line(y, [20, 30], [50, 60], [0, 0, 255]) + # 3. circle + cv.circle(x, [70, 70], 30, [0, 255, 0]) + cv2.circle(y, [70, 70], 30, [0, 255, 0]) + # 4. rectangle + cv.rectangle(x, [80, 80], [120, 120], [0, 0, 255]) + cv2.rectangle(y, [80, 80], [120, 120], [0, 0, 255]) + # get contours + y_ = cv2.cvtColor(y, cv2.COLOR_BGR2GRAY) + y_ = cv2.threshold(y_, 127, 255, cv2.THRESH_BINARY)[1] + c_, _ = cv2.findContours(y_, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + c = [] + for a in c_: + ps = [] + for b in a: + ps.append(int(b[0,0])) + ps.append(int(b[0,1])) + c.append(ps) + # 5. fillPoly + cv.fillPoly(x, c, [255, 0, 0]) + cv2.fillPoly(y, c_, [255, 0, 0]) + # 6. drawContours + cv.drawContours(x, c, -1, [0, 0, 255]) + cv2.drawContours(y, c_, -1, [0, 0, 255]) + self.assertEqualImg(x, y) # structural def test_Structural(self): x = mp.array([[0,0,0,0,0,0,0,0,0,0,0,0,0], @@ -661,17 +703,20 @@ class UnitTest(unittest.TestCase): contours_, _ = cv2.findContours(x_, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) contour = contours[0] contour_ = contours_[0] - self.assertEqualPoints(contour, contour_) + self.assertEqualVar(contour, contour_) self.assertEqual(cv.contourArea(contour), cv2.contourArea(contour_)) hull = cv.convexHull(contour) hull_ = cv2.convexHull(contour_) - self.assertEqualPoints(hull, hull_) + if version_info.major < 3: hull_ = np.concatenate([hull_[-1::, :], hull_[:-1,:]]) + self.assertEqualVar(hull, hull_) rect = cv.minAreaRect(contour) rect_ = cv2.minAreaRect(contour_) - self.assertEqual(rect, rect_) - points = cv.boxPoints(rect), + if version_info.major >= 3: + self.assertEqual(rect, rect_) + points = cv.boxPoints(rect) points_ = cv2.boxPoints(rect_) - self.assertEqualPoints(points, points_) + if version_info.major >= 3: + self.assertEqualVar(points, points_) self.assertEqual(tuple(cv.boundingRect(contour)), cv2.boundingRect(contour_)) ret, labels, statsv, centroids = cv.connectedComponentsWithStats(x) ret_, labels_, statsv_, centroids_ = cv2.connectedComponentsWithStats(x_) @@ -689,6 +734,16 @@ class UnitTest(unittest.TestCase): x = cv.hconcat([self.img, self.img]) y = cv2.hconcat([self.img_, self.img_]) self.assertEqualImg(x, y) + def test_rotate(self): + x = cv.rotate(self.img, cv.ROTATE_90_CLOCKWISE) + y = cv2.rotate(self.img_, cv2.ROTATE_90_CLOCKWISE) + self.assertEqualImg(x, y) + x = cv.rotate(self.img, cv.ROTATE_180) + y = cv2.rotate(self.img_, cv2.ROTATE_180) + self.assertEqualImg(x, y) + x = cv.rotate(self.img, cv.ROTATE_90_COUNTERCLOCKWISE) + y = cv2.rotate(self.img_, cv2.ROTATE_90_COUNTERCLOCKWISE) + self.assertEqualImg(x, y) # numpy def test_from_shape_or_value(self): x = mp.zeros([2, 2]) @@ -724,6 +779,9 @@ class 
UnitTest(unittest.TestCase): self.assertEqualVar(mp.linspace(2.0, 3.0, num=5, endpoint=False), np.linspace(2.0, 3.0, num=5, endpoint=False)) self.assertEqualVar(mp.logspace(2.0, 3.0, num=4, endpoint=False), np.logspace(2.0, 3.0, num=4, endpoint=False)) self.assertEqualVar(mp.geomspace(1, 1000, num=4, endpoint=False), np.geomspace(1, 1000, num=4, endpoint=False)) + x = mp.arange(-5, 5., 0.1) + y = np.arange(-5, 5., 0.1) + self.assertEqualVars(mp.meshgrid(x, x), np.meshgrid(y, y)) def test_changing_array_shape(self): x = mp.zeros((3, 2)) x_ = np.zeros((3, 2)) @@ -916,6 +974,11 @@ class UnitTest(unittest.TestCase): self.assertEqualShape(mp.random.randn(2,3).shape, np.random.randn(2,3).shape) self.assertEqualShape(mp.random.rand(3,2).shape, np.random.rand(3,2).shape) self.assertEqualShape(mp.random.randint(0, 2, [2,3]).shape, np.random.randint(0, 2, [2,3]).shape) + def test_sorting(self): + x = mp.array([[1,0,3], [0,6,5]]) + x_ = np.array([[1,0,3], [0,6,5]]) + self.assertEqualVar(mp.sort(x), np.sort(x_)) + self.assertEqualVar(mp.argsort(x), np.argsort(x_)) def test_searching_counting(self): x = mp.array([[1,0,3], [0,6,5]]) x_ = np.array([[1,0,3], [0,6,5]]) @@ -980,10 +1043,12 @@ class UnitTest(unittest.TestCase): self.assertAlmostEqual(x.var(), x_.var()) self.assertEqualVar(x.var(0), x_.var(0)) self.assertEqual(len(x), len(x_)) - self.assertEqual(x[0,1].read_as_tuple()[0], x_[0,1]) + self.assertEqual(x[0,1], x_[0,1]) self.assertEqualVar(x[0], x_[0]) self.assertEqualVar(x[:], x_[:]) self.assertEqualVar(x[:1], x_[:1]) self.assertEqualVar(x[::-1], x_[::-1]) + self.assertEqualVar(x[x > 2], x_[x_ > 2]) + self.assertEqualVar(x[mp.array([1])], x_[np.array([1])]) if __name__ == '__main__': unittest.main() diff --git a/schema/current/UserDefine_generated.h b/schema/current/UserDefine_generated.h index a072be54..2143b607 100644 --- a/schema/current/UserDefine_generated.h +++ b/schema/current/UserDefine_generated.h @@ -376,13 +376,15 @@ struct ImageProcessParamT : public flatbuffers::NativeTable { int8_t paddingValue; std::vector shape; DataType outputType; + bool draw; ImageProcessParamT() : filterType(FilterType_NEAREST), sourceFormat(ImageFormatType_RGBA), destFormat(ImageFormatType_RGBA), wrap(WrapType_CLAMP_TO_EDGE), paddingValue(0), - outputType(DataType_DT_INVALID) { + outputType(DataType_DT_INVALID), + draw(false) { } }; @@ -421,6 +423,9 @@ struct ImageProcessParam FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { DataType outputType() const { return static_cast(GetField(22, 0)); } + bool draw() const { + return GetField(24, 0) != 0; + } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyField(verifier, 4) && @@ -437,6 +442,7 @@ struct ImageProcessParam FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyOffset(verifier, 20) && verifier.VerifyVector(shape()) && VerifyField(verifier, 22) && + VerifyField(verifier, 24) && verifier.EndTable(); } ImageProcessParamT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -477,6 +483,9 @@ struct ImageProcessParamBuilder { void add_outputType(DataType outputType) { fbb_.AddElement(22, static_cast(outputType), 0); } + void add_draw(bool draw) { + fbb_.AddElement(24, static_cast(draw), 0); + } explicit ImageProcessParamBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -500,7 +509,8 @@ inline flatbuffers::Offset CreateImageProcessParam( flatbuffers::Offset> transform = 0, int8_t paddingValue = 0, flatbuffers::Offset> shape = 0, - 
DataType outputType = DataType_DT_INVALID) { + DataType outputType = DataType_DT_INVALID, + bool draw = false) { ImageProcessParamBuilder builder_(_fbb); builder_.add_outputType(outputType); builder_.add_shape(shape); @@ -509,6 +519,7 @@ inline flatbuffers::Offset CreateImageProcessParam( builder_.add_mean(mean); builder_.add_destFormat(destFormat); builder_.add_sourceFormat(sourceFormat); + builder_.add_draw(draw); builder_.add_paddingValue(paddingValue); builder_.add_wrap(wrap); builder_.add_filterType(filterType); @@ -597,6 +608,7 @@ inline void ImageProcessParam::UnPackTo(ImageProcessParamT *_o, const flatbuffer { auto _e = paddingValue(); _o->paddingValue = _e; }; { auto _e = shape(); if (_e) { _o->shape.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->shape[_i] = _e->Get(_i); } } }; { auto _e = outputType(); _o->outputType = _e; }; + { auto _e = draw(); _o->draw = _e; }; } inline flatbuffers::Offset ImageProcessParam::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ImageProcessParamT* _o, const flatbuffers::rehasher_function_t *_rehasher) { @@ -617,6 +629,7 @@ inline flatbuffers::Offset CreateImageProcessParam(flatbuffer auto _paddingValue = _o->paddingValue; auto _shape = _o->shape.size() ? _fbb.CreateVector(_o->shape) : 0; auto _outputType = _o->outputType; + auto _draw = _o->draw; return MNN::CreateImageProcessParam( _fbb, _filterType, @@ -628,7 +641,8 @@ inline flatbuffers::Offset CreateImageProcessParam(flatbuffer _transform, _paddingValue, _shape, - _outputType); + _outputType, + _draw); } inline const flatbuffers::TypeTable *SampleModeTypeTable() { @@ -803,7 +817,8 @@ inline const flatbuffers::TypeTable *ImageProcessParamTypeTable() { { flatbuffers::ET_FLOAT, 1, -1 }, { flatbuffers::ET_CHAR, 0, -1 }, { flatbuffers::ET_INT, 1, -1 }, - { flatbuffers::ET_INT, 0, 3 } + { flatbuffers::ET_INT, 0, 3 }, + { flatbuffers::ET_BOOL, 0, -1 } }; static const flatbuffers::TypeFunction type_refs[] = { FilterTypeTypeTable, @@ -821,10 +836,11 @@ inline const flatbuffers::TypeTable *ImageProcessParamTypeTable() { "transform", "paddingValue", "shape", - "outputType" + "outputType", + "draw" }; static const flatbuffers::TypeTable tt = { - flatbuffers::ST_TABLE, 10, type_codes, type_refs, nullptr, names + flatbuffers::ST_TABLE, 11, type_codes, type_refs, nullptr, names }; return &tt; } diff --git a/schema/default/UserDefine.fbs b/schema/default/UserDefine.fbs index 2b7a0ed5..f07737a5 100644 --- a/schema/default/UserDefine.fbs +++ b/schema/default/UserDefine.fbs @@ -62,4 +62,5 @@ table ImageProcessParam { paddingValue:byte = 0; shape:[int]; // shape: [N, C, H, W] outputType:DataType; + draw:bool = false; } diff --git a/source/backend/cpu/CPUBackend.cpp b/source/backend/cpu/CPUBackend.cpp index f4bf4059..cf01b44f 100644 --- a/source/backend/cpu/CPUBackend.cpp +++ b/source/backend/cpu/CPUBackend.cpp @@ -170,7 +170,7 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p mPrecisionMode = precision; mCoreFunctions = MNNGetCoreFunctions(); mInt8CoreFunctions = MNNGetInt8CoreFunctions(); - mCache = new CPUResizeCache(this); + mCache = new CPUResizeCache; } CPUBackend::~CPUBackend() { diff --git a/source/backend/cpu/CPUImageProcess.cpp b/source/backend/cpu/CPUImageProcess.cpp index ca30a42a..cdc0d9d9 100644 --- a/source/backend/cpu/CPUImageProcess.cpp +++ b/source/backend/cpu/CPUImageProcess.cpp @@ -87,6 +87,19 @@ BLITTER CPUImageProcess::choose(ImageFormatType source, ImageFormatType dest) { return nullptr; } +BLITTER 
CPUImageProcess::choose(int channelByteSize) { + switch (channelByteSize) { + case 4: + return MNNC4blitH; + case 3: + return MNNC3blitH; + case 1: + return MNNC1blitH; + default: + return nullptr; + } +} + SAMPLER CPUImageProcess::choose(ImageFormatType format, FilterType type, bool identity) { if (identity) { switch (format) { @@ -271,10 +284,21 @@ static std::pair _computeClip(CV::Point* points, int iw, int ih, const } ErrorCode CPUImageProcess::onResize(const std::vector &inputs, const std::vector &outputs) { - auto input = inputs[0], output = outputs[0]; - ih = input->height(); - iw = input->width(); - ic = input->channel(); + auto input = inputs[0]; + if (input->dimensions() == 3) { + ih = input->length(0); + iw = input->length(1); + ic = input->length(2); + } else { + ih = input->height(); + iw = input->width(); + ic = input->channel(); + } + if (draw) { + blitter = choose(ic * inputs[0]->getType().bytes()); + return NO_ERROR; + } + auto output = outputs[0]; oh = output->height(); ow = output->width(); oc = output->channel(); @@ -321,15 +345,37 @@ ErrorCode CPUImageProcess::onResize(const std::vector &inputs, const s ErrorCode CPUImageProcess::onExecute(const std::vector &inputs, const std::vector &outputs) { auto source = inputs[0]->host(); - auto dest = outputs[0]->host(); + void* dest = nullptr; CV::Point points[2]; - int tileCount = UP_DIV(ow, CACHE_SIZE); auto destBytes = dtype.bytes(); - for (int dy = 0; dy < oh; ++dy) { + int tileCount = UP_DIV(ow, CACHE_SIZE); + const int* regions = nullptr; + if (draw) { + // change input to output + dest = source; + oh = inputs[1]->length(0); + ow = iw; + oc = ic; + destBytes = inputs[0]->getType().bytes(); + // draw one + tileCount = 1; + // src is color + samplerDest = inputs[2]->host(); + // get region info ptr + regions = inputs[1]->host(); + } else { + dest = outputs[0]->host(); + } + for (int i = 0; i < oh; ++i) { + int dy = draw ? 
regions[3 * i] : i; auto dstY = (uint8_t*)dest + dy * destBytes * ow * oc; for (int tIndex = 0; tIndex < tileCount; ++tIndex) { int xStart = tIndex * CACHE_SIZE; int count = std::min(CACHE_SIZE, ow - xStart); + if (draw) { + xStart = regions[3 * i + 1]; + count = regions[3 * i + 2] - xStart + 1; + } auto dstStart = dstY + destBytes * oc * xStart; if (!blitFloat) { @@ -340,7 +386,7 @@ ErrorCode CPUImageProcess::onExecute(const std::vector &inputs, const } // Sample - { + if (!draw) { // Compute position points[0].fX = xStart; points[0].fY = dy; diff --git a/source/backend/cpu/CPUImageProcess.hpp b/source/backend/cpu/CPUImageProcess.hpp index 91071f8c..ea8349c5 100644 --- a/source/backend/cpu/CPUImageProcess.hpp +++ b/source/backend/cpu/CPUImageProcess.hpp @@ -23,6 +23,10 @@ typedef void (*SAMPLER)(const unsigned char* source, unsigned char* dest, CV::Po class CPUImageProcess : public Execution { public: CPUImageProcess(CV::ImageProcess::Config config, const CoreFunctions* coreFunctions) : Execution(nullptr), coreFunctions(coreFunctions) { + if (config.draw) { + draw = true; + return; + } filterType = (FilterType)config.filterType; wrap = (WrapType)config.wrap; sourceFormat = (ImageFormatType)config.sourceFormat; @@ -40,6 +44,11 @@ public: paddingValue = val; } CPUImageProcess(Backend *bn, const ImageProcessParam* process) : Execution(bn) { + coreFunctions = static_cast(backend())->functions(); + draw = process->draw(); + if (draw) { + return; + } filterType = process->filterType(); wrap = process->wrap(); sourceFormat = process->sourceFormat(); @@ -53,12 +62,12 @@ public: transform.set(i, process->transform()->Get(i)); } transform.invert(&transformInvert); - coreFunctions = static_cast(backend())->functions(); } virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; private: BLITTER choose(ImageFormatType source, ImageFormatType dest); + BLITTER choose(int channelByteSize); BLIT_FLOAT choose(ImageFormatType format, int dstBpp = 0); SAMPLER choose(ImageFormatType format, FilterType type, bool identity); private: @@ -78,6 +87,7 @@ private: std::unique_ptr samplerBuffer, blitBuffer; uint8_t* samplerDest = nullptr, *blitDest = nullptr; const CoreFunctions* coreFunctions = nullptr; + bool draw = false; }; }; // namespace MNN diff --git a/source/backend/cpu/CPUNonMaxSuppressionV2.cpp b/source/backend/cpu/CPUNonMaxSuppressionV2.cpp index b50e49a0..41992e6d 100644 --- a/source/backend/cpu/CPUNonMaxSuppressionV2.cpp +++ b/source/backend/cpu/CPUNonMaxSuppressionV2.cpp @@ -117,6 +117,9 @@ ErrorCode CPUNonMaxSuppressionV2::onExecute(const std::vector& inputs, const auto scores = inputs[1]->host(); NonMaxSuppressionSingleClasssImpl(inputs[0], scores, maxDetections, iouThreshold, scoreThreshold, &selected); std::copy_n(selected.begin(), selected.size(), outputs[0]->host()); + for (int i = selected.size(); i < outputs[0]->elementSize(); i++) { + outputs[0]->host()[i] = -1; + } return NO_ERROR; } diff --git a/source/backend/cpu/CPUResizeCache.hpp b/source/backend/cpu/CPUResizeCache.hpp index d8b4dad6..aff4523d 100644 --- a/source/backend/cpu/CPUResizeCache.hpp +++ b/source/backend/cpu/CPUResizeCache.hpp @@ -6,11 +6,11 @@ #include "MNN_generated.h" namespace MNN { -class CPUBackend; -class CPUResizeCache { +// FIXME: Move outside +class MNN_PUBLIC CPUResizeCache { public: - CPUResizeCache(const CPUBackend* backend) { - mBackend = backend; + CPUResizeCache() { + // Do nothing } ~ 
CPUResizeCache() {
        // Do nothing
@@ -21,7 +21,6 @@ public:
     void reset();
 private:
     std::map<std::pair<const Tensor*, MNN_DATA_FORMAT>, std::shared_ptr<Tensor>> mFormatCache;
-    const CPUBackend* mBackend;
 };
 }
diff --git a/source/backend/cpu/CPUScatterNd.cpp b/source/backend/cpu/CPUScatterNd.cpp
index 7cf11755..94ac0e49 100644
--- a/source/backend/cpu/CPUScatterNd.cpp
+++ b/source/backend/cpu/CPUScatterNd.cpp
@@ -45,7 +45,7 @@ void ScatterNdImpl(const Tensor* indices, const Tensor* updates, const Tensor* s
         }
         if (valid) {
             for (int k = 0; k < accNumber; ++k) {
-                outputPtr[pos + k] += updatesPtr[i * accNumber + k];
+                outputPtr[pos + k] = updatesPtr[i * accNumber + k];
             }
         }
     }
@@ -59,7 +59,12 @@ ErrorCode CPUScatterNd::onExecute(const std::vector& inputs, const std:
     const int outputSize = output->size();
     auto outputRawPtr = output->host<uint8_t>();
-    memset(outputRawPtr, 0, outputSize);
+    if (inputs.size() < 4) {
+        memset(outputRawPtr, 0, outputSize);
+    } else {
+        auto inputRawPtr = inputs[3]->host<uint8_t>();
+        memcpy(outputRawPtr, inputRawPtr, outputSize);
+    }
     auto updatesDataType = updates->getType();
     if (updatesDataType == halide_type_of<float>()) {
diff --git a/source/backend/cpu/compute/ImageProcessFunction.cpp b/source/backend/cpu/compute/ImageProcessFunction.cpp
index 206a1743..4fa30af3 100644
--- a/source/backend/cpu/compute/ImageProcessFunction.cpp
+++ b/source/backend/cpu/compute/ImageProcessFunction.cpp
@@ -1065,3 +1065,21 @@ void MNNSamplerNV12Nearest(const unsigned char* source, unsigned char* dest, MNN
     auto countC2 = ((count + 1) / 2);
     _swapUV(destUV, destUV, countC2);
 }
+
+void MNNC3blitH(const unsigned char* source, unsigned char* dest, size_t count) {
+    for (int i = 0; i < count; i++) {
+        memcpy(dest + 3 * i, source, 3);
+    }
+}
+
+void MNNC4blitH(const unsigned char* source, unsigned char* dest, size_t count) {
+    for (int i = 0; i < count; i++) {
+        memcpy(dest + 4 * i, source, 4);
+    }
+}
+
+void MNNC1blitH(const unsigned char* source, unsigned char* dest, size_t count) {
+    for (int i = 0; i < count; i++) {
+        memcpy(dest + i, source, 1);
+    }
+}
diff --git a/source/backend/cpu/compute/ImageProcessFunction.hpp b/source/backend/cpu/compute/ImageProcessFunction.hpp
index 13b54b7d..23c8d90a 100644
--- a/source/backend/cpu/compute/ImageProcessFunction.hpp
+++ b/source/backend/cpu/compute/ImageProcessFunction.hpp
@@ -132,4 +132,8 @@ void MNNSamplerNV12Copy(const unsigned char* source, unsigned char* dest, MNN::C
                         size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
 void MNNSamplerNV12Nearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
                         size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
+// draw blit
+void MNNC1blitH(const unsigned char* source, unsigned char* dest, size_t count);
+void MNNC3blitH(const unsigned char* source, unsigned char* dest, size_t count);
+void MNNC4blitH(const unsigned char* source, unsigned char* dest, size_t count);
 #endif /* ImageProcessFunction_hpp */
diff --git a/source/backend/cpu/x86_x64/CMakeLists.txt b/source/backend/cpu/x86_x64/CMakeLists.txt
index bfaa4efe..2ded7bdd 100644
--- a/source/backend/cpu/x86_x64/CMakeLists.txt
+++ b/source/backend/cpu/x86_x64/CMakeLists.txt
@@ -1,29 +1,72 @@
+# Process asm files on Windows, then substitute *.S with *.S.obj as the source files of add_library.
+# If the MNN_ASSEMBLER env var is not set, *.S files are ignored, which may reduce performance.
+set(EXTRA_OBJS "")
+IF(MSVC AND (DEFINED ENV{MNN_ASSEMBLER}) AND "${CMAKE_SIZEOF_VOID_P}" STREQUAL "8")
+    set(WIN_USE_ASM ON)
+ENDIF()
+message(STATUS "WIN_USE_ASM: ${WIN_USE_ASM}")
+function(process_asm TARGET_NAME FILE_SRCS)
+    if(NOT MSVC)
+        return()
+    endif()
+    set(FILE_DESTS "")
+    foreach(SRC ${${FILE_SRCS}})
+        get_filename_component(SRC_EXT ${SRC} EXT)
+        if(NOT ${SRC_EXT} STREQUAL ".S")
+            list(APPEND FILE_DESTS ${SRC})
+            continue()
+        elseif(NOT WIN_USE_ASM)
+            continue()
+        endif()
+        string(REPLACE ${CMAKE_CURRENT_SOURCE_DIR} "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${TARGET_NAME}.dir" DEST ${SRC})
+        add_custom_command(
+            OUTPUT ${DEST}.obj
+            # *.S -> *.S.i: preprocess (#define / #ifdef macros) with cl.exe
+            COMMAND "${CMAKE_C_COMPILER}" /DWIN32 /experimental:preprocessor /P /Fi"${DEST}.i" "${SRC}"
+            # *.S.i -> *.S.obj: assemble with the GNU assembler, which supports AT&T syntax
+            COMMAND "$ENV{MNN_ASSEMBLER}" -o "${DEST}.obj" "${DEST}.i"
+        )
+        list(APPEND EXTRA_OBJS ${DEST}.obj)
+    endforeach()
+    set(${FILE_SRCS} ${FILE_DESTS} PARENT_SCOPE)
+    set(EXTRA_OBJS ${EXTRA_OBJS} PARENT_SCOPE)
+endfunction()
+
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)|(i686)")
     message(STATUS "${CMAKE_SYSTEM_PROCESSOR}: Open SSE")
     target_compile_options(MNNCPU PRIVATE -DMNN_USE_SSE)
     option(MNN_AVX512_VNNI "Enable AVX512 VNNI" ON)
     FILE(GLOB MNN_X8664_SRC ${CMAKE_CURRENT_LIST_DIR}/*)
-    if (MSVC)
-        FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*.cpp)
-        FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*.cpp)
-    else()
-        FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*)
-        FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*)
-        message(STATUS "MNN_AVX512:${MNN_AVX512}")
-        if (MNN_AVX512)
-            FILE(GLOB MNN_AVX512_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/*)
-            SET(MNNAVX512_VNNI_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/GemmInt8_VNNI.cpp)
-            LIST(REMOVE_ITEM MNN_AVX512_SRC ${MNNAVX512_VNNI_SRC})
-            add_library(MNNAVX512 OBJECT ${MNN_AVX512_SRC})
-            target_compile_options(MNNAVX512 PRIVATE -DMNN_USE_SSE -DMNN_X86_USE_ASM -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma)
-            if (MNN_AVX512_VNNI)
-                target_compile_options(MNNAVX512 PRIVATE -DMNN_AVX512_VNNI)
-                add_library(MNNAVX512_VNNI OBJECT ${MNNAVX512_VNNI_SRC})
-                target_compile_options(MNNAVX512_VNNI PRIVATE -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512vnni -DMNN_AVX512_VNNI)
+    FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*)
+    FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*)
+    message(STATUS "MNN_AVX512:${MNN_AVX512}")
+    if (MNN_AVX512 AND ((NOT MSVC) OR WIN_USE_ASM))
+        FILE(GLOB MNN_AVX512_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/*)
+        SET(MNNAVX512_VNNI_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/GemmInt8_VNNI.cpp)
+        LIST(REMOVE_ITEM MNN_AVX512_SRC ${MNNAVX512_VNNI_SRC})
+        process_asm(MNNAVX512 MNN_AVX512_SRC)
+        add_library(MNNAVX512 OBJECT ${MNN_AVX512_SRC})
+        target_compile_options(MNNAVX512 PRIVATE -DMNN_USE_SSE -DMNN_X86_USE_ASM)
+        if (MSVC)
+            target_compile_options(MNNAVX512 PRIVATE /arch:AVX512)
+        else()
+            target_compile_options(MNNAVX512 PRIVATE -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma)
+        endif()
+        if (MNN_AVX512_VNNI)
+            target_compile_options(MNNAVX512 PRIVATE -DMNN_AVX512_VNNI)
+            add_library(MNNAVX512_VNNI OBJECT ${MNNAVX512_VNNI_SRC})
+            target_compile_options(MNNAVX512_VNNI PRIVATE -DMNN_AVX512_VNNI)
+            if (MSVC)
+                target_compile_options(MNNAVX512_VNNI PRIVATE /arch:AVX512)
+            else()
+                target_compile_options(MNNAVX512_VNNI PRIVATE -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512vnni)
             endif()
         endif()
     endif()
     FILE(GLOB MNN_SSE_SRC ${CMAKE_CURRENT_LIST_DIR}/sse/*)
+    process_asm(MNNAVX MNN_AVX_SRC)
+    process_asm(MNNAVXFMA MNN_AVXFMA_SRC)
+    process_asm(MNNSSE MNN_SSE_SRC)
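+# Usage sketch (the paths below are hypothetical examples, not shipped defaults): the asm
+# path above only activates for 64-bit MSVC builds when MNN_ASSEMBLER points at a GNU
+# assembler that accepts AT&T syntax, e.g. in PowerShell before configuring:
+#   $env:MNN_ASSEMBLER = "C:/msys64/mingw64/bin/as.exe"
+#   cmake .. -G "Visual Studio 16 2019" -A x64 -DMNN_AVX512=ON
+# process_asm() then preprocesses each *.S with cl.exe and assembles the result into
+# CMakeFiles/<target>.dir/*.S.obj, which is linked in through EXTRA_OBJS.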
add_library(MNNX8664 OBJECT ${MNN_X8664_SRC}) add_library(MNNAVX OBJECT ${MNN_AVX_SRC}) add_library(MNNAVXFMA OBJECT ${MNN_AVXFMA_SRC}) @@ -34,7 +77,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64) target_compile_options(MNNAVXFMA PRIVATE -DMNN_USE_SSE) if(MSVC) target_compile_options(MNNAVX PRIVATE /arch:AVX) - target_compile_options(MNNAVXFMA PRIVATE /arch:AVX) + target_compile_options(MNNAVXFMA PRIVATE /arch:AVX2) else() target_compile_options(MNNSSE PRIVATE -msse4.1) target_compile_options(MNNAVX PRIVATE -mavx2 -DMNN_X86_USE_ASM) @@ -47,7 +90,12 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64) endif() endif() list(APPEND MNN_OBJECTS_TO_LINK $ $ $ $) - if (MNN_AVX512) + if (MSVC AND WIN_USE_ASM) + target_compile_options(MNNAVX PRIVATE -DMNN_X86_USE_ASM) + target_compile_options(MNNAVXFMA PRIVATE -DMNN_X86_USE_ASM) + list(APPEND MNN_OBJECTS_TO_LINK ${EXTRA_OBJS}) + endif() + if (MNN_AVX512 AND ((NOT MSVC) OR WIN_USE_ASM)) target_compile_options(MNNCPU PRIVATE -DMNN_AVX512) target_compile_options(MNNX8664 PRIVATE -DMNN_AVX512) if (MNN_AVX512_VNNI) diff --git a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S b/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S index 473048d0..73cbcc02 100644 --- a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S +++ b/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit.S @@ -24,12 +24,13 @@ asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain // SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post -// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter +// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides pushq %rbp movq %rsp, %rbp #ifdef WIN32 -movq 48(%rsp), %r10 +#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + callq + shadow_space +movq (push_registers_bytes)(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -41,6 +42,17 @@ movq %r9, %rcx movq %r10, %r9 pushq %r14 pushq %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 @@ -304,6 +316,17 @@ addq $64, %rsp End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r15 popq %r14 popq %r13 diff --git a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S b/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S index 91265c54..cb6a7690 100644 --- a/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S +++ b/source/backend/cpu/x86_x64/avx/_AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1.S @@ -24,12 +24,13 @@ asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1 // SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post -// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter +// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides pushq %rbp movq %rsp, %rbp #ifdef WIN32 -movq 48(%rsp), %r10 +#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + 
callq + shadow_space +movq (push_registers_bytes)(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -41,6 +42,17 @@ movq %r9, %rcx movq %r10, %r9 pushq %r14 pushq %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 @@ -190,6 +202,17 @@ addq $64, %rsp End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r15 popq %r14 popq %r13 diff --git a/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx1EFMA_ASM.S b/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx1EFMA_ASM.S index fb2a3d96..514a4d00 100644 --- a/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx1EFMA_ASM.S +++ b/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx1EFMA_ASM.S @@ -25,15 +25,39 @@ asm_function _AVX_MNNPackedSparseMatMulEpx1EFMA_ASM // SystemV Auto: rdi: packedParas, rsi: bias, rdx: parameter, rcx: postParameters +// Microsoft x64 Auto: rcx:packedParas, rdx:bias, r8:parameter, r9:postParameters // all callee save regs: // %rbx, %rbp, %r12~%r15 // unused para regs: %r8, %r9 // can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax - - pushq %rbp movq %rsp, %rbp + +#ifdef WIN32 +pushq %rdi +pushq %rsi +movq %rcx, %rdi +movq %rdx, %rsi +movq %r8, %rdx +movq %r9, %rcx +pushq %rbx +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) +#else pushq %rax pushq %rbx pushq %r8 @@ -42,7 +66,7 @@ pushq %r12 pushq %r13 pushq %r14 pushq %r15 - +#endif movq (%rdi), %rax // %rax C movq 8(%rdi), %rbx // %rbx A @@ -215,6 +239,27 @@ LoopE24H1: jmp LoopE24H1 End: + +#ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp +popq %r15 +popq %r14 +popq %r13 +popq %r12 +popq %rbx +popq %rsi +popq %rdi +#else popq %r15 popq %r14 popq %r13 @@ -223,6 +268,8 @@ popq %r9 popq %r8 popq %rbx popq %rax +#endif + popq %rbp retq diff --git a/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx4EFMA_ASM.S b/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx4EFMA_ASM.S index 8fd31b41..02191c15 100644 --- a/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx4EFMA_ASM.S +++ b/source/backend/cpu/x86_x64/avx/_AVX_MNNPackedSparseMatMulEpx4EFMA_ASM.S @@ -30,10 +30,33 @@ asm_function _AVX_MNNPackedSparseMatMulEpx4EFMA_ASM // %rbx, %rbp, %r12~%r15 // unused para regs: %r8, %r9 // can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax - - pushq %rbp movq %rsp, %rbp + +#ifdef WIN32 +pushq %rdi 
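+// Win64 ABI note: integer arguments arrive in rcx/rdx/r8/r9 (SystemV uses rdi/rsi/rdx/rcx),
+// rdi/rsi are callee-saved, and xmm6-xmm15 are nonvolatile. The surrounding pushes and moves
+// remap the arguments onto the SystemV registers the shared loop body expects, and the
+// vmovdqu spills below preserve xmm6-xmm15 so the kernel is free to clobber all vector regs.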
+pushq %rsi +movq %rcx, %rdi +movq %rdx, %rsi +movq %r8, %rdx +movq %r9, %rcx +pushq %rbx +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) +#else pushq %rax pushq %rbx pushq %r8 @@ -42,6 +65,7 @@ pushq %r12 pushq %r13 pushq %r14 pushq %r15 +#endif movq (%rdi), %rax // %rax C movq 8(%rdi), %rbx // %rbx A @@ -216,6 +240,26 @@ LoopE24H4: jmp LoopE24H4 End: +#ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp +popq %r15 +popq %r14 +popq %r13 +popq %r12 +popq %rbx +popq %rsi +popq %rdi +#else popq %r15 popq %r14 popq %r13 @@ -224,6 +268,8 @@ popq %r9 popq %r8 popq %rbx popq %rax +#endif + popq %rbp retq diff --git a/source/backend/cpu/x86_x64/avx512/SparseKernelFunction.hpp b/source/backend/cpu/x86_x64/avx512/SparseKernelFunction.hpp index 9bac9e53..5bf63b92 100644 --- a/source/backend/cpu/x86_x64/avx512/SparseKernelFunction.hpp +++ b/source/backend/cpu/x86_x64/avx512/SparseKernelFunction.hpp @@ -26,23 +26,29 @@ constexpr int AVX512F32 = 16; _mm_store_ps(dest + AVX512F32 * packCUnit * ablock + 4 * packCUnit * aSegment + packCUnit * 3, m128_3); \ } -#define STORE_VECTOR_AS_COLUMN(dest, ablock, packCUnit, vacc) \ - dest[AVX512F32 * packCUnit * ablock + 0] = vacc[0]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit] = vacc[1]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 2] = vacc[2]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 3] = vacc[3]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 4] = vacc[4]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 5] = vacc[5]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 6] = vacc[6]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 7] = vacc[7]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 8] = vacc[8]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 9] = vacc[9]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 10] = vacc[10]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 11] = vacc[11]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 12] = vacc[12]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 13] = vacc[13]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 14] = vacc[14]; \ - dest[AVX512F32 * packCUnit * ablock + packCUnit * 15] = vacc[15]; +inline void STORE_VECTOR_AS_COLUMN(float* dest, size_t ablock, size_t packCUnit, __m512 vacc) { + union { + __m512 v; + float f[16]; + } vacc_u; + vacc_u.v = vacc; + dest[AVX512F32 * packCUnit * ablock + 0] = vacc_u.f[0]; + dest[AVX512F32 * packCUnit * ablock + packCUnit] = vacc_u.f[1]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 2] = vacc_u.f[2]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 3] = vacc_u.f[3]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 4] = vacc_u.f[4]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 5] = vacc_u.f[5]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 6] = vacc_u.f[6]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 7] = vacc_u.f[7]; + dest[AVX512F32 
* packCUnit * ablock + packCUnit * 8] = vacc_u.f[8]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 9] = vacc_u.f[9]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 10] = vacc_u.f[10]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 11] = vacc_u.f[11]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 12] = vacc_u.f[12]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 13] = vacc_u.f[13]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 14] = vacc_u.f[14]; + dest[AVX512F32 * packCUnit * ablock + packCUnit * 15] = vacc_u.f[15]; +} #define TRANSPOSE4x8_STORE(dest, ablock, aSegment, packCUnit, v0, v3, v6, v9, v12, v15, v18, v21) { \ auto m0 = _mm512_extractf32x4_ps(v0, aSegment); \ @@ -125,14 +131,20 @@ constexpr int AVX512F32 = 16; _mm256_storeu_ps(dest + packCUnit * 7, t7); \ } -#define STORE_M256_VECTOR_AS_COLUMN(dest, packCUnit, vacc) \ - dest[0] = vacc[0]; \ - dest[packCUnit] = vacc[1]; \ - dest[packCUnit * 2] = vacc[2]; \ - dest[packCUnit * 3] = vacc[3]; \ - dest[packCUnit * 4] = vacc[4]; \ - dest[packCUnit * 5] = vacc[5]; \ - dest[packCUnit * 6] = vacc[6]; \ - dest[packCUnit * 7] = vacc[7]; +inline void STORE_M256_VECTOR_AS_COLUMN(float* dest, size_t packCUnit, __m256 vacc) { + union { + __m256 v; + float f[8]; + } vacc_u; + vacc_u.v = vacc; + dest[0] = vacc_u.f[0]; + dest[packCUnit] = vacc_u.f[1]; + dest[packCUnit * 2] = vacc_u.f[2]; + dest[packCUnit * 3] = vacc_u.f[3]; + dest[packCUnit * 4] = vacc_u.f[4]; + dest[packCUnit * 5] = vacc_u.f[5]; + dest[packCUnit * 6] = vacc_u.f[6]; + dest[packCUnit * 7] = vacc_u.f[7]; +} -#endif +#endif \ No newline at end of file diff --git a/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx1.cpp b/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx1.cpp index 4e788f04..adce20a1 100644 --- a/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx1.cpp +++ b/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx1.cpp @@ -228,9 +228,14 @@ void _AVX512_MNNPackedSparseMatMulEpx1(float* C, const float* A, const float* B, vacc0 = _mm256_min_ps(vacc0, _mm512_extractf32x8_ps(vmax, 0)); vacc0 = _mm256_max_ps(vacc0, _mm512_extractf32x8_ps(vmin, 0)); + union { + __m256 v; + float f[8]; + } vacc0_u; + vacc0_u.v = vacc0; // how to store faster: st4 / transpose for (auto iStore = 0; iStore < (taileSize & 0x07); iStore++) { - c[packCUnit * iStore] = vacc0[iStore]; + c[packCUnit * iStore] = vacc0_u.f[iStore]; } } // ie += taileSize; diff --git a/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx4.cpp b/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx4.cpp index e602718f..7cbd097a 100644 --- a/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx4.cpp +++ b/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx4.cpp @@ -647,10 +647,15 @@ void _AVX512_MNNPackedSparseMatMulEpx4(float* C, const float* A, const float* B, vacc0 = _mm_min_ps(vacc0, _mm512_extractf32x4_ps(vmax, 0)); vacc0 = _mm_max_ps(vacc0, _mm512_extractf32x4_ps(vmin, 0)); - c[0] = vacc0[0]; - c[packCUnit] = vacc0[1]; - c[packCUnit * 2] = vacc0[2]; - c[+packCUnit * 3] = vacc0[3]; + union { + __m128 v; + float f[4]; + } vacc0_u; + vacc0_u.v = vacc0; + c[0] = vacc0_u.f[0]; + c[packCUnit] = vacc0_u.f[1]; + c[packCUnit * 2] = vacc0_u.f[2]; + c[+packCUnit * 3] = vacc0_u.f[3]; } ie += 4; a += 4; @@ -735,8 +740,13 @@ void _AVX512_MNNPackedSparseMatMulEpx4(float* C, const float* A, const float* B, vacc0 = _mm_min_ps(vacc0, _mm512_extractf32x4_ps(vmax, 0)); vacc0 = _mm_max_ps(vacc0, _mm512_extractf32x4_ps(vmin, 0)); - c[0] = vacc0[0]; - c[packCUnit] = 
vacc0[1]; + union { + __m128 v; + float f[4]; + } vacc0_u; + vacc0_u.v = vacc0; + c[0] = vacc0_u.f[0]; + c[packCUnit] = vacc0_u.f[1]; } ie += 2; a += 2; diff --git a/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx8.cpp b/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx8.cpp index 3d22dd16..5f7ffa94 100644 --- a/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx8.cpp +++ b/source/backend/cpu/x86_x64/avx512/SparseKernelFunctionEpx8.cpp @@ -789,10 +789,15 @@ void _AVX512_MNNPackedSparseMatMulEpx8(float* C, const float* A, const float* B, vacc0 = _mm_min_ps(vacc0, vmax); vacc0 = _mm_max_ps(vacc0, vmin); - c[0] = vacc0[0]; - c[packCUnit] = vacc0[1]; - c[packCUnit * 2] = vacc0[2]; - c[packCUnit * 3] = vacc0[3]; + union { + __m128 v; + float f[4]; + } vacc0_u; + vacc0_u.v = vacc0; + c[0] = vacc0_u.f[0]; + c[packCUnit] = vacc0_u.f[1]; + c[packCUnit * 2] = vacc0_u.f[2]; + c[packCUnit * 3] = vacc0_u.f[3]; } ie += 4; a += 4; @@ -877,8 +882,13 @@ void _AVX512_MNNPackedSparseMatMulEpx8(float* C, const float* A, const float* B, vacc0 = _mm_min_ps(vacc0, vmax); vacc0 = _mm_max_ps(vacc0, vmin); - c[0] = vacc0[0]; - c[packCUnit] = vacc0[1]; + union { + __m128 v; + float f[4]; + } vacc0_u; + vacc0_u.v = vacc0; + c[0] = vacc0_u.f[0]; + c[packCUnit] = vacc0_u.f[1]; } ie += 2; a += 2; diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit16x8.S b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit16x8.S index 874076da..57e3c339 100644 --- a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit16x8.S +++ b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit16x8.S @@ -11,16 +11,15 @@ .align 4 asm_function _AVX512_MNNGemmFloatUnit16x8 -//void _AVX512_MNNGemmFloatUnit16x8(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4) +//void _AVX512_MNNGemmFloatUnit16x8(float* C, const float* A, const float* B, const size_t* parameter) -// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: hC4 +// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter // Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter pushq %rbp movq %rsp, %rbp pushq %rbx #ifdef WIN32 -movq 48(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -30,12 +29,21 @@ movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx movq %r9, %rcx -movq %r10, %r9 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 pushq %r14 -movq %r8, %r9 #endif movq 40(%rcx), %r10 // bExtraStride @@ -266,6 +274,17 @@ LoopDz: End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r14 popq %r13 popq %r12 diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit32x8.S b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit32x8.S index ada1c521..d3ff9575 100644 --- a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit32x8.S +++ b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit32x8.S @@ -11,16 +11,15 @@ .align 4 asm_function _AVX512_MNNGemmFloatUnit32x8 -//void 
_AVX512_MNNGemmFloatUnit32x8(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4) +//void _AVX512_MNNGemmFloatUnit32x8(float* C, const float* A, const float* B, const size_t* parameter) -// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: hC4 +// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter // Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter pushq %rbp movq %rsp, %rbp pushq %rbx #ifdef WIN32 -movq 48(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -30,12 +29,21 @@ movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx movq %r9, %rcx -movq %r10, %r9 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 pushq %r14 -movq %r8, %r9 #endif movq 40(%rcx), %r10 // bExtraStride @@ -301,6 +309,17 @@ LoopDz: End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r14 popq %r13 popq %r12 diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8.S b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8.S index 6fc81941..72068a1e 100644 --- a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8.S +++ b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8.S @@ -11,16 +11,15 @@ .align 4 asm_function _AVX512_MNNGemmFloatUnit48x8 -//void _AVX512_MNNGemmFloatUnit48x8(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4) +//void _AVX512_MNNGemmFloatUnit48x8(float* C, const float* A, const float* B, const size_t* parameter) -// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: hC4 +// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter // Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter pushq %rbp movq %rsp, %rbp pushq %rbx #ifdef WIN32 -movq 48(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -29,11 +28,20 @@ movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx movq %r9, %rcx -movq %r10, %r9 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 -movq %r8, %r9 #endif movq 40(%rcx), %r10 // bExtraStride @@ -336,10 +344,22 @@ LoopDz: End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r13 popq %r12 popq %rsi popq %rdi +popq %rbx popq %rbp #else popq %r13 diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8Fused.S b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8Fused.S index a5396e32..da85739b 100644 --- a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8Fused.S +++ 
b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNGemmFloatUnit48x8Fused.S @@ -14,9 +14,22 @@ asm_function _AVX512_MNNGemmFloatUnit48x8Fused //void _AVX512_MNNGemmFloatUnit48x8Fused(float* C, const float* A, const float* B, const size_t* parameter, const float* p, const float* bias) // SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: postParameters, r9:bias + +// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter +// stack: postParameters, bias pushq %rbp movq %rsp, %rbp +#ifdef WIN32 +pushq %rdi +pushq %rsi +movq %rcx, %rdi +movq %rdx, %rsi +movq %r8, %rdx +movq %r9, %rcx +#define push_registers_bytes ((3 + 1) * 8 + 32) // pushq + callq + shadow_space +movq (push_registers_bytes)(%rsp), %r8 // postParameters +movq (push_registers_bytes + 8)(%rsp), %r9 // bias pushq %rbx pushq %r12 pushq %r13 @@ -24,6 +37,26 @@ pushq %r14 pushq %r15 movq %r8, %r14 movq %r9, %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) +#else +pushq %rbx +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +movq %r8, %r14 +movq %r9, %r15 +#endif movq 40(%rcx), %r10 // bExtraStride movq 24(%rcx), %r8 // cStride @@ -402,12 +435,33 @@ LoopDz: End: +#ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx -popq %rbp +popq %rsi +popq %rdi +#else +popq %r15 +popq %r14 +popq %r13 +popq %r12 +popq %rbx +#endif +popq %rbp retq diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNPackedSparseMatMulEpx4.S b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNPackedSparseMatMulEpx4.S index cfca491b..384e80a6 100644 --- a/source/backend/cpu/x86_x64/avx512/_AVX512_MNNPackedSparseMatMulEpx4.S +++ b/source/backend/cpu/x86_x64/avx512/_AVX512_MNNPackedSparseMatMulEpx4.S @@ -12,8 +12,6 @@ #define AVX512F32 16 -#define push_registers_bytes ((9 + 1) * 8) // pushq + callq - // caution: asm version is a sub-loop of _AVX512_MNNPackedSparseMatMulEpx4() // void _AVX512_MNNPackedSparseMatMulEpx4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, // const float* postParameters, const float* bias, unsigned int* NNZMap, @@ -22,8 +20,29 @@ asm_function _AVX512_MNNPackedSparseMatMulEpx4_ASM // SystemV Auto: rdi: C, rsi: A, rdx:B, rcx: eSize, r8: parameter, r9: postparameter, // stack: bias, unsigned int* NNZMap, int* dataOffsetMap +// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:eSize +// stack: parameter, postParameters, bias, unsigned int* NNZMap, int* dataOffsetMap + pushq %rbp movq %rsp, %rbp + +#ifdef WIN32 +pushq %rdi +pushq %rsi +movq %rcx, %rdi +movq %rdx, %rsi +movq %r8, %rdx +movq %r9, %rcx +pushq %rbx +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +#define push_registers_bytes_ ((8 + 1) * 8 + 32) // pushq + callq + shadow_space +movq (push_registers_bytes_)(%rsp), %r8 // parameter +movq (push_registers_bytes_ + 8)(%rsp), %r9 // postparameter +#define push_registers_bytes (push_registers_bytes_ + 2 * 8) // pushq + callq + shadow_space + extra +#else pushq %rax pushq %rbx pushq %r8 @@ -32,7 +51,8 @@ 
pushq %r12 pushq %r13 pushq %r14 pushq %r15 - +#define push_registers_bytes ((9 + 1) * 8) // pushq + callq +#endif movq (%r8), %r10 // eP * sizeof shrq $(sizeof_value_lg2), %r10 @@ -65,8 +85,8 @@ vbroadcastss 8(%r9), %zmm10 vbroadcastss 12(%r9), %zmm11 movq %r10, %r14 -shrq $sparse_blockoc_log, %r14 -shlq $sparse_blockoc_log, %r14 // h even divid sparse_blockoc +shrq $(sparse_blockoc_log), %r14 +shlq $(sparse_blockoc_log), %r14 // h even divid sparse_blockoc movq (push_registers_bytes)(%rsp), %rdx // bias movq (push_registers_bytes + 8)(%rsp), %rdi // unsigned int* NNZMap, @@ -79,6 +99,20 @@ movq (push_registers_bytes + 16)(%rsp), %rsi // int* dataOffsetMap // movq %r8, %rdi // movq %r9, %rsi +#ifdef WIN32 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) +#endif + movslq (%rsi), %r15 leaq (%rax, %r15, 4), %rax // a = a + diff; addq $4, %rsi // dataOffsetMap++ @@ -90,7 +124,7 @@ je loop_e48h4_end loop_e48h4: movq %r8, %r9 movq %r8, %r12 - shrq $packC_unit_log, %r9 + shrq $(packC_unit_log), %r9 andq $15, %r12 // ih % packC_unit leaq (%rcx, %r12, sizeof_value), %r12 imulq %r11, %r9 // (ih >> packC_unit_log) * cStride @@ -246,7 +280,7 @@ loop_e48h4: subq $4, %rsi // dataOffsetMap-- movslq (%rsi), %r15 - addq $sparse_blockoc, %r8 + addq $(sparse_blockoc), %r8 addq $4, %rdi negq %r15 leaq (%rax, %r15, sizeof_value), %rax // a = a - diff; @@ -284,7 +318,7 @@ je loop_end loop_e48h1: movq %r8, %r9 movq %r8, %r12 - shrq $packC_unit_log, %r9 + shrq $(packC_unit_log), %r9 andq $15, %r12 // ih % packC_unit leaq (%rcx, %r12, sizeof_value), %r12 imulq %r11, %r9 // (ih >> packC_unit_log) * cStride @@ -433,15 +467,37 @@ loop_e48h1_end: loop_end: -popq %r15 -popq %r14 -popq %r13 -popq %r12 -popq %r9 -popq %r8 -popq %rbx -popq %rax -popq %rbp +#ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp +popq %r15 +popq %r14 +popq %r13 +popq %r12 +popq %rbx +popq %rsi +popq %rdi +#else +popq %r15 +popq %r14 +popq %r13 +popq %r12 +popq %r9 +popq %r8 +popq %rbx +popq %rax +#endif + +popq %rbp retq diff --git a/source/backend/cpu/x86_x64/avx512/_AVX512_TransposeMain.S b/source/backend/cpu/x86_x64/avx512/_AVX512_TransposeMain.S index 15b497e2..ab730d30 100644 --- a/source/backend/cpu/x86_x64/avx512/_AVX512_TransposeMain.S +++ b/source/backend/cpu/x86_x64/avx512/_AVX512_TransposeMain.S @@ -21,7 +21,6 @@ pushq %rbp movq %rsp, %rbp #ifdef WIN32 -movq 48(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -31,7 +30,17 @@ movq %rcx, %rdi movq %rdx, %rsi movq %r8, %rdx movq %r9, %rcx -movq %r10, %r9 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 @@ -179,6 +188,17 @@ Loop: End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu 
(128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r14 popq %r13 popq %r12 diff --git a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA.S b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA.S index de252fb4..47d1d8d5 100644 --- a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA.S +++ b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA.S @@ -19,7 +19,8 @@ pushq %rbp movq %rsp, %rbp #ifdef WIN32 -movq 48(%rsp), %r10 +#define push_registers_bytes ((1 + 1) * 8 + 32) +movq (push_registers_bytes)(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -29,6 +30,17 @@ movq %rdx, %rsi movq %r8, %rdx movq %r9, %rcx movq %r10, %r9 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 @@ -216,6 +228,17 @@ LoopDz: End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r13 popq %r12 popq %rsi diff --git a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA6x16.S b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA6x16.S index e866bdb7..22e541aa 100644 --- a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA6x16.S +++ b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA6x16.S @@ -19,7 +19,8 @@ pushq %rbp movq %rsp, %rbp #ifdef WIN32 -movq 48(%rsp), %r10 +#define push_registers_bytes ((1 + 1) * 8 + 32) +movq (push_registers_bytes)(%rsp), %r10 pushq %rdi pushq %rsi pushq %r12 @@ -29,6 +30,17 @@ movq %rdx, %rsi movq %r8, %rdx movq %r9, %rcx movq %r10, %r9 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) #else pushq %r12 pushq %r13 @@ -191,6 +203,17 @@ LoopDz: End: #ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r13 popq %r12 popq %rsi diff --git a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA_Fused.S b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA_Fused.S index 8cafcd72..74fe0857 100644 --- a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA_Fused.S +++ b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNGemmFloatUnitMainFMA_Fused.S @@ -18,12 +18,41 @@ asm_function _AVX_MNNGemmFloatUnitMainFMA_Fused pushq %rbp movq %rsp, %rbp +#ifdef WIN32 +pushq %rdi +pushq %rsi +movq %rcx, %rdi +movq %rdx, 
%rsi +movq %r8, %rdx +movq %r9, %rcx +#define push_registers_bytes ((3 + 1) * 8 + 32) // pushq + callq + shadow_space +movq (push_registers_bytes)(%rsp), %r8 +movq (push_registers_bytes + 8)(%rsp), %r9 pushq %r12 pushq %r13 pushq %r14 pushq %r15 movq %r8, %r14 movq %r9, %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) +#else +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +movq %r8, %r14 +movq %r9, %r15 +#endif movq 40(%rcx), %r10 // bExtraStride movq 24(%rcx), %r8 // cStride @@ -232,10 +261,30 @@ LoopDz: End: +#ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp popq %r15 popq %r14 popq %r13 popq %r12 +popq %rsi +popq %rdi +#else +popq %r15 +popq %r14 +popq %r13 +popq %r12 +#endif popq %rbp retq diff --git a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx1NFMA_ASM.S b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx1NFMA_ASM.S index 2dec6291..66fbc798 100644 --- a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx1NFMA_ASM.S +++ b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx1NFMA_ASM.S @@ -25,15 +25,39 @@ asm_function _AVX_MNNPackedSparseMatMulEpx1NFMA_ASM // SystemV Auto: rdi: packedParas, rsi: bias, rdx: parameter, rcx: postParameters +// Microsoft x64 Auto: rcx:packedParas, rdx:bias, r8:parameter, r9:postParameters // all callee save regs: // %rbx, %rbp, %r12~%r15 // unused para regs: %r8, %r9 // can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax - - pushq %rbp movq %rsp, %rbp + +#ifdef WIN32 +pushq %rdi +pushq %rsi +movq %rcx, %rdi +movq %rdx, %rsi +movq %r8, %rdx +movq %r9, %rcx +pushq %rbx +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) +#else pushq %rax pushq %rbx pushq %r8 @@ -42,6 +66,7 @@ pushq %r12 pushq %r13 pushq %r14 pushq %r15 +#endif movq (%rdi), %rax // %rax C @@ -203,6 +228,26 @@ LoopE24H1: jmp LoopE24H1 End: +#ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp +popq %r15 +popq %r14 +popq %r13 +popq %r12 +popq %rbx +popq %rsi +popq %rdi +#else popq %r15 popq %r14 popq %r13 @@ -211,6 +256,8 @@ popq %r9 popq %r8 popq %rbx popq %rax +#endif + popq %rbp retq diff --git a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx4NFMA_ASM.S b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx4NFMA_ASM.S index 9d97066f..e953d19d 100644 --- 
a/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx4NFMA_ASM.S +++ b/source/backend/cpu/x86_x64/avxfma/_AVX_MNNPackedSparseMatMulEpx4NFMA_ASM.S @@ -25,15 +25,39 @@ asm_function _AVX_MNNPackedSparseMatMulEpx4NFMA_ASM // SystemV Auto: rdi: packedParas, rsi: bias, rdx: parameter, rcx: postParameters +// Microsoft x64 Auto: rcx:packedParas, rdx:bias, r8:parameter, r9:postParameters // all callee save regs: // %rbx, %rbp, %r12~%r15 // unused para regs: %r8, %r9 // can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax - - pushq %rbp movq %rsp, %rbp + +#ifdef WIN32 +pushq %rdi +pushq %rsi +movq %rcx, %rdi +movq %rdx, %rsi +movq %r8, %rdx +movq %r9, %rcx +pushq %rbx +pushq %r12 +pushq %r13 +pushq %r14 +pushq %r15 +leaq (-1280)(%rsp), %rsp +vmovdqu %xmm6, (128*0)(%rsp) +vmovdqu %xmm7, (128*1)(%rsp) +vmovdqu %xmm8, (128*2)(%rsp) +vmovdqu %xmm9, (128*3)(%rsp) +vmovdqu %xmm10, (128*4)(%rsp) +vmovdqu %xmm11, (128*5)(%rsp) +vmovdqu %xmm12, (128*6)(%rsp) +vmovdqu %xmm13, (128*7)(%rsp) +vmovdqu %xmm14, (128*8)(%rsp) +vmovdqu %xmm15, (128*9)(%rsp) +#else pushq %rax pushq %rbx pushq %r8 @@ -42,6 +66,7 @@ pushq %r12 pushq %r13 pushq %r14 pushq %r15 +#endif movq (%rdi), %rax // %rax C movq 8(%rdi), %rbx // %rbx A @@ -195,6 +220,26 @@ LoopE24H4: jmp LoopE24H4 End: +#ifdef WIN32 +vmovdqu (128*0)(%rsp), %xmm6 +vmovdqu (128*1)(%rsp), %xmm7 +vmovdqu (128*2)(%rsp), %xmm8 +vmovdqu (128*3)(%rsp), %xmm9 +vmovdqu (128*4)(%rsp), %xmm10 +vmovdqu (128*5)(%rsp), %xmm11 +vmovdqu (128*6)(%rsp), %xmm12 +vmovdqu (128*7)(%rsp), %xmm13 +vmovdqu (128*8)(%rsp), %xmm14 +vmovdqu (128*9)(%rsp), %xmm15 +leaq (1280)(%rsp), %rsp +popq %r15 +popq %r14 +popq %r13 +popq %r12 +popq %rbx +popq %rsi +popq %rdi +#else popq %r15 popq %r14 popq %r13 @@ -203,6 +248,8 @@ popq %r9 popq %r8 popq %rbx popq %rax +#endif + popq %rbp retq diff --git a/source/backend/cuda/CMakeLists.txt b/source/backend/cuda/CMakeLists.txt index 027051b3..53abc702 100644 --- a/source/backend/cuda/CMakeLists.txt +++ b/source/backend/cuda/CMakeLists.txt @@ -56,15 +56,15 @@ message(STATUS "message ${CUDA_NVCC_FLAGS} !!!!!!!!!!!") if(WIN32) cuda_add_library(MNN_CUDA STATIC Register.cpp ${MNN_CUDA_SRC}) - string(REPLACE "cublas.lib" "cudnn.lib" CUDNN_LIBRARIES ${CUDA_CUBLAS_LIBRARIES}) - set(MNN_CUDA_LIBS MNN_CUDA ${CUDNN_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_LIBRARIES} PARENT_SCOPE) + set(MNN_CUDA_LIBS MNN_CUDA ${CUDA_LIBRARIES} PARENT_SCOPE) else() cuda_add_library(MNN_Cuda_Main SHARED ${MNN_CUDA_SRC}) - set(MNN_CUDA_LIBS MNN_Cuda_Main cudnn cublas PARENT_SCOPE) + set(MNN_CUDA_LIBS MNN_Cuda_Main PARENT_SCOPE) add_library(MNN_CUDA OBJECT Register.cpp) endif() include_directories( + ${CMAKE_CURRENT_LIST_DIR}/ ${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/include/ ) diff --git a/source/backend/cuda/core/CUDABackend.cpp b/source/backend/cuda/core/CUDABackend.cpp index 5e26acdb..8e5cc7e6 100644 --- a/source/backend/cuda/core/CUDABackend.cpp +++ b/source/backend/cuda/core/CUDABackend.cpp @@ -14,6 +14,11 @@ #include "core/Macro.h" #include "shape/SizeComputer.hpp" #include "core/TensorUtils.hpp" +#include "execution/Raster.cuh" +#include "execution/Transpose.cuh" +#include "execution/MNNCUDADefine.hpp" + +// #define MNN_CUDA_COPY_DEBUG namespace MNN { namespace CUDA { @@ -30,22 +35,18 @@ public: // Do nothing } virtual ~ CUDARuntimeAllocator() = default; - virtual std::pair onAlloc(int size, int align) override { + virtual std::pair onAlloc(size_t size, size_t align) override { return std::make_pair(mRuntime->alloc(size), 0); } - virtual void onRelease(std::pair 
- virtual void onRelease(std::pair<void*, int> ptr) override { + virtual void onRelease(std::pair<void*, size_t> ptr) override { mRuntime->free(ptr.first); } private: CUDARuntime* mRuntime; }; CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, BackendConfig::PowerMode power) { - // Shader precision - if (precision == BackendConfig::Precision_Low) { - mCUDARuntime.reset(new CUDARuntime(true, -1)); - } else { - mCUDARuntime.reset(new CUDARuntime(false, -1)); - } + // TODO: Search CUDA Device info and use best one + mCUDARuntime.reset(new CUDARuntime(-1)); if (mCUDARuntime.get()) { if (mCUDARuntime->isCreateError() == true) { mIsCreateError = true; @@ -54,6 +55,7 @@ CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, B std::shared_ptr<BufferAllocator::Allocator> allocator(new CUDARuntimeAllocator(mCUDARuntime.get())); mBufferPool.reset(new BufferAllocator(allocator)); } + mDefaultPrecision = precision; } CUDARuntimeWrapper::~CUDARuntimeWrapper() { // Do nothing @@ -64,7 +66,12 @@ float CUDARuntimeWrapper::onGetMemoryInMB() { } Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config) const { - return new CUDABackend(mBufferPool, mCUDARuntime); + auto mode = mDefaultPrecision; + if (nullptr != config) { + mode = config->precision; + } + bool useFp16 = mode == BackendConfig::Precision_Low; + return new CUDABackend(mBufferPool, mCUDARuntime, useFp16); } void CUDARuntimeWrapper::onGabageCollect(int level) { @@ -72,11 +79,12 @@ void CUDARuntimeWrapper::onGabageCollect(int level) { } CUDABackend::CUDABackend(std::shared_ptr<BufferAllocator> st, - std::shared_ptr<CUDARuntime> rt) + std::shared_ptr<CUDARuntime> rt, bool useFp16AsFp32) : Backend(MNN_FORWARD_CUDA) { mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(st.get()))); mStaticBufferPool = st; mCUDARuntime = rt; + mUseFp16AsFp32 = useFp16AsFp32; } CUDABackend::~CUDABackend() { @@ -89,6 +97,9 @@ CUDARuntime* CUDABackend::getCUDARuntime() { MNN_ASSERT(nullptr != mCUDARuntime.get()); return mCUDARuntime.get(); } +bool CUDABackend::useFp16() const { + return mUseFp16AsFp32; +} class CUDAMemObj : public Backend::MemObj { public: @@ -103,12 +114,27 @@ private: BufferAllocator* mAllocator; std::pair<void*, size_t> mPoint; }; +int CUDABackend::getBytes(const Tensor* tensor) const { + auto bytes = tensor->getType().bytes(); + if (mUseFp16AsFp32) { + if (halide_type_float == tensor->getType().code) { + bytes = 2; + } + } + return bytes; +} +CPUResizeCache* CUDABackend::getCache() { + return &mCache; +} + Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType storageType) { #ifdef LOG_VERBOSE MNN_PRINT("Start CUDABackend::onAcquireBuffer !\n"); #endif BufferAllocator* allocator = nullptr; - int mallocSize = realSize(nativeTensor) * nativeTensor->getType().bytes(); + auto bytes = getBytes(nativeTensor); + size_t mallocSize = realSize(nativeTensor) * bytes; + std::pair<void*, size_t> buffer; if (storageType == DYNAMIC_SEPERATE) { buffer = mBufferPool->alloc(mallocSize, true); @@ -132,13 +158,23 @@ } bool CUDABackend::onClearBuffer() { + mCache.reset(); mBufferPool->release(true); return true; } size_t CUDABackend::realSize(const Tensor* tensor) { + auto dim = TensorUtils::getDescribe(tensor)->dimensionFormat; + int pack = 1; + if (dim == MNN_DATA_FORMAT_NC4HW4) { + pack = PACK_NUMBER; + } size_t res = 1; for (int i = 0; i < tensor->dimensions(); ++i) { - res *= tensor->length(i); + size_t l = tensor->length(i); + if (1 == i ) { + l = UP_DIV(l, pack) * pack; + } + res *= l; } return res; } @@ -186,47 +222,332 @@ void CUDABackend::onExecuteBegin() const {
void CUDABackend::onExecuteEnd() const { } +static void _computeStride(MNN_DATA_FORMAT srcDimensionFormat, int* srcStride, int batch, int plane, int channel, int srcPack) { + if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + srcStride[0] = plane * srcPack; + srcStride[1] = plane * batch * PACK_NUMBER; + srcStride[2] = srcPack; + } else if (srcDimensionFormat == MNN_DATA_FORMAT_NCHW) { + srcStride[0] = channel * plane; + srcStride[1] = plane * PACK_NUMBER; + srcStride[2] = 1; + } else { + srcStride[0] = channel * plane; + srcStride[1] = PACK_NUMBER; + srcStride[2] = channel; + } +} + +static void _computeBCA(int& batch, int& plane, int& channel, MNN_DATA_FORMAT srcDimensionFormat, const Tensor* srcTensor) { + if (srcDimensionFormat != MNN_DATA_FORMAT_NHWC) { + batch = srcTensor->length(0); + channel = srcTensor->length(1); + plane = 1; + for (int i=2; i<srcTensor->dimensions(); ++i) { + plane *= srcTensor->length(i); + } + } else { + batch = srcTensor->length(0); + channel = srcTensor->length(srcTensor->dimensions()-1); + plane = 1; + for (int i=1; i<srcTensor->dimensions()-1; ++i) { + plane *= srcTensor->length(i); + } + } +} + +static PackInfo _computePackInfo(MNN_DATA_FORMAT srcDimensionFormat, int batch, int plane, int channel) { + PackInfo pack; + pack.inside = plane; + pack.axis = channel; + pack.unit = PACK_NUMBER; + pack.outside = batch; + if (srcDimensionFormat == MNN_DATA_FORMAT_NHWC) { + pack.axisStride = 1; + pack.insideStride = channel; + } else { + pack.axisStride = plane; + pack.insideStride = 1; + } + return pack; +} void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const { auto srcDimensionFormat = TensorUtils::getDescribe(srcTensor)->dimensionFormat; - auto srcDevice = srcTensor->deviceId() != 0; - auto dstDimensionFormat = TensorUtils::getDescribe(dstTensor)->dimensionFormat; + auto srcDevice = srcTensor->deviceId() != 0; auto dstDevice = dstTensor->deviceId() != 0; - if (srcDevice && srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { - srcDimensionFormat = MNN_DATA_FORMAT_NCHW; + MNN_ASSERT(srcDevice || dstDevice); + uint8_t* srcPtr = nullptr; + std::pair<void*, size_t> tempSrcStorage; + auto bytes = getBytes(srcTensor); + auto type = srcTensor->getType(); +#ifdef MNN_CUDA_COPY_DEBUG + MNN_PRINT("CUDA Bn copy: %d -> %d, format %d -> %d, dims: [", srcDevice, dstDevice, srcDimensionFormat, dstDimensionFormat); + for (int i=0; i<srcTensor->dimensions(); ++i) { + MNN_PRINT("%d ", srcTensor->length(i)); } - if (dstDevice && dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { - dstDimensionFormat = MNN_DATA_FORMAT_NCHW; + MNN_PRINT("]\n"); +#endif + bool directCopy = (srcDimensionFormat == dstDimensionFormat && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) || srcTensor->dimensions() <= 1; + if (mUseFp16AsFp32) { + if ((!srcDevice) || (!dstDevice)) { + if (type.code == halide_type_float) { + directCopy = false; + } + } } - auto needSize = realSize(srcTensor) * srcTensor->getType().bytes(); - std::shared_ptr<Tensor> srcTempTensor; - std::shared_ptr<Tensor> dstTempTensor; - - if (srcTensor->deviceId() != 0 && dstTensor->deviceId() != 0) { - mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), needSize, - MNNMemcpyDeviceToDevice, true); + if (directCopy) { + auto gpuSize = realSize(srcTensor) * getBytes(srcTensor); + if (srcDevice && dstDevice) { + mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), gpuSize, + MNNMemcpyDeviceToDevice, true); + } else if (srcDevice && (!dstDevice)) { +
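/* device-to-host: read the GPU buffer straight into the host tensor */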
mCUDARuntime->memcpy((void*)(dstTensor->host()), (void*)(srcTensor->deviceId()), gpuSize, + MNNMemcpyDeviceToHost, true); + } else if ((!srcDevice) && (dstDevice)) { + mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->host()), gpuSize, + MNNMemcpyHostToDevice, true); + } + return; + } + if (!srcDevice) { + auto cpuSize = srcTensor->size(); + tempSrcStorage = mStaticBufferPool->alloc(cpuSize); + srcPtr = (uint8_t*)tempSrcStorage.first + tempSrcStorage.second; + mCUDARuntime->memcpy(srcPtr, srcTensor->host(), cpuSize, MNNMemcpyHostToDevice, + true); + } else { + srcPtr = (uint8_t*)srcTensor->deviceId(); + } + uint8_t* dstPtr = nullptr; + std::pair<void*, size_t> tempDstStorage; + if (!dstDevice) { + auto cpuSize = dstTensor->size(); + tempDstStorage = mStaticBufferPool->alloc(cpuSize); + dstPtr = (uint8_t*)tempDstStorage.first + tempDstStorage.second; + } else { + dstPtr = (uint8_t*)dstTensor->deviceId(); } - if (srcTensor->deviceId() != 0 && dstTensor->deviceId() == 0) { - if(srcDimensionFormat != dstDimensionFormat) { - dstTempTensor.reset(new Tensor(srcTensor, srcTensor->getDimensionType(), true)); - mCUDARuntime->memcpy(dstTempTensor->host(), (void*)(srcTensor->deviceId()), needSize, MNNMemcpyDeviceToHost, - true); - MNNCPUCopyBuffer(dstTempTensor.get(), dstTensor); + // Format convert + FuseRegion reg; + int* size = reg.size; + int* srcStride = reg.srcStride; + int* dstStride = reg.dstStride; + int offset[PACK_NUMBER * 8]; + int offsetNumber = 0; + auto offsetGpuStorage = mStaticBufferPool->alloc(PACK_NUMBER * 8 * sizeof(int)); + auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second; + auto regionStorage = mStaticBufferPool->alloc(sizeof(FuseRegion)); + auto regionGpu = (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second); + + do { + if (srcTensor->deviceId() != 0 && dstTensor->deviceId() != 0) { + if (srcTensor->dimensions() <= 1 || srcDimensionFormat == dstDimensionFormat) { + auto gpuSize = realSize(srcTensor) * getBytes(srcTensor); + mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), gpuSize, + MNNMemcpyDeviceToDevice, true); + } else { + int batch, plane, channel; + _computeBCA(batch, plane, channel, srcDimensionFormat, srcTensor); + PackInfo pack; + auto func = PackBuffer; + if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + pack = _computePackInfo(srcDimensionFormat, batch, plane, channel); + func = PackBuffer; + } else if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + pack = _computePackInfo(dstDimensionFormat, batch, plane, channel); + func = UnpackBuffer; + } else { + FUNC_PRINT(1); + } + func((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), &pack, getBytes(srcTensor), mCUDARuntime.get()); + } + break; + } + auto convertFunction = FuseRasterBlitFloatToFloat; + if (mUseFp16AsFp32) { + if (!srcDevice) { + convertFunction = FuseRasterBlitFloatToHalf; + } else { + convertFunction = FuseRasterBlitHalfToFloat; + } + } + if (srcTensor->dimensions() <= 1) { + size[2] = srcTensor->elementSize(); + srcStride[2] = 1; + dstStride[2] = 1; + offset[0] = 1; + offset[1] = 1; + offset[2] = size[2]; + offset[3] = 0; + offset[4] = 1; + offset[5] = 1; + offset[6] = size[2]; + offset[7] = 0; + offsetNumber = 1; } else { - mCUDARuntime->memcpy(dstTensor->host(), (void*)(srcTensor->deviceId()), needSize, MNNMemcpyDeviceToHost, - true); + // Compute batch, plane, channel + int batch, plane, channel; + _computeBCA(batch, plane, channel, srcDimensionFormat, srcTensor);
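+ // Fast path: when exactly one side is a device-resident NC4HW4 tensor, a single pack (or unpack) blit below handles the layout change and, in fp16 mode, the float/half conversion in one pass.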
+ if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4 && srcDimensionFormat != MNN_DATA_FORMAT_NC4HW4 && dstDevice) { + PackInfo pack = _computePackInfo(srcDimensionFormat, batch, plane, channel); + if (mUseFp16AsFp32) { + if (type.code == halide_type_float) { + if (dstDevice) { + PackFP32ToFP16(dstPtr, srcPtr, &pack, mCUDARuntime.get()); + break; + } else { + PackFP16ToFP32(dstPtr, srcPtr, &pack, mCUDARuntime.get()); + break; + } + } + } else { + PackBuffer(dstPtr, srcPtr, &pack, bytes, mCUDARuntime.get()); + } + break; + } + if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4 && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4 && srcDevice) { + PackInfo pack = _computePackInfo(dstDimensionFormat, batch, plane, channel); + if (mUseFp16AsFp32) { + if (type.code == halide_type_float) { + if (dstDevice) { + UnpackFP32ToFP16(dstPtr, srcPtr, &pack, mCUDARuntime.get()); + break; + } else { + UnpackFP16ToFP32(dstPtr, srcPtr, &pack, mCUDARuntime.get()); + break; + } + } + } else { + UnpackBuffer(dstPtr, srcPtr, &pack, bytes, mCUDARuntime.get()); + } + break; + } + //MNN_PRINT("host/device: %d -> %d, format %d -> %d, b, p, c: %d - %d - %d\n", srcDevice, dstDevice, srcDimensionFormat, dstDimensionFormat, batch, plane, channel); + // Set region + if (srcDimensionFormat != MNN_DATA_FORMAT_NC4HW4 && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) { + size[0] = batch; + size[1] = channel; + size[2] = plane; + offsetNumber = 1; + offset[0] = batch; + offset[1] = channel; + offset[2] = plane; + offset[3] = 0; + offset[4] = batch; + offset[5] = channel; + offset[6] = plane; + offset[7] = 0; + if (srcDimensionFormat == MNN_DATA_FORMAT_NHWC) { + srcStride[0] = channel * plane; + srcStride[1] = 1; + srcStride[2] = channel; + } else { + srcStride[0] = channel * plane; + srcStride[1] = plane; + srcStride[2] = 1; + } + if (dstDimensionFormat == MNN_DATA_FORMAT_NHWC) { + dstStride[0] = channel * plane; + dstStride[1] = 1; + dstStride[2] = channel; + } else { + dstStride[0] = channel * plane; + dstStride[1] = plane; + dstStride[2] = 1; + } + } else { + offsetNumber = PACK_NUMBER; + size[0] = batch; + size[1] = UP_DIV(channel, PACK_NUMBER); + size[2] = plane; + int srcPack = 1; + int dstPack = 1; + int srcChannelLimit = channel; + if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + if (srcDevice) { + srcPack = PACK_NUMBER; + srcChannelLimit = UP_DIV(channel, PACK_NUMBER) * PACK_NUMBER; + } else { + srcPack = 4; + srcChannelLimit = UP_DIV(channel, 4) * 4; + } + } + int dstChannelLimit = channel; + if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + if (dstDevice) { + dstPack = PACK_NUMBER; + dstChannelLimit = UP_DIV(channel, PACK_NUMBER) * PACK_NUMBER; + } else { + dstPack = 4; + dstChannelLimit = UP_DIV(channel, 4) * 4; + } + } + // Compute Stride + _computeStride(srcDimensionFormat, srcStride, batch, plane, channel, srcPack); + _computeStride(dstDimensionFormat, dstStride, batch, plane, channel, dstPack); + + // Compute Offset + for (int i=0; i<offsetNumber; ++i) { … } + } + reg.fuseNumber = offsetNumber; + mCUDARuntime->memcpy(regionGpu, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); + mCUDARuntime->memcpy(offsetGpu, offset, offsetNumber * 8 * sizeof(int), MNNMemcpyHostToDevice, true); +#ifdef MNN_CUDA_COPY_DEBUG + MNN_PRINT("Reg.size: %d - %d - %d\n", reg.size[0], reg.size[1], reg.size[2]); + MNN_PRINT("Reg.srcStride: %d - %d - %d\n", reg.srcStride[0], reg.srcStride[1], reg.srcStride[2]); + MNN_PRINT("Reg.dstStride: %d - %d - %d\n", reg.dstStride[0], reg.dstStride[1], reg.dstStride[2]); + MNN_PRINT("FuseNum: %d\n", reg.fuseNumber); + for (int i=0; i<offsetNumber; ++i) { … } +#endif + convertFunction(dstPtr, srcPtr, regionGpu, offsetGpu, mCUDARuntime.get()); + } while (false); + mStaticBufferPool->free(offsetGpuStorage); + mStaticBufferPool->free(regionStorage); + if 
(!srcDevice) { + mStaticBufferPool->free(tempSrcStorage); } - if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) { - if (srcDimensionFormat != dstDimensionFormat) { - srcTempTensor.reset(new Tensor(dstTensor, dstTensor->getDimensionType(), true)); - MNNCPUCopyBuffer(srcTensor, srcTempTensor.get()); - srcTensor = srcTempTensor.get(); - } - mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), srcTensor->host(), needSize, MNNMemcpyHostToDevice, + if (!dstDevice) { + auto cpuSize = dstTensor->size(); + mCUDARuntime->memcpy(dstTensor->host(), dstPtr, cpuSize, MNNMemcpyDeviceToHost, true); + mStaticBufferPool->free(tempDstStorage); } return; } diff --git a/source/backend/cuda/core/CUDABackend.hpp b/source/backend/cuda/core/CUDABackend.hpp index 86e01a5c..4fb0c50b 100644 --- a/source/backend/cuda/core/CUDABackend.hpp +++ b/source/backend/cuda/core/CUDABackend.hpp @@ -17,6 +17,7 @@ #include "core/Macro.h" #include "core/ConvolutionCommon.hpp" #include "core/BufferAllocator.hpp" +#include "backend/cpu/CPUResizeCache.hpp" namespace MNN { namespace CUDA { class MNN_PUBLIC CUDARuntimeWrapper : public Runtime { @@ -37,11 +38,12 @@ private: std::shared_ptr mBufferPool; std::shared_ptr mCUDARuntime; bool mIsCreateError{false}; + BackendConfig::PrecisionMode mDefaultPrecision; }; class CUDABackend : public Backend { public: - CUDABackend(std::shared_ptr st, std::shared_ptr rt); + CUDABackend(std::shared_ptr st, std::shared_ptr rt, bool useFp16AsFp32); ~CUDABackend(); CUDARuntime *getCUDARuntime(); @@ -74,11 +76,15 @@ public: return mStaticBufferPool.get(); } static size_t realSize(const Tensor *tensor); - + int getBytes(const Tensor* tensor) const; + CPUResizeCache* getCache(); + bool useFp16() const; private: std::shared_ptr mBufferPool; std::shared_ptr mStaticBufferPool; std::shared_ptr mCUDARuntime; + CPUResizeCache mCache; + bool mUseFp16AsFp32 = false; }; template diff --git a/source/backend/cuda/core/runtime/CUDARuntime.cpp b/source/backend/cuda/core/runtime/CUDARuntime.cpp index 10b17da1..72c78fd8 100644 --- a/source/backend/cuda/core/runtime/CUDARuntime.cpp +++ b/source/backend/cuda/core/runtime/CUDARuntime.cpp @@ -15,17 +15,11 @@ #include #include #include "core/Macro.h" +// #define MNN_CUDA_USE_BLAS //#define MNN_OPEN_TIME_TRACE #include #define STR_HELPER(x) #x #define STR(x) STR_HELPER(x) -// #define LOG_VERBOSE -#define CUDNN_VERSION_STR STR(CUDNN_MAJOR) "." STR(CUDNN_MINOR) "." STR(CUDNN_PATCHLEVEL) - -#pragma message "compile with cuda " STR(CUDART_VERSION) " " -#pragma message "compile with cuDNN " CUDNN_VERSION_STR " " - -static_assert(!(CUDNN_MAJOR == 5 && CUDNN_MINOR == 1), "cuDNN 5.1.x series has bugs. Use 5.0.x instead."); #undef STR #undef STR_HELPER @@ -36,7 +30,7 @@ bool CUDARuntime::isCreateError() const { return mIsCreateError; } -CUDARuntime::CUDARuntime(bool permitFloat16, int device_id) { +CUDARuntime::CUDARuntime(int device_id) { #ifdef LOG_VERBOSE MNN_PRINT("start CUDARuntime !\n"); #endif @@ -49,42 +43,39 @@ CUDARuntime::CUDARuntime(bool permitFloat16, int device_id) { mDeviceId = id; cuda_check(cudaGetDeviceProperties(&mProp, id)); MNN_ASSERT(mProp.maxThreadsPerBlock > 0); - +#ifdef MNN_CUDA_USE_BLAS cublas_check(cublasCreate(&mCublasHandle)); - - // Set stream for cuDNN and cublas handles. - - // Note that all cublas scalars (alpha, beta) and scalar results such as dot - // output resides at device side. 
cublas_check(cublasSetPointerMode(mCublasHandle, CUBLAS_POINTER_MODE_HOST)); - cudnn_check(cudnnCreate(&mCudnnHandle)); +#endif } CUDARuntime::~CUDARuntime() { #ifdef LOG_VERBOSE MNN_PRINT("start ~CUDARuntime !\n"); #endif +#ifdef MNN_CUDA_USE_BLAS cublas_check(cublasDestroy(mCublasHandle)); - cudnn_check(cudnnDestroy(mCudnnHandle)); - +#endif #ifdef LOG_VERBOSE MNN_PRINT("end ~CUDARuntime !\n"); #endif } -int CUDARuntime::blocks_num(const int total_threads) { - int maxNum = mProp.maxThreadsPerBlock; - if(total_threads / 32 > maxNum) { - mThreadPerBlock = maxNum; - } else if(total_threads / 16 > maxNum) { - mThreadPerBlock = maxNum / 2; - } else if(total_threads / 8 > maxNum) { - mThreadPerBlock = maxNum / 4; - } else if(total_threads / 4 > maxNum) { - mThreadPerBlock = maxNum / 8; - } else { - mThreadPerBlock = 128; - } +size_t CUDARuntime::blocks_num(const size_t total_threads) { + // size_t maxNum = mProp.maxThreadsPerBlock; + // if(total_threads / 32 > maxNum) { + // mThreadPerBlock = maxNum; + // } else if(total_threads / 16 > maxNum) { + // mThreadPerBlock = maxNum / 2; + // } else if(total_threads / 8 > maxNum) { + // mThreadPerBlock = maxNum / 4; + // } else if(total_threads / 4 > maxNum) { + // mThreadPerBlock = maxNum / 8; + // } else { + // mThreadPerBlock = 128; + // } + + mThreadPerBlock = 128; return (total_threads + mThreadPerBlock - 1) / mThreadPerBlock; } @@ -148,13 +139,4 @@ void CUDARuntime::memcpy(void *dst, const void *src, size_t size_in_bytes, MNNMe void CUDARuntime::memset(void *dst, int value, size_t size_in_bytes) { cuda_check(cudaMemset(dst, value, size_in_bytes)); } - -cublasHandle_t CUDARuntime::cublas_handle() { - return mCublasHandle; -} - -cudnnHandle_t CUDARuntime::cudnn_handle() { - return mCudnnHandle; -} - } // namespace MNN diff --git a/source/backend/cuda/core/runtime/CUDARuntime.hpp b/source/backend/cuda/core/runtime/CUDARuntime.hpp index 1594ed60..f217c031 100644 --- a/source/backend/cuda/core/runtime/CUDARuntime.hpp +++ b/source/backend/cuda/core/runtime/CUDARuntime.hpp @@ -16,19 +16,14 @@ #include #include -#include #include #include -#include #include #include #include #include #include "Type_generated.h" #include "core/Macro.h" -#if CUDA_VERSION >= 10010 -#include -#endif typedef enum { CUDA_FLOAT32 = 0, @@ -49,40 +44,30 @@ typedef enum { } \ } while (0) -#define cublas_check(_x) \ - do { \ - cublasStatus_t _err = (_x); \ - if (_err != CUBLAS_STATUS_SUCCESS) { \ - MNN_CHECK(_err, #_x); \ - } \ - } while (0) - -#define cudnn_check(_x) \ - do { \ - cudnnStatus_t _err = (_x); \ - if (_err != CUDNN_STATUS_SUCCESS) { \ - MNN_CHECK(_err, #_x); \ - } \ - } while (0) - -#define cusolver_check(_x) \ - do { \ - cusolverStatus_t _err = (_x); \ - if (_err != CUSOLVER_STATUS_SUCCESS) { \ - MNN_CHECK(_err, #_x); \ - } \ - } while (0) - #define after_kernel_launch() \ do { \ cuda_check(cudaGetLastError()); \ } while (0) +#ifdef DEBUG +#define checkKernelErrors\ + do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + printf("File:%s Line %d: failed: %s\n", __FILE__, __LINE__,\ + cudaGetErrorString(__err)); \ + abort(); \ + } \ + } while (0) +#else +#define checkKernelErrors +#endif + namespace MNN { class CUDARuntime { public: - CUDARuntime(bool permitFloat16, int device_id); + CUDARuntime(int device_id); ~CUDARuntime(); CUDARuntime(const CUDARuntime &) = delete; CUDARuntime &operator=(const CUDARuntime &) = delete; @@ -105,16 +90,14 @@ public: void memcpy(void *dst, const void *src, size_t size_in_bytes, MNNMemcpyKind_t kind, bool 
sync = false); void memset(void *dst, int value, size_t size_in_bytes); - cublasHandle_t cublas_handle(); - cudnnHandle_t cudnn_handle(); - int threads_num() { + size_t threads_num() { return mThreadPerBlock; } int major_sm() const { return mProp.major; } - int blocks_num(const int total_threads); + size_t blocks_num(const size_t total_threads); const cudaDeviceProp& prop() const { return mProp; } @@ -123,15 +106,12 @@ private: cudaDeviceProp mProp; int mDeviceId; - cublasHandle_t mCublasHandle; - cudnnHandle_t mCudnnHandle; - bool mIsSupportedFP16 = false; bool mSupportDotInt8 = false; bool mSupportDotAccInt8 = false; float mFlops = 4.0f; bool mIsCreateError{false}; - int mThreadPerBlock = 128; + size_t mThreadPerBlock = 128; }; } // namespace MNN diff --git a/source/backend/cuda/execution/BatchMatMulExecution.cu b/source/backend/cuda/execution/BatchMatMulExecution.cu deleted file mode 100644 index dc6d235a..00000000 --- a/source/backend/cuda/execution/BatchMatMulExecution.cu +++ /dev/null @@ -1,119 +0,0 @@ -#include "BatchMatMulExecution.hpp" -namespace MNN { -namespace CUDA { - -template -__global__ void add_bias(T *input, T *output, const T* bias, int batch, int e, int h) { - for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch * e * h; index += blockDim.x * gridDim.x) { - int i = index % (e*h); - int b = index / (e*h); - int y = i % h; - output[index] = input[index] + bias[b * h + y]; - } - return; -} -BatchMatMulExecution::BatchMatMulExecution(bool transposeA, bool transposeB, Backend *backend) : Execution(backend) { - mTransposeA = transposeA; - mTransposeB = transposeB; -} -BatchMatMulExecution::~ BatchMatMulExecution() { - // do nothing -} - -ErrorCode BatchMatMulExecution::onResize(const std::vector &inputs, const std::vector &outputs) { - auto C = outputs[0]; - - auto dimensions = C->dimensions(); - int batch = 1; - for (int i = 0; i < dimensions - 2; ++i) { - batch *= C->length(i); - } - auto e = C->length(dimensions-2); - auto h = C->length(dimensions-1); - if(inputs.size() > 2) { - mTempOutput.reset(Tensor::createDevice({batch*h*e})); - auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC); - if (!res) { - return OUT_OF_MEMORY; - } - backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC); - } - return NO_ERROR; -} - -ErrorCode BatchMatMulExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { - auto runtime = static_cast(backend())->getCUDARuntime(); - auto blasHandle = runtime->cublas_handle(); - const Tensor* A = inputs[0]; - const Tensor* B = inputs[1]; - - auto dimensions = A->dimensions(); - int batch = 1; - for (int i = 0; i < dimensions - 2; ++i) { - batch *= A->length(i); - } - - auto w0 = inputs[0]->length(dimensions-1); - auto h0 = inputs[0]->length(dimensions-2); - auto C = outputs[0]; - - auto e = C->length(dimensions-2); - auto h = C->length(dimensions-1); - auto l = w0; - if (mTransposeA) { - l = h0; - } - auto APtr = (const float*)A->deviceId(); - auto BPtr = (const float*)B->deviceId(); - auto CDestPtr = (float*)C->deviceId(); - - float alpha = 1.0f; - float beta = 0.0f; - - auto tranB = CUBLAS_OP_N; - auto ldB = h; - if (mTransposeB) { - ldB = l; - tranB = CUBLAS_OP_T; - } - auto tranA = CUBLAS_OP_N; - auto ldA = l; - if (mTransposeA) { - ldA = e; - tranA = CUBLAS_OP_T; - } - - // [b, e, l] x [b, l, h] -> [b, e, h] - if(inputs.size() == 2) { - auto status = cublasSgemmStridedBatched(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, l*h, APtr, ldA, e*l, &beta, CDestPtr, h, e*h, batch); - 
cublas_check(status); - //cudaThreadSynchronize(); - - } else { - auto CPtr = (float*)mTempOutput->deviceId(); - auto status = cublasSgemmStridedBatched(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, l*h, APtr, ldA, e*l, &beta, CPtr, h, e*h, batch); - cublas_check(status); - //cudaThreadSynchronize(); - - //add bias: [b, e, h] + [b, h] -> [b, e, h] - int block_num = runtime->blocks_num(batch*e*h); - int threads_num = runtime->threads_num(); - add_bias<<>>((float*)CPtr, (float*)CDestPtr, (const float*)inputs[2]->deviceId(), batch, e, h); - } - - return NO_ERROR; -} - -class BatchMatMulCreator : public CUDABackend::Creator { -public: - virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, - const MNN::Op* op, Backend* backend) const override { - auto param = op->main_as_BatchMatMulParam(); - return new BatchMatMulExecution(param->adjX(), param->adjY(), backend); - } -}; - -static CUDACreatorRegister __init(OpType_BatchMatMul); - -} -} diff --git a/source/backend/cuda/execution/BatchMatMulExecution.hpp b/source/backend/cuda/execution/BatchMatMulExecution.hpp deleted file mode 100644 index d3630d1b..00000000 --- a/source/backend/cuda/execution/BatchMatMulExecution.hpp +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef BatchMatMulExecution_hpp -#define BatchMatMulExecution_hpp -#include -#include "backend/cuda/core/CUDABackend.hpp" -#include "core/Execution.hpp" -namespace MNN { -namespace CUDA { -class BatchMatMulExecution : public Execution { -public: - BatchMatMulExecution(bool transposeA, bool transposeB, Backend *backend); - virtual ~BatchMatMulExecution(); - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - -private: - std::shared_ptr mTempOutput; - bool mTransposeA; - bool mTransposeB; -}; -} // namespace CUDA -} // namespace MNN - -#endif diff --git a/source/backend/cuda/execution/BinaryExecution.cu b/source/backend/cuda/execution/BinaryExecution.cu index 77005f76..8f0ec238 100644 --- a/source/backend/cuda/execution/BinaryExecution.cu +++ b/source/backend/cuda/execution/BinaryExecution.cu @@ -50,11 +50,16 @@ ErrorCode BinaryExecution::onExecute(const std::vector &inputs, const int stride0[3] = {0, 0, s0}; int stride1[3] = {0, 0, s1}; int stride2[3] = {0, 0, 1}; + auto type = outputs[0]->getType(); + if (type.code == halide_type_float) { + // Use Half or float + type.bits = static_cast(backend())->getBytes(inputs[0]) * 8; + } auto computeFunction = [&](Tensor* input0T, Tensor* input1T, Tensor* outputT) { auto input0 = (uint8_t*)input0T->deviceId(); auto input1 = (uint8_t*)input1T->deviceId(); auto output = (uint8_t*)outputT->deviceId(); - BinaryBlit(output, input0, input1, size, stride0, stride1, stride2, outputT->getType(), runtime, mType); + BinaryBlit(output, input0, input1, size, stride0, stride1, stride2, type, runtime, mType); }; computeFunction(inputs[0], inputs[1], outputs[0]); for (int i=2; i +#include "MNNCUDADefine.hpp" +#include "MNNCUDAFunction.cuh" + namespace MNN { namespace CUDA { -struct constBuffer { - int pad[2]; - int kernelSize[2]; - int stride[2]; - int dilate[2]; - int inputSize[2]; - int outputSize[2]; - int channel; - int subChannel; - int total; - int activationType; -} uConstant; +#define PACK_NUMBER_C2 (PACK_NUMBER/2) -ConvDepthWiseExecution::ConvDepthWiseExecution(const Op* op, Backend* bn) : Execution(bn) { +#define MNN_CUDA_HALF2_MAX(a, b) \ + do { \ + (a).x = __hgt((a).x, (b).x) ? 
(a).x : (b).x; \ + (a).y = __hgt((a).y, (b).y) ? (a).y : (b).y; \ + } while (0) + +#define MNN_CUDA_HALF2_MIN(a, b) \ + do { \ + (a).x = __hlt((a).x, (b).x) ? (a).x : (b).x; \ + (a).y = __hlt((a).y, (b).y) ? (a).y : (b).y; \ + } while (0) + + +__global__ void CONV_DW_HALF(const half2* input, const half2* kernel, const half2* bias, half2 *output, const constBuffer* uConstant) { + half2 maxV = half2(uConstant->maxValue, uConstant->maxValue); + half2 minV = half2(uConstant->minValue, uConstant->minValue); + int iw = uConstant->inputSize[0]; + int ih = uConstant->inputSize[1]; + int c = uConstant->channel; + int ow = uConstant->outputSize[0]; + int oh = uConstant->outputSize[1]; + int kw = uConstant->kernelSize[0]; + int kh = uConstant->kernelSize[1]; + int dw = uConstant->dilate[0]; + int dh = uConstant->dilate[1]; + int sw = uConstant->stride[0]; + int sh = uConstant->stride[1]; + int pw = uConstant->pad[0]; + int ph = uConstant->pad[1]; + + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) { + int i = index / PACK_NUMBER_C2; + int zR = index % PACK_NUMBER_C2; + int oz = i / (ow * oh); + int tmp = i % (ow * oh); + int oy = tmp / ow; + int ox = tmp % ow; + int kz = oz / uConstant->batch; + + int ix = ox * sw - pw; + int iy = oy * sh - ph; + half2 color = bias[kz * PACK_NUMBER_C2 + zR]; + int fxSta = max(0, (UP_DIV(-ix, dw))); + int fySta = max(0, (UP_DIV(-iy, dh))); + int fxEnd = min(kw, UP_DIV(iw - ix, dw)); + int fyEnd = min(kh, UP_DIV(ih - iy, dh)); + int fx, fy, fz; + for (fy=fySta; fymaxValue; + float minV = uConstant->minValue; + int iw = uConstant->inputSize[0]; + int ih = uConstant->inputSize[1]; + int c = uConstant->channel; + int ow = uConstant->outputSize[0]; + int oh = uConstant->outputSize[1]; + int kw = uConstant->kernelSize[0]; + int kh = uConstant->kernelSize[1]; + int dw = uConstant->dilate[0]; + int dh = uConstant->dilate[1]; + int sw = uConstant->stride[0]; + int sh = uConstant->stride[1]; + int pw = uConstant->pad[0]; + int ph = uConstant->pad[1]; + + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) { + int i = index / PACK_NUMBER; + int zR = index % PACK_NUMBER; + int oz = i / (ow * oh); + int tmp = i % (ow * oh); + int oy = tmp / ow; + int ox = tmp % ow; + int kz = oz / uConstant->batch; + + int ix = ox * sw - pw; + int iy = oy * sh - ph; + float color = bias[kz * PACK_NUMBER + zR]; + int fxSta = max(0, (UP_DIV(-ix, dw))); + int fySta = max(0, (UP_DIV(-iy, dh))); + int fxEnd = min(kw, UP_DIV(iw - ix, dw)); + int fyEnd = min(kh, UP_DIV(ih - iy, dh)); + int fx, fy, fz; + for (fy=fySta; fymaxValue; + float minV = uConstant->minValue; + int iw = uConstant->inputSize[0]; + int ih = uConstant->inputSize[1]; + int kw = uConstant->kernelSize[0]; + int kh = uConstant->kernelSize[1]; + int sw = uConstant->stride[0]; + int sh = uConstant->stride[1]; + int pw = uConstant->pad[0]; + int ph = uConstant->pad[1]; + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) { + int i = index >> 4; + int zR = index & 15; + int oz, tmp, oy, ox, kz, unuse; + d_owh.divmod(i, oz, tmp); + d_ow.divmod(tmp, oy, ox); + d_ob.divmod(oz, kz, unuse); + + int ix = ox * sw - pw; + int iy = oy * sh - ph; + float color = bias[(kz << 4) + zR]; + int fxSta = max(0, -ix); + int fySta = max(0, -iy); + int fxEnd = min(kw, iw - ix); + int fyEnd = min(kh, ih - iy); + int fx, fy, fz; + for (fy=fySta; fy 
_makeResource(const Op* op, Backend* bn) { + std::shared_ptr<ConvDepthWiseExecution::Resource> res(new ConvDepthWiseExecution::Resource); + auto pool = static_cast<CUDABackend*>(bn)->getStaticBufferPool(); + auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime(); + auto conv = op->main_as_Convolution2D(); + auto convCommon = conv->common(); + int kernelX = convCommon->kernelX(); + int kernelY = convCommon->kernelY(); + int depth = convCommon->outputCount(); + int depthC = UP_DIV(depth, PACK_NUMBER); + res->weightTensor.reset(Tensor::createDevice({kernelX * kernelY * depthC * PACK_NUMBER})); + bool success = bn->onAcquireBuffer(res->weightTensor.get(), Backend::STATIC); + if (!success) { + return nullptr; + } + res->mFilter = (void *)res->weightTensor.get()->buffer().device; + FuseRegion reg; + int offset[8 * PACK_NUMBER]; + auto regionStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion)); + auto offsetGpuStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(offset)); + auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second; + //weight host->device + const float* filterDataPtr = nullptr; + int weightSize = 0; + std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon; + ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize); + auto tempWeightStorage = pool->alloc(weightSize * sizeof(float)); + auto tempWeight = (uint8_t*)tempWeightStorage.first + tempWeightStorage.second; + cuda_check(cudaMemcpy(tempWeight, filterDataPtr, weightSize*sizeof(float), cudaMemcpyHostToDevice)); + reg.size[0] = 1; + reg.size[1] = depthC; + reg.size[2] = kernelX * kernelY; + reg.srcStride[0] = 0; + reg.srcStride[1] = PACK_NUMBER * kernelX * kernelY; + reg.srcStride[2] = 1; + reg.dstStride[0] = 0; + reg.dstStride[1] = kernelX * kernelY * PACK_NUMBER; + reg.dstStride[2] = PACK_NUMBER; + reg.fuseNumber = PACK_NUMBER; + for (int v=0; v<PACK_NUMBER; ++v) { … } + runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); + runtime->memcpy(offsetGpu, offset, 8 * PACK_NUMBER * sizeof(int), MNNMemcpyHostToDevice, true); + FuseRasterBlitFloatToHalf((uint8_t*)res->mFilter, (uint8_t*)tempWeight, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime); + pool->free(tempWeightStorage); + res->biasTensor.reset(Tensor::createDevice({depthC * PACK_NUMBER})); + success = bn->onAcquireBuffer(res->biasTensor.get(), Backend::STATIC); + res->mBias = (void *)res->biasTensor.get()->buffer().device; + if (!success) { + return nullptr; + } + if(conv->bias() != nullptr) { + auto tempBiasStorage = pool->alloc(depth * sizeof(float)); + auto tempBias = (uint8_t*)tempBiasStorage.first + tempBiasStorage.second; + cuda_check(cudaMemcpy(tempBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); + reg.size[0] = 1; + reg.size[1] = 1; + reg.size[2] = depthC * PACK_NUMBER; + reg.srcStride[0] = 0; + reg.srcStride[1] = 0; + reg.srcStride[2] = 1; + reg.dstStride[0] = 0; + reg.dstStride[1] = 0; + reg.dstStride[2] = 1; + offset[0] = 1; + offset[1] = 1; + offset[2] = conv->bias()->size(); + offset[3] = 0; + offset[4] = 1; + offset[5] = 1; + offset[6] = reg.size[2]; + offset[7] = 0; + reg.fuseNumber = 1; + runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); + runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true); + FuseRasterBlitFloatToHalf((uint8_t*)res->mBias, (uint8_t*)tempBias, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
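+ // Filter and bias are converted to channel-packed FP16 once here, at resource-creation time, so per-inference launches can read them directly.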
+ pool->free(tempBiasStorage); + } + static_cast(bn)->getStaticBufferPool()->free(regionStorage); + static_cast(bn)->getStaticBufferPool()->free(offsetGpuStorage); + return res; +} + +ConvDepthWiseExecution::ConvDepthWiseExecution(const Op* op, Backend* bn, std::shared_ptr resource) : Execution(bn) { mOp = op; + mResource = resource; auto pool = static_cast(bn)->getStaticBufferPool(); mConstBuffer = pool->alloc(sizeof(constBuffer)); - - auto conv = mOp->main_as_Convolution2D(); - //weight host->device - if(nullptr != conv->weight()) { - int weightSize = conv->weight()->size(); - weightTensor.reset(Tensor::createDevice({weightSize})); - backend()->onAcquireBuffer(weightTensor.get(), Backend::STATIC); - mFilter = (void *)weightTensor.get()->buffer().device; - cuda_check(cudaMemcpy(mFilter, conv->weight()->data(), conv->weight()->size()*sizeof(float), cudaMemcpyHostToDevice)); - - mBias = nullptr; - if(conv->bias()->size() != 0) { - int biasSize = conv->bias()->size(); - biasTensor.reset(Tensor::createDevice({biasSize})); - backend()->onAcquireBuffer(biasTensor.get(), Backend::STATIC); - mBias = (void *)biasTensor.get()->buffer().device; - cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); - use_bias_ = true; - } - } } ConvDepthWiseExecution::~ ConvDepthWiseExecution() { auto pool = static_cast(backend())->getStaticBufferPool(); pool->free(mConstBuffer); - if (nullptr != weightTensor) { - backend()->onReleaseBuffer(weightTensor.get(), Backend::STATIC); - } - if(use_bias_ && nullptr != biasTensor) { - backend()->onReleaseBuffer(biasTensor.get(), Backend::STATIC); - } } ErrorCode ConvDepthWiseExecution::onResize(const std::vector &inputs, const std::vector &outputs) { auto pad = ConvolutionCommon::convolutionPad(inputs[0], outputs[0], mOp->main_as_Convolution2D()->common()); auto conv = mOp->main_as_Convolution2D(); auto convCommon = mOp->main_as_Convolution2D()->common(); - constBuffer parameters; + int channel = inputs[0]->channel(); + int channelDiv = UP_DIV(channel, PACK_NUMBER); parameters.pad[0] = pad.first; parameters.pad[1] = pad.second; parameters.kernelSize[0] = convCommon->kernelX(); @@ -66,233 +329,82 @@ ErrorCode ConvDepthWiseExecution::onResize(const std::vector &inputs, parameters.dilate[1] = convCommon->dilateY(); parameters.inputSize[0] = inputs[0]->width(); parameters.inputSize[1] = inputs[0]->height(); - parameters.channel = inputs[0]->batch() * inputs[0]->channel(); + parameters.channel = inputs[0]->batch() * channelDiv; parameters.outputSize[0] = outputs[0]->width(); parameters.outputSize[1] = outputs[0]->height(); - parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0]; - parameters.subChannel = inputs[0]->channel(); - parameters.activationType = convCommon->relu() ? 1 : (convCommon->relu6() ? 
2 : 0); + if (static_cast<CUDABackend*>(backend())->useFp16()) { + parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0] * PACK_NUMBER_C2; + } else { + parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0] * PACK_NUMBER; + parameters.minValue = -FLT_MAX; + parameters.maxValue = FLT_MAX; + } + parameters.batch = inputs[0]->batch(); + if (convCommon->relu()) { + parameters.minValue = 0.0f; + } + if (convCommon->relu6()) { + parameters.minValue = 0.0f; + parameters.maxValue = 6.0f; + } auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime(); runtime->memcpy((uint8_t*)mConstBuffer.first + mConstBuffer.second, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice); mTotalCount = parameters.total; - + //printf("%d-%d-%d-%d, %d-%d-%d-%d-%d\n", parameters.kernelSize[0], parameters.kernelSize[1], parameters.stride[0], parameters.stride[1], parameters.inputSize[0], parameters.inputSize[1], channel, parameters.outputSize[0], parameters.outputSize[1]); return NO_ERROR; } -__global__ void CONV_DW(const float* input, const float* kernel, const float* bias, float *output, const constBuffer* uConstant) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < uConstant->total; i += blockDim.x * gridDim.x) { - { - int iw = uConstant->inputSize[0]; - int ih = uConstant->inputSize[1]; - int c = uConstant->channel; - int ow = uConstant->outputSize[0]; - int oh = uConstant->outputSize[1]; - int kw = uConstant->kernelSize[0]; - int kh = uConstant->kernelSize[1]; - int dw = uConstant->dilate[0]; - int dh = uConstant->dilate[1]; - int sw = uConstant->stride[0]; - int sh = uConstant->stride[1]; - int pw = uConstant->pad[0]; - int ph = uConstant->pad[1]; - int acttype = uConstant->activationType; - - int oz = i / (ow * oh); - int tmp = i % (ow * oh); - int oy = tmp / ow; - int ox = tmp % ow; - int kz = oz % uConstant->subChannel; - - int ix = ox * sw - pw; - int iy = oy * sh - ph; - float color = 0.0; - if (bias != nullptr) { - color = bias[kz]; - } - - int fx, fy, fz; - for (fy=0; fy<kh; ++fy) { - int sy = fy*dh + iy; - if (sy >= ih || sy < 0) { - continue; - } - for (fx=0; fx<kw; ++fx) { - int sx = fx*dw + ix; - if (sx >= iw || sx < 0) { - continue; - } - float inputValue = input[0 - + sx - + sy * iw - + oz * iw * ih - ]; - float k = kernel[0 - + fx - + fy * kw - + kz * kw * kh - ]; - color += k*inputValue; - } - } - color = (acttype==1) ? max(0.0, color) : (acttype==2 ? 
(min(max(0.0, color), 6.0)) : color); - output[0 - + ox - + oy * ow - + oz * ow * oh - ] = color; - } - } - return; -} - - ErrorCode ConvDepthWiseExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { auto runtime = static_cast(backend())->getCUDARuntime(); auto& prop = runtime->prop(); - int threads_num = prop.maxThreadsPerBlock; + int limitThreads = UP_DIV(mTotalCount, prop.multiProcessorCount); + int threads_num = ALIMIN(prop.maxThreadsPerBlock, limitThreads); int block_num = prop.multiProcessorCount; auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second; - if (inputs.size() == 1) { - CONV_DW<<>>((const float*)inputs[0]->deviceId(), (const float*)mFilter, - (const float*)mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr)); - } else if (inputs.size() == 3) { - CONV_DW<<>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(), - (const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr); - } else { - MNN_ASSERT(inputs.size() == 2); - CONV_DW<<>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(), - nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr); + if (static_cast(backend())->useFp16()) { + if (inputs.size() == 1) { + CONV_DW_HALF<<>>((const half2*)inputs[0]->deviceId(), (const half2*)mResource->mFilter, + (const half2*)mResource->mBias, (half2*)outputs[0]->deviceId(), (const constBuffer*)(constPtr)); + } + return NO_ERROR; } - return NO_ERROR; -} - - -__global__ void DECONV_DW(const float* input, const float* kernel, const float* bias, float *output, const constBuffer* uConstant) { - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < uConstant->total; i += blockDim.x * gridDim.x) { - { - int iw = uConstant->inputSize[0]; - int ih = uConstant->inputSize[1]; - int c = uConstant->channel; - int ow = uConstant->outputSize[0]; - int oh = uConstant->outputSize[1]; - int kw = uConstant->kernelSize[0]; - int kh = uConstant->kernelSize[1]; - int dw = uConstant->dilate[0]; - int dh = uConstant->dilate[1]; - int sw = uConstant->stride[0]; - int sh = uConstant->stride[1]; - int pw = uConstant->pad[0]; - int ph = uConstant->pad[1]; - - int oz = i / (ow * oh); - int tmp = i % (ow * oh); - int oy = tmp / ow; - int ox = tmp % ow; - int kz = oz % uConstant->subChannel; + if (inputs.size() == 1) { + // block_num = runtime->blocks_num(mTotalCount); + // threads_num = runtime->threads_num(); + if(parameters.dilate[0] == 1 && parameters.dilate[1] == 1) { + const int area = parameters.outputSize[0] * parameters.outputSize[1]; + DivModFast d_owh(area); + DivModFast d_ow(parameters.outputSize[0]); + DivModFast d_ob(outputs[0]->batch()); - int ix = ox + pw; - int iy = oy + ph; - float color = 0.0; - if (bias != nullptr) { - color = bias[kz]; - } - - int fx, fy, fz; - for (fy=0; fy= 0 && y < ih) { - for (int fx=0; fx= 0 && x < iw) { - float inputValue = input[0 - + x - + y * iw - + oz * iw * ih - ]; - float k = kernel[0 - + fx - + fy * kw - + kz * kw * kh - ]; - color += k*inputValue; - } - } - } - } - output[0 - + ox - + oy * ow - + oz * ow * oh - ] = color; + CONV_DW_OPT<<>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter, + (const half*)mResource->mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr), + d_owh, d_ow, d_ob); + } else { + CONV_DW<<>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter, + (const half*)mResource->mBias, (float*)outputs[0]->deviceId(), (const 
constBuffer*)(constPtr)); } } - return; -} - - -ErrorCode DeconvDepthWiseExecution::onResize(const std::vector &inputs, const std::vector &outputs) { - auto convCommon = mOp->main_as_Convolution2D()->common(); - auto pad = ConvolutionCommon::convolutionTransposePad(inputs[0], outputs[0], convCommon); - constBuffer parameters; - parameters.pad[0] = pad.first; - parameters.pad[1] = pad.second; - parameters.kernelSize[0] = convCommon->kernelX(); - parameters.kernelSize[1] = convCommon->kernelY(); - parameters.stride[0] = convCommon->strideX(); - parameters.stride[1] = convCommon->strideY(); - parameters.dilate[0] = convCommon->dilateX(); - parameters.dilate[1] = convCommon->dilateY(); - parameters.inputSize[0] = inputs[0]->width(); - parameters.inputSize[1] = inputs[0]->height(); - parameters.channel = inputs[0]->batch() * inputs[0]->channel(); - parameters.outputSize[0] = outputs[0]->width(); - parameters.outputSize[1] = outputs[0]->height(); - parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0]; - parameters.subChannel = inputs[0]->channel(); - auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second; - - auto runtime = static_cast(backend())->getCUDARuntime(); - runtime->memcpy(constPtr, ¶meters, sizeof(constBuffer), MNNMemcpyHostToDevice); - mTotalCount = parameters.total; return NO_ERROR; } -ErrorCode DeconvDepthWiseExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { - auto runtime = static_cast(backend())->getCUDARuntime(); - int block_num = runtime->blocks_num(mTotalCount); - int threads_num = runtime->threads_num(); - auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second; - if (inputs.size() > 2) { - DECONV_DW<<>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(), - (const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr); - } else { - DECONV_DW<<>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(), - nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr); - } - return NO_ERROR; -} - - class ConvDepthWiseExecutionCreator : public CUDABackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { - if (OpType_ConvolutionDepthwise == op->type()) { - return new ConvDepthWiseExecution(op, backend); - } - if (inputs.size() == 1) { - MNN_PRINT("deconv depthwise not support 1 input yet\n"); + if (inputs.size() > 1) { return nullptr; } - return new DeconvDepthWiseExecution(op, backend); + auto res = _makeResource(op, backend); + if (nullptr == res) { + return nullptr; + } + return new ConvDepthWiseExecution(op, backend, res); } }; static CUDACreatorRegister __init(OpType_ConvolutionDepthwise); -static CUDACreatorRegister __init2(OpType_DeconvolutionDepthwise); } } \ No newline at end of file diff --git a/source/backend/cuda/execution/ConvDepthWiseExecution.hpp b/source/backend/cuda/execution/ConvDepthWiseExecution.hpp index 35ebcbb0..5bce3f72 100644 --- a/source/backend/cuda/execution/ConvDepthWiseExecution.hpp +++ b/source/backend/cuda/execution/ConvDepthWiseExecution.hpp @@ -14,9 +14,30 @@ #include "core/Execution.hpp" namespace MNN { namespace CUDA { + +struct constBuffer { + int pad[2]; + int kernelSize[2]; + int stride[2]; + int dilate[2]; + int inputSize[2]; + int outputSize[2]; + int channel; + int total; + int batch; + float minValue = -65504.0f; + float maxValue = 65504.0f; +} uConstant; + class 
ConvDepthWiseExecution : public Execution { public: - ConvDepthWiseExecution(const Op *op, Backend *bn); + struct Resource { + std::shared_ptr weightTensor; + std::shared_ptr biasTensor; + void* mFilter; + void* mBias; + }; + ConvDepthWiseExecution(const Op *op, Backend *bn, std::shared_ptr resource); virtual ~ConvDepthWiseExecution(); virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; @@ -25,17 +46,13 @@ protected: std::pair mConstBuffer; const Op *mOp; int mTotalCount; - - void* mFilter; - void* mBias; - std::shared_ptr weightTensor; - std::shared_ptr biasTensor; - bool use_bias_=false; + constBuffer parameters; + std::shared_ptr mResource; }; class DeconvDepthWiseExecution : public ConvDepthWiseExecution { public: - DeconvDepthWiseExecution(const Op *op, Backend *bn) : ConvDepthWiseExecution(op, bn) { + DeconvDepthWiseExecution(const Op *op, Backend *bn, std::shared_ptr resource) : ConvDepthWiseExecution(op, bn, resource) { // Do nothing } virtual ~DeconvDepthWiseExecution() { diff --git a/source/backend/cuda/execution/ConvSingleInputExecution.cu b/source/backend/cuda/execution/ConvSingleInputExecution.cu index a022f0cb..1653e787 100644 --- a/source/backend/cuda/execution/ConvSingleInputExecution.cu +++ b/source/backend/cuda/execution/ConvSingleInputExecution.cu @@ -7,55 +7,52 @@ // #include "ConvSingleInputExecution.hpp" +#include "Raster.cuh" +#include "MNNCUDADefine.hpp" +#include "MNNCUDAFunction.cuh" +// 16 / sizeof(int4) namespace MNN { namespace CUDA { -__global__ void Im2Col(const ConvolutionCommon::Im2ColParameter* param, - const MatMulParam* matmulParam, - const float* A, - __half* AP) { - int eAlign = matmulParam->elhPack[0] * MATMULPACK; - int lAlign = matmulParam->elhPack[1] * MATMULPACK; - int maxCount = eAlign * lAlign; - int kernelCount = param->kernelX * param->kernelY; - for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { - int eIndex = index % eAlign; - int lIndex = index / eAlign; - // Compute for dest - int eU = eIndex / MATMULPACK; - int eR = eIndex % MATMULPACK; - int lU = lIndex / MATMULPACK; - int lR = lIndex % MATMULPACK; - auto dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lU * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR; - if (eIndex >= matmulParam->elh[0] || lIndex >= matmulParam->elh[1]) { - AP[dstOffset] = 0.0; +__global__ void KernelReorder(const float* B, half* BP, int kw, int kh, int ic, int oc, int ocPack) { + int icC4 = UP_DIV(ic, PACK_NUMBER); + int kernelCount = kw * kh; + int l = icC4 * kernelCount * PACK_NUMBER; + int h = oc; + int lDiv = UP_DIV(l, MATMULPACK); + int lAlign = lDiv * MATMULPACK; + int hAlign = UP_DIV(h, ocPack) * ocPack; + int maxCount = hAlign * lAlign; + + for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { + int lR = indexO % MATMULPACK; + int tmp = indexO / MATMULPACK; + int hR = tmp % ocPack; + int tmp2 = tmp / ocPack; + int lC = tmp2 % lDiv; + int hC = tmp2 / lDiv; + half* dst = BP + indexO; + int sH = hC * ocPack + hR; + int sL = lC * MATMULPACK + lR; + if (sH >= oc) { + *dst = 0.0; continue; } - // Compute for source - int ox = eIndex % param->ow; - int oy = eIndex / param->ow; - int ob = oy / param->oh; - oy = oy % param->oh; - int sz = lIndex / kernelCount; - int kI = lIndex % kernelCount; - int ksx = kI % param->kernelX; - int ksy = kI / 
param->kernelX; - - int sx = ox * param->strideX + ksx * param->dilateX - param->padX; - int sy = oy * param->strideY + ksy * param->dilateY - param->padY; - if (sx >= 0 && sx < param->iw) { - if (sy >=0 && sy < param->ih) { - __half value = A[sz * param->ih * param->iw + ob * param->iw * param->ih * param->icDiv4 + sy * param->iw + sx]; - AP[dstOffset] = value; - continue; - } + int sLR = sL % PACK_NUMBER; + int sLC = sL / PACK_NUMBER; + int iLC = sLC / (kernelCount); + int ik = sLC % kernelCount; + int iz = iLC * PACK_NUMBER + sLR; + if (iz >= ic) { + *dst = 0.0; + continue; } - AP[dstOffset] = 0.0; + const float* src = B + sH * kernelCount * ic + ik + iz * kernelCount; + *dst = *src; } } - ConvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { mBackend = bn; auto runtime = static_cast(bn)->getCUDARuntime(); @@ -78,40 +75,91 @@ ConvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize); mKernelInfo.kernelN = common->outputCount(); mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY; + int icDiv = UP_DIV(mKernelInfo.kernelC, PACK_NUMBER); MatMulParam param; int e = 0; - int l = mKernelInfo.kernelX * mKernelInfo.kernelY * mKernelInfo.kernelC; + int l = mKernelInfo.kernelX * mKernelInfo.kernelY * icDiv * MATMULPACK; int h = mKernelInfo.kernelN; param.elh[0] = e; param.elh[1] = l; param.elh[2] = h; - param.elhPack[0] = UP_DIV(e, 16); - param.elhPack[1] = UP_DIV(l, 16); - param.elhPack[2] = UP_DIV(h, 16); + param.elhPack[0] = UP_DIV(e, MATMULPACK); + param.elhPack[1] = UP_DIV(l, MATMULPACK); + param.elhPack[2] = UP_DIV(h, MATMULPACK); param.bStride[0] = 0; param.bStride[1] = 1; param.bStride[2] = l; - auto gpuParam = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(MatMulParam)); - auto tempCacheBuffer = static_cast(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float)); - float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second); - runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice); - runtime->memcpy((uint8_t*)gpuParam.first + gpuParam.second, ¶m, sizeof(MatMulParam), MNNMemcpyHostToDevice); + FuseRegion reg; + int maxOffsetNumber = 8; + std::vector offset(maxOffsetNumber); + auto regionStorage = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion)); + auto offsetGpuStorage = static_cast(bn)->getStaticBufferPool()->alloc(sizeof(int) * maxOffsetNumber); + auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second; + // Reorder weight - weightTensor.reset(Tensor::createDevice({param.elhPack[1] * param.elhPack[2] * (MATMULPACK * MATMULPACK)})); - bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC); - mFilter = (void *)weightTensor.get()->buffer().device; - GemmPrepareRerange(runtime, ¶m, (const MatMulParam*)((uint8_t*)gpuParam.first + gpuParam.second), nullptr, nullptr, cacheWeight, (__half*)mFilter); - static_cast(bn)->getStaticBufferPool()->free(tempCacheBuffer); - static_cast(bn)->getStaticBufferPool()->free(gpuParam); + { + auto tempCacheBuffer = static_cast(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float)); + float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second); + runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice); + weightTensor.reset(Tensor::createDevice({param.elhPack[1] * param.elhPack[2] * (MATMULPACK * 
MATMULPACK)})); + bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC); + mFilter = (void *)weightTensor.get()->buffer().device; + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + int threadNumbers = prop.maxThreadsPerBlock; + if (param.elhPack[2] % 2 == 0) { + KernelReorder<<>>((float*)cacheWeight, (half*)mFilter, + mKernelInfo.kernelX, mKernelInfo.kernelY, mKernelInfo.kernelC, mKernelInfo.kernelN, 32); + mUsePack = true; + } else { + KernelReorder<<>>((float*)cacheWeight, (half*)mFilter, + mKernelInfo.kernelX, mKernelInfo.kernelY, mKernelInfo.kernelC, mKernelInfo.kernelN, MATMULPACK); + } + static_cast(bn)->getStaticBufferPool()->free(tempCacheBuffer); + } // Copy Bias int biasSize = conv->bias()->size(); biasTensor.reset(Tensor::createDevice({biasSize})); bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC); + + auto tempBiasStorage = static_cast(bn)->getStaticBufferPool()->alloc(conv->bias()->size()*sizeof(float)); + auto biasTemp = (float*)((uint8_t*)tempBiasStorage.first + tempBiasStorage.second); + cuda_check(cudaMemcpy(biasTemp, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); + + // FP32 -> FP16 mBias = (void *)biasTensor.get()->buffer().device; - cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); + int alignSize = UP_DIV(conv->bias()->size(), PACK_NUMBER) * PACK_NUMBER; + reg.size[0] = 1; + reg.size[1] = 1; + reg.size[2] = alignSize; + reg.srcStride[0] = 0; + reg.srcStride[1] = 0; + reg.srcStride[2] = 1; + reg.dstStride[0] = 0; + reg.dstStride[1] = 0; + reg.dstStride[2] = 1; + offset[0] = 1; + offset[1] = 1; + offset[2] = conv->bias()->size(); + offset[3] = 0; + offset[4] = 1; + offset[5] = 1; + offset[6] = reg.size[2]; + offset[7] = 0; + reg.fuseNumber = 1; + runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, ®, sizeof(FuseRegion), MNNMemcpyHostToDevice, true); + runtime->memcpy(offsetGpu, offset.data(), 8 * sizeof(int), MNNMemcpyHostToDevice, true); + if (static_cast(bn)->useFp16()) { + FuseRasterBlitFloatToHalf((uint8_t*)mBias, (uint8_t*)biasTemp, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime); + } else { + FuseRasterBlitCommon((uint8_t*)mBias, (uint8_t*)biasTemp, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime, 4); + } + static_cast(bn)->getStaticBufferPool()->free(regionStorage); + static_cast(bn)->getStaticBufferPool()->free(offsetGpuStorage); + static_cast(bn)->getStaticBufferPool()->free(tempBiasStorage); } ConvSingleInputExecution::Resource::~Resource() { @@ -146,14 +194,16 @@ bool ConvSingleInputExecution::onClone(Backend* bn, const Op* op, Execution** ds ErrorCode ConvSingleInputExecution::onResize(const std::vector &inputs, const std::vector &outputs) { auto runtime = static_cast(backend())->getCUDARuntime(); auto input = inputs[0], output = outputs[0]; - const int UNIT = 1; + const int UNIT = PACK_NUMBER; auto convCommon = mOp->main_as_Convolution2D()->common(); auto pads = ConvolutionCommon::convolutionPadFull(input, output, mOp->main_as_Convolution2D()->common()); + int ic = input->channel(); + int icDiv = UP_DIV(ic, PACK_NUMBER); mIm2ColParamter.dilateX = convCommon->dilateX(); mIm2ColParamter.dilateY = convCommon->dilateY(); mIm2ColParamter.strideX = convCommon->strideX(); mIm2ColParamter.strideY = convCommon->strideY(); - mIm2ColParamter.icDiv4 = input->channel(); + mIm2ColParamter.icDiv4 = icDiv; mIm2ColParamter.kernelX = 
mIm2ColParamter.kernelY = convCommon->kernelY(); mIm2ColParamter.padX = std::get<0>(pads); @@ -169,21 +219,21 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs, runtime->memcpy((uint8_t*)mGpuIm2ColParam.first + mGpuIm2ColParam.second, &mIm2ColParamter, sizeof(ConvolutionCommon::Im2ColParameter), MNNMemcpyHostToDevice); + //MNN_PRINT("conv size:%d-%d-%d, %d-%d-%d\n", input->height(), input->width(), input->channel(), output->height(), output->width(), output->channel()); int e = output->height() * output->width() * output->batch(); - int l = input->channel() * mIm2ColParamter.kernelX * mIm2ColParamter.kernelY; + int l = icDiv * mIm2ColParamter.kernelX * mIm2ColParamter.kernelY * MATMULPACK; int h = output->channel(); mMatMulParam.elh[0] = e; mMatMulParam.elh[1] = l; mMatMulParam.elh[2] = h; - mMatMulParam.elhPack[0] = UP_DIV(e, 16); - mMatMulParam.elhPack[1] = UP_DIV(l, 16); - mMatMulParam.elhPack[2] = UP_DIV(h, 16); + mMatMulParam.elhPack[0] = UP_DIV(e, MATMULPACK); + mMatMulParam.elhPack[1] = UP_DIV(l, MATMULPACK); + mMatMulParam.elhPack[2] = UP_DIV(h, MATMULPACK); mMatMulParam.cStride[0] = mIm2ColParamter.ow * mIm2ColParamter.oh * h; mMatMulParam.cStride[1] = 1; mMatMulParam.cStride[2] = mIm2ColParamter.ow * mIm2ColParamter.oh; - mMatMulParam.split[0] = 1; - mMatMulParam.split[1] = 1; - mMatMulParam.split[2] = mIm2ColParamter.ow * mIm2ColParamter.oh; + mMatMulParam.minValue = -FLT_MAX; + mMatMulParam.maxValue = FLT_MAX; if (convCommon->relu()) { mMatMulParam.minValue = 0.0f; } @@ -191,12 +241,14 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs, mMatMulParam.minValue = 0.0f; mMatMulParam.maxValue = 6.0f; } + //MNN_PRINT("Im2Col temp size:%d!!!\n\n", mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK); runtime->memcpy((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second, &mMatMulParam, sizeof(MatMulParam), MNNMemcpyHostToDevice); auto pool = static_cast<CUDABackend*>(backend())->getBufferPool(); - auto buffer = pool->alloc(sizeof(__half) * mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK); + auto buffer = pool->alloc((size_t)sizeof(__half) * (size_t)mMatMulParam.elhPack[0] * (size_t)mMatMulParam.elhPack[1] * (size_t)MATMULPACK * (size_t)MATMULPACK); mIm2ColBuffer = (__half*)((uint8_t*)buffer.first + buffer.second); pool->free(buffer); + return NO_ERROR; } @@ -204,21 +256,28 @@ ErrorCode ConvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs //MNN_PRINT("cuda convSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_); MNN_ASSERT(inputs.size() == 1); MNN_ASSERT(outputs.size() == 1); + auto input = inputs[0]; + auto output = outputs[0]; auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime(); + auto bytes = static_cast<CUDABackend*>(backend())->getBytes(input); const void *input_addr = (const void*)inputs[0]->deviceId(); const void *filter_addr = mResource->mFilter; const void *bias_addr = mResource->mBias; - + auto bn = backend(); void *output_addr = (void*)outputs[0]->deviceId(); - auto& prop = runtime->prop(); - int threads_num = prop.maxThreadsPerBlock; - int cores = prop.multiProcessorCount; + auto gpuIm2Col = (const ConvolutionCommon::Im2ColParameter*)((uint8_t*)mGpuIm2ColParam.first + mGpuIm2ColParam.second); auto gpuMatMul = (const MatMulParam*)((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second); - //runtime->memset(mIm2ColBuffer, 0, mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * sizeof(__half) * (MATMULPACK * MATMULPACK));
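+    // Flow below (annotation, inferred from this diff): Im2ColMain packs the
+    // input into 16x16 half tiles sized by elhPack; a tensor-core GEMM then
+    // consumes them, taking the GemmPacked16x32 fast path when the weight
+    // reorder set mUsePack (even output-channel pack count) and falling back
+    // to GemmPackedFullMain otherwise.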
- Im2Col<<<cores, threads_num>>>(gpuIm2Col, gpuMatMul, (const float*)input_addr, mIm2ColBuffer); - GemmPackedMain(runtime, &mMatMulParam, gpuMatMul, (float*)output_addr, (const __half*)mIm2ColBuffer, (const __half*)filter_addr, (const float*)bias_addr); + // Im2Col func + Im2ColMain(runtime, &mMatMulParam, gpuMatMul, &mIm2ColParamter, gpuIm2Col, (const float*)input_addr, mIm2ColBuffer, bytes); + + if (mResource->mUsePack) { + GemmPacked16x32(runtime, &mMatMulParam, gpuMatMul, (float*)output_addr, (const __half*)mIm2ColBuffer, (const __half*)filter_addr, (const half*)bias_addr, bytes); + } else { + //printf("NotPack:%d-%d-%d-%d-%d, %d-%d-%d\n", mIm2ColParamter.icDiv4, mIm2ColParamter.ih, mIm2ColParamter.iw, mIm2ColParamter.oh, mIm2ColParamter.ow, mMatMulParam.elhPack[0], mMatMulParam.elhPack[1], mMatMulParam.elhPack[2]); + GemmPackedFullMain(runtime, &mMatMulParam, gpuMatMul, (float*)output_addr, (const __half*)mIm2ColBuffer, (const __half*)filter_addr, (const half*)bias_addr, bytes); + } return NO_ERROR; } diff --git a/source/backend/cuda/execution/ConvSingleInputExecution.hpp b/source/backend/cuda/execution/ConvSingleInputExecution.hpp index 2e70ce09..52c29aef 100644 --- a/source/backend/cuda/execution/ConvSingleInputExecution.hpp +++ b/source/backend/cuda/execution/ConvSingleInputExecution.hpp @@ -11,7 +11,9 @@ #include "backend/cuda/core/CUDABackend.hpp" #include "core/Execution.hpp" -#include "TensorCoreGemm.cuh" +#include "TensorCoreGemmPacked.cuh" +#include "ImageColumn.cuh" + namespace MNN { namespace CUDA { @@ -40,6 +42,7 @@ public: std::shared_ptr<Tensor> biasTensor; KernelInfo mKernelInfo; Backend* mBackend = nullptr; + bool mUsePack = false; }; ConvSingleInputExecution(Backend* backend, const MNN::Op* op, std::shared_ptr<Resource> res); virtual ~ConvSingleInputExecution(); @@ -58,6 +61,7 @@ private: std::pair<void*, int> mGpuIm2ColParam; __half* mIm2ColBuffer; + std::pair<void*, int> mGpuKernelParam; }; } // namespace CUDA diff --git a/source/backend/cuda/execution/DeconvSingleInputExecution.cu b/source/backend/cuda/execution/DeconvSingleInputExecution.cu index 73d2f98b..c2d87d32 100644 --- a/source/backend/cuda/execution/DeconvSingleInputExecution.cu +++ b/source/backend/cuda/execution/DeconvSingleInputExecution.cu @@ -11,263 +11,302 @@ namespace MNN { namespace CUDA { -template <typename T> -__global__ void cutPad(const size_t size, const T* input, const int old_height, - const int old_width, const int height, const int width, const int pad_top, - const int pad_left, T* output) { - for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) { - int block_num = pos / (width*height); - int left = pos % (width*height); - const int out_w = left % width; - const int out_h = left / width % height; +__global__ void DeconvInputRerange(const int count, + const InputReorderParameter* param, + const float* Inp, + __half* InpRe + ) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) { + int l = param->l_size; + int h = param->h_size; + int lIndex = i % l; + int hIndex = i / l; + int lU = lIndex / 16; + int lR = lIndex % 16; + int hU = hIndex / 16; + int hR = hIndex % 16; - output[pos] = input[(block_num * old_height + out_h + pad_top) * old_width + out_w + pad_left]; + int bIndex = hIndex / param->hw_size; + int hwIndex = hIndex % param->hw_size; + + float value = Inp[bIndex * param->ib_stride + lIndex * param->ic_stride + hwIndex]; + //inpRe[lIndex * param->oc_stride + bIndex * param->ob_stride + hwIndex] = value; +
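+        // Layout note (annotation, inferred from the indexing below): the store
+        // packs value into 16x16 (h, l) tiles laid out h-major: hU selects a row
+        // of lpack_size tiles, lU the tile within it, and inside a tile h gives
+        // the row (hR * 16) and l the column (lR), presumably the operand layout
+        // GemmPackedMain reads for the reranged input. The commented line is the
+        // transposed, l-major alternative.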
+        //__half* dst = InpRe + lU * param->hpack_size * 16 * 16 + hU * 16 * 16 + hR + lR * 16; + __half* dst = InpRe + hU * param->lpack_size * 16 * 16 + lU * 16 * 16 + lR + hR * 16; + dst[0] = value; + } - return; } -DeconvSingleInputExecution::DeconvSingleInputExecution(Backend* backend, const MNN::Op* op) : Execution(backend), mOp(op) { - //MNN_PRINT("cuda DeconvSingleInput onInit in\n"); +template <typename Dtype> +__global__ void Col2Im(const int n, const Dtype* data_col, + const int batch, const int height, const int width, const int channels, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int height_col, const int width_col, + const Dtype* bias, Dtype* data_im) { + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (n); index += blockDim.x * gridDim.x) { + Dtype val = 0; + const int b_im = index / (channels * width * height); + const int chw = index % (channels * width * height); + const int w_im = chw % width + pad_w; + const int h_im = (chw / width) % height + pad_h; + const int c_im = chw / (width * height); + int kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + int kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + // compute the start and end of the output + const int w_col_start = + (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; + const int w_col_end = min(w_im / stride_w + 1, width_col); + const int h_col_start = + (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1; + const int h_col_end = min(h_im / stride_h + 1, height_col); + // TODO: use LCM of stride and dilation to avoid unnecessary loops + for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { + int h_k = (h_im - h_col * stride_h); + int w_k = (w_im - w_col * stride_w); + if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { + h_k /= dilation_h; + w_k /= dilation_w; + int data_col_index = ((((c_im * kernel_h + h_k) * kernel_w + w_k) * batch + b_im) * + height_col + h_col) * width_col + w_col; + val += data_col[data_col_index]; + } + } + } + + if(nullptr != bias) { + val += bias[c_im]; + } + data_im[index] = val; + } +} + + +DeconvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) { + mBackend = bn; + auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime(); + auto conv = op->main_as_Convolution2D(); auto common = conv->common(); - - mKernelInfo.groups = common->group(); mKernelInfo.kernelX = common->kernelX(); mKernelInfo.kernelY = common->kernelY(); - mKernelInfo.padMode = common->padMode(); - mKernelInfo.padX = common->padX(); - mKernelInfo.padY = common->padY(); - - if (nullptr != common->pads()) { - mKernelInfo.padX = common->pads()->data()[1]; - mKernelInfo.padY = common->pads()->data()[0]; - } - pad_left_ = mKernelInfo.padX; - pad_right_ = mKernelInfo.padX; - pad_top_ = mKernelInfo.padY; - pad_bottom_ = mKernelInfo.padY; - + mKernelInfo.groups = common->group(); mKernelInfo.strideX = common->strideX(); mKernelInfo.strideY = common->strideY(); mKernelInfo.dilateX = common->dilateX(); mKernelInfo.dilateY = common->dilateY(); mKernelInfo.activationType = common->relu() ? 1 : (common->relu6() ? 2 : 0);
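+    // Note (annotation, inferred from this diff): the Resource only records
+    // kernel geometry, the reranged weight and the bias; onExecute then runs
+    // DeconvInputRerange (pack input into half tiles), GemmPackedMain with the
+    // weight as the (e = oc * kh * kw, l = ic) operand, and finally the Col2Im
+    // gather above with the bias fused in.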
- use_relu_ = (mKernelInfo.activationType == 1); - use_relu6_ = (mKernelInfo.activationType == 2); - - cudnn_handle_ = nullptr; - input_desc_ = nullptr; - output_desc_ = nullptr; - filter_desc_ = nullptr; - conv_desc_ = nullptr; - padded_desc_ = nullptr; - cudnn_data_type_ = CUDNN_DATA_FLOAT; - cudnn_data_type_len_ = 0; - - auto runtime = static_cast<CUDABackend*>(backend)->getCUDARuntime(); - cudnn_handle_ = runtime->cudnn_handle(); - cudnn_check(cudnnCreateTensorDescriptor(&input_desc_)); - cudnn_check(cudnnCreateTensorDescriptor(&output_desc_)); - cudnn_check(cudnnCreateTensorDescriptor(&padded_desc_)); - cudnn_check(cudnnCreateTensorDescriptor(&bias_desc_)); - cudnn_check(cudnnCreateFilterDescriptor(&filter_desc_)); - cudnn_check(cudnnCreateConvolutionDescriptor(&conv_desc_)); - cudnn_check(cudnnCreateActivationDescriptor(&act_desc_)); - - //weight host->device const float* filterDataPtr = nullptr; int weightSize = 0; std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon; ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize); - weightTensor.reset(Tensor::createDevice<float>({weightSize})); - backend->onAcquireBuffer(weightTensor.get(), Backend::STATIC); + mKernelInfo.kernelN = common->outputCount(); + mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY; + + MatMulParam param; + int e = mKernelInfo.kernelN * mKernelInfo.kernelX * mKernelInfo.kernelY; + int l = mKernelInfo.kernelC; + int h = 0; + param.elh[0] = e; + param.elh[1] = l; + param.elh[2] = h; + param.elhPack[0] = UP_DIV(e, 16); + param.elhPack[1] = UP_DIV(l, 16); + param.elhPack[2] = UP_DIV(h, 16); + + param.aStride[0] = 1; + param.aStride[1] = e; + param.aStride[2] = 0; + + auto gpuParam = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(MatMulParam)); + auto tempCacheBuffer = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float)); + float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second); + runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice); + runtime->memcpy((uint8_t*)gpuParam.first + gpuParam.second, &param, sizeof(MatMulParam), MNNMemcpyHostToDevice); + + // Reorder weight + weightTensor.reset(Tensor::createDevice<int16_t>({param.elhPack[0] * param.elhPack[1] * (MATMULPACK * MATMULPACK)})); + bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC); mFilter = (void *)weightTensor.get()->buffer().device; - cuda_check(cudaMemcpy(mFilter, filterDataPtr, weightSize*sizeof(float), cudaMemcpyHostToDevice)); + GemmPrepareRerange(runtime, &param, (const MatMulParam*)((uint8_t*)gpuParam.first + gpuParam.second), cacheWeight, (__half*)mFilter, nullptr, nullptr, 4); + static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempCacheBuffer); + static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(gpuParam); + // Copy Bias + int biasSize = conv->bias()->size(); + biasTensor.reset(Tensor::createDevice<float>({biasSize})); + bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC); + mBias = (void *)biasTensor.get()->buffer().device; + cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); + +} - if(conv->bias()->size() != 0) { - int biasSize = conv->bias()->size(); - biasTensor.reset(Tensor::createDevice<float>({biasSize})); - backend->onAcquireBuffer(biasTensor.get(), Backend::STATIC); - mBias = (void *)biasTensor.get()->buffer().device; - - cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice)); - - int bias_size = conv->bias()->size(); - 
int dim_bias[] = {1, bias_size, 1, 1}; - int stride_bias[] = {bias_size, 1, 1, 1}; - if(cudnn_data_type_ == CUDNN_DATA_FLOAT) { - cudnn_check(cudnnSetTensorNdDescriptor(bias_desc_, CUDNN_DATA_FLOAT, 4, dim_bias, stride_bias)); - } - else if(cudnn_data_type_ == CUDNN_DATA_HALF) { - cudnn_check(cudnnSetTensorNdDescriptor(bias_desc_, CUDNN_DATA_HALF, 4, dim_bias, stride_bias)); - } else { - MNN_PRINT("only supports fp32/fp16 data type!!!\n"); - } - use_bias_ = true; - } +DeconvSingleInputExecution::Resource::~Resource() { + // Do nothing +} +DeconvSingleInputExecution::DeconvSingleInputExecution(Backend* backend, const MNN::Op* op, std::shared_ptr res) : Execution(backend), mOp(op) { + mResource = res; + auto runtime = static_cast(backend)->getCUDARuntime(); + auto staticPool = static_cast(backend)->getStaticBufferPool(); + mGpuMatMulParam = staticPool->alloc(sizeof(MatMulParam)); + mGpuCol2ImParam = staticPool->alloc(sizeof(Col2ImParameter)); + mGpuInpReorderParam = staticPool->alloc(sizeof(InputReorderParameter)); } DeconvSingleInputExecution::~DeconvSingleInputExecution() { - cudnn_check(cudnnDestroyConvolutionDescriptor(conv_desc_)); - cudnn_check(cudnnDestroyFilterDescriptor(filter_desc_)); - cudnn_check(cudnnDestroyTensorDescriptor(padded_desc_)); - cudnn_check(cudnnDestroyTensorDescriptor(output_desc_)); - cudnn_check(cudnnDestroyTensorDescriptor(input_desc_)); - cudnn_check(cudnnDestroyTensorDescriptor(bias_desc_)); - cudnn_check(cudnnDestroyActivationDescriptor(act_desc_)); + auto staticPool = static_cast(backend())->getStaticBufferPool(); + staticPool->free(mGpuMatMulParam); + staticPool->free(mGpuCol2ImParam); + staticPool->free(mGpuInpReorderParam); +} +bool DeconvSingleInputExecution::onClone(Backend* bn, const Op* op, Execution** dst) { + if (!mValid) { + return false; + } + if (nullptr == dst) { + return true; + } + auto dstExe = new DeconvSingleInputExecution(bn, op, mResource); + *dst = dstExe; + return true; } + ErrorCode DeconvSingleInputExecution::onResize(const std::vector &inputs, const std::vector &outputs) { - // prepare - //MNN_PRINT("cuda DeconvSingleInput onResize in, pad:%d\n", mKernelInfo.padX); + auto runtime = static_cast(backend())->getCUDARuntime(); auto input = inputs[0], output = outputs[0]; + const int UNIT = 1; + auto convCommon = mOp->main_as_Convolution2D()->common(); - mIOInfo.iw = input->width(); - mIOInfo.ih = input->height(); - mIOInfo.ic = input->channel(); - mIOInfo.ib = input->batch(); - - mIOInfo.ow = output->width(); - mIOInfo.oh = output->height(); - mIOInfo.oc = output->channel(); - mIOInfo.ob = output->batch(); + // Input Rerange Param + mInpReorderParameter.hw_size = input->height() * input->width(); + mInpReorderParameter.ic_stride = mInpReorderParameter.hw_size; + mInpReorderParameter.ib_stride = mInpReorderParameter.hw_size * input->channel(); + mInpReorderParameter.oc_stride = mInpReorderParameter.ib_stride; + mInpReorderParameter.ob_stride = mInpReorderParameter.hw_size; + mInpReorderParameter.l_size = input->channel(); + mInpReorderParameter.h_size = input->batch() * mInpReorderParameter.hw_size; + mInpReorderParameter.lpack_size = UP_DIV(mInpReorderParameter.l_size, 16); + mInpReorderParameter.hpack_size = UP_DIV(mInpReorderParameter.h_size, 16); - mKernelInfo.kernelN = output->channel(); - mKernelInfo.kernelC = input->channel() / mKernelInfo.groups; + runtime->memcpy((uint8_t*)mGpuInpReorderParam.first + mGpuInpReorderParam.second, &mInpReorderParameter, sizeof(InputReorderParameter), MNNMemcpyHostToDevice); - std::vector in_shape = 
{mIOInfo.ib, mIOInfo.ic, mIOInfo.ih, mIOInfo.iw}; - std::vector output_shape = {mIOInfo.ob, mIOInfo.oc, mIOInfo.oh, mIOInfo.ow}; - std::vector filter_shape = {mKernelInfo.kernelC, mKernelInfo.kernelN, mKernelInfo.kernelY, mKernelInfo.kernelX};//deconv (ic oc kh kw) - - // printf("filter:%d %d %d %d\n", filter_shape[0], filter_shape[1], filter_shape[2], filter_shape[3]); - // printf("input:%d %d %d %d\n", in_shape[0], in_shape[1], in_shape[2], in_shape[3]); - // printf("output:%d %d %d %d\n", output_shape[0], output_shape[1], output_shape[2], output_shape[3]); - cudnn_check(cudnnSetTensor4dDescriptor(input_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, in_shape[0], - in_shape[1], in_shape[2], in_shape[3])); - - cudnn_check(cudnnSetFilter4dDescriptor(filter_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, filter_shape[0], - filter_shape[1], filter_shape[2], filter_shape[3])); - cudnn_check(cudnnSetTensor4dDescriptor(output_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, output_shape[0], - output_shape[1], output_shape[2], output_shape[3])); + // Col2Im Param + auto pad = ConvolutionCommon::convolutionTransposePad(input, output, mOp->main_as_Convolution2D()->common()); + mCol2ImParamter.dilateX = convCommon->dilateX(); + mCol2ImParamter.dilateY = convCommon->dilateY(); + mCol2ImParamter.strideX = convCommon->strideX(); + mCol2ImParamter.strideY = convCommon->strideY(); + mCol2ImParamter.ic = input->channel(); + mCol2ImParamter.oc = output->channel(); + mCol2ImParamter.kernelX = convCommon->kernelX(); + mCol2ImParamter.kernelY = convCommon->kernelY(); + mCol2ImParamter.padX = pad.first; + mCol2ImParamter.padY = pad.second; + + mCol2ImParamter.ih = input->height(); + mCol2ImParamter.iw = input->width(); + mCol2ImParamter.oh = output->height(); + mCol2ImParamter.ow = output->width(); + mCol2ImParamter.ob = output->batch(); + + runtime->memcpy((uint8_t*)mGpuCol2ImParam.first + mGpuCol2ImParam.second, &mCol2ImParamter, sizeof(Col2ImParameter), MNNMemcpyHostToDevice); + + // Matmul Param + int e = output->channel() * mCol2ImParamter.kernelX * mCol2ImParamter.kernelY; + int l = input->channel(); + int h = input->height() * input->width() * output->batch(); + + mMatMulParam.elh[0] = e; + mMatMulParam.elh[1] = l; + mMatMulParam.elh[2] = h; + mMatMulParam.elhPack[0] = UP_DIV(e, 16); + mMatMulParam.elhPack[1] = UP_DIV(l, 16); + mMatMulParam.elhPack[2] = UP_DIV(h, 16); + + mMatMulParam.bStride[0] = 0; + mMatMulParam.bStride[1] = input->height() * input->width(); + mMatMulParam.bStride[2] = 1; + + mMatMulParam.cStride[0] = h; + mMatMulParam.cStride[1] = 1; + mMatMulParam.cStride[2] = 1; + if (convCommon->relu()) { + mMatMulParam.minValue = 0.0f; + } + if (convCommon->relu6()) { + mMatMulParam.minValue = 0.0f; + mMatMulParam.maxValue = 6.0f; + } + runtime->memcpy((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second, &mMatMulParam, sizeof(MatMulParam), MNNMemcpyHostToDevice); + // Alloc temp cuda memory + auto pool = static_cast(backend())->getBufferPool(); + auto buffer1 = pool->alloc(sizeof(float) * mMatMulParam.elh[0] * mMatMulParam.elh[2]); + auto buffer2 = pool->alloc(sizeof(__half) * mMatMulParam.elhPack[1] * mMatMulParam.elhPack[2] * MATMULPACK * MATMULPACK); - cudnnTensorDescriptor_t input_descriptor_real = nullptr; + mIm2ColBuffer = (float*)((uint8_t*)buffer1.first + buffer1.second); + mInputBuffer = (__half*)((uint8_t*)buffer2.first + buffer2.second); - if (mKernelInfo.padMode == PadMode_SAME) { - int kernelWidthSize = (mKernelInfo.kernelX - 1) * mKernelInfo.dilateX + 1; - int kernelHeightSize = 
(mKernelInfo.kernelY - 1) * mKernelInfo.dilateY + 1; - int pw = (mIOInfo.iw - 1) * mKernelInfo.strideX + kernelWidthSize - mIOInfo.ow; - int ph = (mIOInfo.ih - 1) * mKernelInfo.strideY + kernelHeightSize - mIOInfo.oh; - pad_left_ = pw/2; - pad_right_ = pw - pad_left_; - pad_top_ = ph/2; - pad_bottom_ = ph - pad_top_; - } + pool->free(buffer2); + pool->free(buffer1); - use_pad_ = (pad_left_!=0 || pad_right_!=0 || pad_top_!=0 || pad_bottom_!=0 ) ? true : false; - - if(use_pad_) { - int totalSize = output_shape[0]*output_shape[1]*(output_shape[2]+pad_top_+pad_bottom_)*(output_shape[3]+pad_left_+pad_right_); - padTensor.reset(Tensor::createDevice({totalSize})); - backend()->onAcquireBuffer(padTensor.get(), Backend::DYNAMIC); - mPadPtr = (void *)padTensor.get()->buffer().device; - - //dynamic memory release - backend()->onReleaseBuffer(padTensor.get(), Backend::DYNAMIC); - - cudnn_check(cudnnSetTensor4dDescriptor(padded_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, output_shape[0], output_shape[1], - output_shape[2] + +pad_top_+pad_bottom_, output_shape[3] + pad_left_+pad_right_)); - } - input_descriptor_real = use_pad_ ? padded_desc_ : input_desc_; - - cudnn_check(cudnnSetConvolution2dDescriptor(conv_desc_, 0, 0, mKernelInfo.strideY, mKernelInfo.strideX, - mKernelInfo.dilateY, mKernelInfo.dilateX, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT)); - if (cudnn_data_type_ == CUDNN_DATA_HALF) { - cudnn_check(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH)); - } - //set group num - cudnn_check(cudnnSetConvolutionGroupCount(conv_desc_, mKernelInfo.groups)); - - // algorithm - constexpr int requested_algo_count = 1; - int returned_algo_count; - cudnnConvolutionBwdDataAlgoPerf_t perf_results; - cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnn_handle_, filter_desc_, input_descriptor_real, conv_desc_, - output_desc_, requested_algo_count, &returned_algo_count, &perf_results)); - conv_bwd_algo_ = perf_results.algo; - - // workspace - cudnn_check(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle_, filter_desc_, input_descriptor_real, conv_desc_, output_desc_, - conv_bwd_algo_, &workspace_size_)); - - if (workspace_size_ != 0) { - int workspaceSize = workspace_size_; - workspaceTensor.reset(Tensor::createDevice({workspaceSize})); - //cudnn not support workspace memory reuse - backend()->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC); - mWorkSpace = (void *)workspaceTensor.get()->buffer().device; - } - - if(use_relu_) { - cudnn_check(cudnnSetActivationDescriptor(act_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0)); - } else if(use_relu6_) { - cudnn_check(cudnnSetActivationDescriptor(act_desc_, CUDNN_ACTIVATION_CLIPPED_RELU, CUDNN_NOT_PROPAGATE_NAN, 6.0)); - } else { - //do nothing - } - //MNN_PRINT("cuda DeconvSingleInput onResize out\n"); return NO_ERROR; } ErrorCode DeconvSingleInputExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { - //MNN_PRINT("cuda DeconvSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_); - + //MNN_PRINT("cuda convSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_); MNN_ASSERT(inputs.size() == 1); MNN_ASSERT(outputs.size() == 1); + auto bytes = static_cast(backend())->getBytes(inputs[0]); auto runtime = static_cast(backend())->getCUDARuntime(); const void *input_addr = (const void*)inputs[0]->deviceId(); - const void *filter_addr = mFilter; - const void *bias_addr = mBias; - + const void *filter_addr = mResource->mFilter; + const void *bias_addr = 
mResource->mBias; void *output_addr = (void*)outputs[0]->deviceId(); - void *workspace_addr = nullptr; - if (workspace_size_ != 0) { - workspace_addr = mWorkSpace; - } - const float alpha = 1; - const float beta = 0; + auto gpuInpReorder = (const InputReorderParameter*)((uint8_t*)mGpuInpReorderParam.first + mGpuInpReorderParam.second); + auto gpuCol2Im = (const Col2ImParameter*)((uint8_t*)mGpuCol2ImParam.first + mGpuCol2ImParam.second); + auto gpuMatMul = (const MatMulParam*)((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second); + const int rerangeCount = mInpReorderParameter.ib_stride * inputs[0]->batch(); + int inp_block_num = runtime->blocks_num(rerangeCount); + int inp_thread_num = runtime->threads_num(); - if(use_pad_) { - cudnn_check(cudnnConvolutionBackwardData(cudnn_handle_, &alpha, filter_desc_, filter_addr, input_desc_, input_addr, conv_desc_, - conv_bwd_algo_, workspace_addr, workspace_size_, &beta, padded_desc_, mPadPtr)); + // Do input Rerange + runtime->memset(mInputBuffer, 0, mMatMulParam.elhPack[2] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK * sizeof(__half)); + DeconvInputRerange<<<inp_block_num, inp_thread_num>>>(rerangeCount, gpuInpReorder, (const float*)input_addr, mInputBuffer); - std::vector<int> out_shape = {mIOInfo.ob, mIOInfo.oc, mIOInfo.oh, mIOInfo.ow}; + // Do Gemm operation + GemmPackedMain(runtime, &mMatMulParam, gpuMatMul, (float*)mIm2ColBuffer, (const half*)filter_addr, (const half*)mInputBuffer, nullptr, bytes, false, false); - int size = out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]; - int block_num = runtime->blocks_num(size); - int threads_num = runtime->threads_num(); + // Do Col2Im trans + int height_col = mCol2ImParamter.ih; + int width_col = mCol2ImParamter.iw; + int num_kernels = mCol2ImParamter.ob * mCol2ImParamter.oc * mCol2ImParamter.oh * mCol2ImParamter.ow; - cutPad<<<block_num, threads_num>>>(size, (float*)mPadPtr, out_shape[2]+pad_top_+pad_bottom_, out_shape[3]+pad_left_+pad_right_, - out_shape[2], out_shape[3], pad_top_, pad_left_, (float*)output_addr); - } - else { - cudnn_check(cudnnConvolutionBackwardData(cudnn_handle_, &alpha, filter_desc_, filter_addr, input_desc_, input_addr, conv_desc_, - conv_bwd_algo_, workspace_addr, workspace_size_, &beta, output_desc_, output_addr)); - } + int col2im_block_num = runtime->blocks_num(num_kernels); + int col2im_thread_num = runtime->threads_num(); + + // printf("col2im:%d, %d-%d-%d-%d-%d-%d\n %d-%d-%d-%d-%d-%d\n %d-%d\n", mCol2ImParamter.ob, mCol2ImParamter.oh, mCol2ImParamter.ow, mCol2ImParamter.oc, \ + // mCol2ImParamter.ih, mCol2ImParamter.iw, mCol2ImParamter.ic, \ + // mCol2ImParamter.padX, mCol2ImParamter.padY, mCol2ImParamter.kernelX, mCol2ImParamter.kernelY, mCol2ImParamter.strideX, mCol2ImParamter.strideY, \ + // col2im_block_num, col2im_thread_num); + + Col2Im<<<col2im_block_num, col2im_thread_num>>>( + num_kernels, (const float*)mIm2ColBuffer, mCol2ImParamter.ob, mCol2ImParamter.oh, mCol2ImParamter.ow, mCol2ImParamter.oc, + mCol2ImParamter.kernelY, mCol2ImParamter.kernelX, mCol2ImParamter.padY, mCol2ImParamter.padX, + mCol2ImParamter.strideY, mCol2ImParamter.strideX, mCol2ImParamter.dilateY, mCol2ImParamter.dilateX, + height_col, width_col, (const float*)bias_addr, (float *)output_addr); - if(use_bias_) { - cudnn_check(cudnnAddTensor(cudnn_handle_, &alpha, bias_desc_, bias_addr, &alpha, output_desc_, output_addr)); - } - if(use_relu_ || use_relu6_) { - cudnn_check(cudnnActivationForward(cudnn_handle_, act_desc_, &alpha, output_desc_, output_addr, &beta, output_desc_, output_addr)); - } return NO_ERROR; } @@ -287,7 +326,8 @@ public: MNN_PRINT("Deconv inputs 
size:3 not support\n"); return nullptr; } else if(inputs.size() == 1) { - return new DeconvSingleInputExecution(backend, op); + std::shared_ptr resource(new DeconvSingleInputExecution::Resource(backend, op)); + return new DeconvSingleInputExecution(backend, op, resource); } else { MNN_PRINT("Deconv inputs size:%d not support", (int)inputs.size()); return nullptr; @@ -295,7 +335,7 @@ public: } }; -CUDACreatorRegister __DeConvExecution(OpType_Deconvolution); +//CUDACreatorRegister __DeConvExecution(OpType_Deconvolution); }// namespace CUDA }// namespace MNN diff --git a/source/backend/cuda/execution/DeconvSingleInputExecution.hpp b/source/backend/cuda/execution/DeconvSingleInputExecution.hpp index f20ef02f..dec1b951 100644 --- a/source/backend/cuda/execution/DeconvSingleInputExecution.hpp +++ b/source/backend/cuda/execution/DeconvSingleInputExecution.hpp @@ -11,7 +11,7 @@ #include "backend/cuda/core/CUDABackend.hpp" #include "core/Execution.hpp" -#include "half.hpp" +#include "TensorCoreGemm.cuh" namespace MNN { namespace CUDA { @@ -26,9 +26,6 @@ struct KernelInfo { int kernelC = 0; int kernelX = 0; int kernelY = 0; - PadMode padMode = PadMode_CAFFE; - int padX = 0; - int padY = 0; int strideX = 0; int strideY = 0; int dilateX = 0; @@ -36,59 +33,71 @@ struct KernelInfo { int activationType = 0; };// +struct Col2ImParameter { + int padX; + int padY; + int dilateX; + int dilateY; + int strideX; + int strideY; + int kernelX; + int kernelY; + int oc; + int ic; + int iw; + int ih; + int ow; + int oh; + int ob; +}; + +struct InputReorderParameter { + int ic_stride; + int ib_stride; + int oc_stride; + int ob_stride; + int hw_size; + int l_size; + int h_size; + int lpack_size; + int hpack_size; +}; + + extern "C" class DeconvSingleInputExecution : public Execution { public: - DeconvSingleInputExecution(Backend* backend, const MNN::Op* op); + struct Resource { + Resource(Backend* bn, const MNN::Op* op); + ~ Resource(); + void* mFilter; + void* mBias; + std::shared_ptr weightTensor; + std::shared_ptr biasTensor; + KernelInfo mKernelInfo; + Backend* mBackend = nullptr; + }; + DeconvSingleInputExecution(Backend* backend, const MNN::Op* op, std::shared_ptr res); virtual ~DeconvSingleInputExecution(); virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; private: - cudnnHandle_t cudnn_handle_; - cudnnTensorDescriptor_t input_desc_; - cudnnTensorDescriptor_t output_desc_; - cudnnFilterDescriptor_t filter_desc_; - cudnnConvolutionBwdDataAlgo_t conv_bwd_algo_; - cudnnConvolutionDescriptor_t conv_desc_; - cudnnTensorDescriptor_t bias_desc_; - cudnnTensorDescriptor_t padded_desc_; - cudnnActivationDescriptor_t act_desc_; + std::shared_ptr mResource; - cudnnDataType_t cudnn_data_type_; - int cudnn_data_type_len_; - bool use_pad_ = false; - int pad_top_ = 0; - int pad_bottom_ = 0; - int pad_left_ = 0; - int pad_right_ = 0; + const Op* mOp = nullptr; + MatMulParam mMatMulParam; + std::pair mGpuMatMulParam; - bool use_bias_ = false; - bool use_relu_ = false; - bool use_relu6_ = false; + Col2ImParameter mCol2ImParamter; + std::pair mGpuCol2ImParam; - void* mPadPtr; - void* mFilter; - void* mBias; - void* mWorkSpace; - std::shared_ptr weightTensor; - std::shared_ptr biasTensor; - std::shared_ptr padTensor; - std::shared_ptr workspaceTensor; + InputReorderParameter mInpReorderParameter; + std::pair 
mGpuInpReorderParam; - std::shared_ptr mPad; - std::shared_ptr mWorkspaceForward; - - size_t input_size_; - size_t filter_size_; - size_t output_size_; - size_t padded_size_; - size_t workspace_size_; - - const MNN::Op* mOp; - KernelInfo mKernelInfo; - IOInfo mIOInfo; - std::shared_ptr mTempInput; + float* mIm2ColBuffer; + __half* mInputBuffer; }; } // namespace CUDA diff --git a/source/backend/cuda/execution/ImageColumn.cu b/source/backend/cuda/execution/ImageColumn.cu new file mode 100644 index 00000000..a50b22bc --- /dev/null +++ b/source/backend/cuda/execution/ImageColumn.cu @@ -0,0 +1,705 @@ +#include "ImageColumn.cuh" +#include "MNNCUDADefine.hpp" +#include "MNNCUDAFunction.cuh" +#include "Raster.cuh" + +#define BLOCK_INT4 2 + +namespace MNN { +namespace CUDA { + +__global__ void Im2Col1x1(const ConvolutionCommon::Im2ColParameter* param, + const MatMulParam* matmulParam, + const float* A, + half* AP, + DivModFast eAlignD, + DivModFast owD, + DivModFast ohD + ) { + int eAlign = matmulParam->elhPack[0] * MATMULPACK; + int lAlign = matmulParam->elhPack[1]; + int maxCount = eAlign * lAlign * BLOCK_INT4; + int kernelCount = 1; + for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { + int index = indexO >> 1; + int lR = indexO & 1; + int eIndex, lIndex; + eAlignD.divmod(index, lIndex, eIndex); + int eU = eIndex >> 4; + int eR = eIndex & 15; + int dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lIndex * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR * 8; + int4* dst = (int4*)(AP + dstOffset); + if (eIndex >= matmulParam->elh[0]) { + *dst = {0, 0, 0, 0}; + continue; + } + // Compute for source + int ox, oy, ob; + owD.divmod(eIndex, oy, ox); + ohD.divmod(oy, ob, oy); + int sz = lIndex; + int sx = ox * param->strideX - param->padX; + int sy = oy * param->strideY - param->padY; + if (sx >= 0 && sx < param->iw) { + if (sy >=0 && sy < param->ih) { + int offset = sz * param->srcZStep + (ob * param->iw * param->ih + sy * param->iw + sx) * PACK_NUMBER + lR * 8; + float2* srcF = (float2*)(A + offset); + half2* dstH = (half2*)dst; + dstH[0] = __float22half2_rn(srcF[0]); + dstH[1] = __float22half2_rn(srcF[1]); + dstH[2] = __float22half2_rn(srcF[2]); + dstH[3] = __float22half2_rn(srcF[3]); + continue; + } + } + *dst = {0, 0, 0, 0}; + } +} + +__global__ void Im2Col1x1_OPT(const ConvolutionCommon::Im2ColParameter* param, + const MatMulParam* matmulParam, + const int maxCount, + const float* A, + half* AP, + DivModFast eAlignD, + DivModFast owD, + DivModFast ohD + ) { + for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { + int index = indexO >> 3; + int lR = indexO & 7; + int eIndex, lIndex; + eAlignD.divmod(index, lIndex, eIndex); + int eU = eIndex >> 4; + int eR = eIndex & 15; + int dstOffset = ((eU * matmulParam->elhPack[1] + lIndex) << 8) + (eR << 4) + (lR << 1); + + int offset = lIndex * param->srcZStep + (eIndex << 4) + (lR << 1); + float2* srcF = (float2*)(A + offset); + half2* dstH = (half2*)(AP + dstOffset); + dstH[0] = __float22half2_rn(srcF[0]); + } +} + +__global__ void Im2Col(const ConvolutionCommon::Im2ColParameter* param, + const MatMulParam* matmulParam, + const float* A, + half* AP) { + int eAlign = matmulParam->elhPack[0] * MATMULPACK; + int lAlign = matmulParam->elhPack[1]; + int maxCount = eAlign * lAlign * BLOCK_INT4; + int kernelCount = param->kernelX * param->kernelY; + for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < 
maxCount; indexO += blockDim.x * gridDim.x) { + int index = indexO / BLOCK_INT4; + int lR = indexO % BLOCK_INT4; + int eIndex = index % eAlign; + int lIndex = index / eAlign; + int eU = eIndex / MATMULPACK; + int eR = eIndex % MATMULPACK; + int dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lIndex * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR * 8; + int4* dst = (int4*)(AP + dstOffset); + if (eIndex >= matmulParam->elh[0]) { + *dst = {0, 0, 0, 0}; + continue; + } + // Compute for source + int ox = eIndex % param->ow; + int oy = eIndex / param->ow; + int ob = oy / param->oh; + oy = oy % param->oh; + int sz = lIndex / kernelCount; + int kI = lIndex % kernelCount; + int ksx = kI % param->kernelX; + int ksy = kI / param->kernelX; + + int sx = ox * param->strideX + ksx * param->dilateX - param->padX; + int sy = oy * param->strideY + ksy * param->dilateY - param->padY; + if (sx >= 0 && sx < param->iw) { + if (sy >=0 && sy < param->ih) { + int offset = sz * param->srcZStep + (ob * param->iw * param->ih + sy * param->iw + sx) * PACK_NUMBER + lR * 8; + float2* srcF = (float2*)(A + offset); + half2* dstH = (half2*)dst; + dstH[0] = __float22half2_rn(srcF[0]); + dstH[1] = __float22half2_rn(srcF[1]); + dstH[2] = __float22half2_rn(srcF[2]); + dstH[3] = __float22half2_rn(srcF[3]); + continue; + } + } + *dst = {0, 0, 0, 0}; + } +} + +__global__ void Im2Col1x1_half(const ConvolutionCommon::Im2ColParameter* param, + const MatMulParam* matmulParam, + const half* A, + half* AP, + DivModFast eAlignD, + DivModFast owD, + DivModFast ohD + ) { +int eAlign = matmulParam->elhPack[0] * MATMULPACK; +int lAlign = matmulParam->elhPack[1]; +int maxCount = eAlign * lAlign * BLOCK_INT4; +int kernelCount = 1; +for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { + int index = indexO / BLOCK_INT4; + int lR = indexO % BLOCK_INT4; + int eIndex, lIndex; + eAlignD.divmod(index, lIndex, eIndex); + int eU = eIndex / MATMULPACK; + int eR = eIndex % MATMULPACK; + int dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lIndex * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR * 8; + int4* dst = (int4*)(AP + dstOffset); + if (eIndex >= matmulParam->elh[0]) { + *dst = {0, 0, 0, 0}; + continue; + } + // Compute for source + int ox, oy, ob; + owD.divmod(eIndex, oy, ox); + ohD.divmod(oy, ob, oy); + int sz = lIndex; + int sx = ox * param->strideX - param->padX; + int sy = oy * param->strideY - param->padY; + if (sx >= 0 && sx < param->iw) { + if (sy >=0 && sy < param->ih) { + int offset = sz * param->srcZStep + (ob * param->iw * param->ih + sy * param->iw + sx) * PACK_NUMBER + lR * 8; + int4* src = (int4*)(A + offset); + *dst = *src; + continue; + } + } + *dst = {0, 0, 0, 0}; +} +} + +__global__ void Im2Col1x1_half_OPT(const ConvolutionCommon::Im2ColParameter* param, +const MatMulParam* matmulParam, +const int maxCount, +const half* A, +half* AP, +DivModFast eAlignD, +DivModFast owD, +DivModFast ohD +) { +for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { + int index = indexO >> 3; + int lR = indexO & 7; + int eIndex, lIndex; + eAlignD.divmod(index, lIndex, eIndex); + int eU = eIndex >> 4; + int eR = eIndex & 15; + int dstOffset = ((eU * matmulParam->elhPack[1] + lIndex) << 8) + (eR << 4) + (lR << 1); + + int offset = lIndex * param->srcZStep + (eIndex << 4) + (lR << 1); + int* srcF = (int*)(A + offset); + int* dstH = (int*)(AP + dstOffset); + dstH[0] = srcF[0]; +} +} 
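+// Layout note (annotation, derived from the index math in this file): AP is a
+// grid of elhPack[0] x elhPack[1] tiles of 16x16 halves, addressed as
+// AP[((eU * elhPack[1] + lIndex) * 16 + eR) * 16 + lane] with eU = eIndex / 16,
+// eR = eIndex % 16 and lane the vector offset within the tile row. Threads
+// store vectorized slices of one row (int4 = 8 halves, int2 = 4 halves,
+// int = 2 halves); the generic paths zero-fill rows past elh[0] and padded
+// pixels, while the _OPT variants assume 16-aligned e and zero padding.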
+ +__global__ void Im2Col_half(const ConvolutionCommon::Im2ColParameter* param, + const MatMulParam* matmulParam, + const int maxCount, + const half* A, + half* AP, + DivModFast d_eA, + DivModFast d_ow, + DivModFast d_oh, + DivModFast d_fxy, + DivModFast d_fx + ) { +int eAlign = matmulParam->elhPack[0] << 4; +int lAlign = matmulParam->elhPack[1]; +int kernelCount = param->kernelX * param->kernelY; +for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { + size_t index = indexO >> 1; + size_t lR = indexO & 1; + int eIndex, lIndex; + d_eA.divmod(index, lIndex, eIndex); + size_t eU = eIndex >> 4; + size_t eR = eIndex & 15; + size_t dstOffset = ((((eU * matmulParam->elhPack[1] + lIndex) << 4) + eR) << 4) + (lR << 3); + int4* dst = (int4*)(AP + dstOffset); + if (eIndex >= matmulParam->elh[0]) { + *dst = {0, 0, 0, 0}; + continue; + } + // Compute for source + int ox, oby, ob, oy, sz, kI, ksx, ksy; + d_ow.divmod(eIndex, oby, ox); + d_oh.divmod(oby, ob, oy); + d_fxy.divmod(lIndex, sz, kI); + d_fx.divmod(kI, ksy, ksx); + + size_t sx = ox * param->strideX + ksx * param->dilateX - param->padX; + size_t sy = oy * param->strideY + ksy * param->dilateY - param->padY; + if (sx >= 0 && sx < param->iw) { + if (sy >=0 && sy < param->ih) { + size_t offset = sz * param->srcZStep + (((ob * param->ih + sy) * param->iw + sx) << 4) + lR * 8; + int4* src = (int4*)(A + offset); + *dst = *src; + continue; + } + } + *dst = {0, 0, 0, 0}; +} +} + +__global__ void Im2Col_half_OPT(const ConvolutionCommon::Im2ColParameter* param, + const MatMulParam* matmulParam, + const size_t maxCount, + const half* A, + half* AP, + DivModFast d_eA, + DivModFast d_ow, + DivModFast d_oh, + DivModFast d_fxy, + DivModFast d_fx +) { +size_t eAlign = matmulParam->elhPack[0] << 4; +size_t lAlign = matmulParam->elhPack[1]; +size_t kernelCount = param->kernelX * param->kernelY; +for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { + size_t index = indexO >> 2; + size_t lR = indexO & 3; + int eIndex, lIndex; + d_eA.divmod(index, lIndex, eIndex); + size_t eU = eIndex >> 4; + size_t eR = eIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex) << 4) + eR) << 4) + (lR << 2); + int2* dst = (int2*)(AP + dstOffset); + if (eIndex >= matmulParam->elh[0]) { + *dst = {0, 0}; + continue; + } + + // Compute for source + int ox, oby, ob, oy, sz, kI, ksx, ksy; + d_ow.divmod(eIndex, oby, ox); + d_oh.divmod(oby, ob, oy); + d_fxy.divmod(lIndex, sz, kI); + d_fx.divmod(kI, ksy, ksx); + + size_t sx = ox * param->strideX + ksx * param->dilateX - param->padX; + size_t sy = oy * param->strideY + ksy * param->dilateY - param->padY; + if (sx >= 0 && sx < param->iw) { + if (sy >=0 && sy < param->ih) { + size_t offset = sz * param->srcZStep + (((ob * param->ih + sy) * param->iw + sx) << 4) + (lR << 2); + int2* src = (int2*)(A + offset); + *dst = *src; + continue; + } + } + *dst = {0, 0}; +} +} + + +__global__ void Im2Col_half_3x3S1D1P1_OPT2(const ConvolutionCommon::Im2ColParameter* param, +const MatMulParam* matmulParam, +const size_t maxCount, +const half* A, +half* AP, +DivModFast d_eA, +DivModFast d_ow, +DivModFast d_oh +) { +for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) { +size_t index = indexO >> 3; +size_t lR = indexO & 7; +int eIndex, lIndex; +d_eA.divmod(index, lIndex, eIndex); + +int ix, oby, ob, iy; +d_ow.divmod(eIndex, oby, ix); +d_oh.divmod(oby, 
ob, iy); +size_t sz = lIndex; + +size_t offset = sz * param->srcZStep + (((ob * param->ih + iy) * param->iw + ix) << 4) + (lR << 1); +int src = *((int*)(A + offset)); + +// Pixel (iy-1, ix-1) +if(iy-1 >=0 && ix-1 >=0) { + size_t oeIndex = (ob * param->ih * param->iw + (iy-1) * param->iw + (ix-1)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 8) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy-1 ==0) { + size_t index[3] = {0, 1, 2}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix-1 ==0) { + size_t index[3] = {0, 3, 6}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +// Pixel (iy-1, ix+0) +if(iy-1 >=0) { + size_t oeIndex = (ob * param->ih * param->iw + (iy-1) * param->iw + (ix+0)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 7) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy-1 ==0) { + size_t index[3] = {0, 1, 2}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix ==0) { + size_t index[3] = {0, 3, 6}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix == param->iw-1) { + size_t index[3] = {2, 5, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +// Pixel (iy-1, ix+1) +if(iy-1 >=0 && ix+1 < param->iw) { + size_t oeIndex = (ob * param->ih * param->iw + (iy-1) * param->iw + (ix+1)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 6) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy-1 ==0) { + size_t index[3] = {0, 1, 2}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix+1 == param->iw-1) { + size_t index[3] = {2, 5, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +// Pixel (iy+0, ix-1) +if(ix-1 >=0) { + size_t oeIndex = (ob * param->ih * param->iw + (iy+0) * param->iw + (ix-1)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 5) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy ==0) { + size_t index[3] = {0, 1, 2}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 
index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(iy == param->ih-1) { + size_t index[3] = {6, 7, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix-1 ==0) { + size_t index[3] = {0, 3, 6}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +// Pixel (iy, ix) +if(1) { + size_t oeIndex = (ob * param->ih * param->iw + (iy+0) * param->iw + (ix+0)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 4) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy ==0) { + size_t index[3] = {0, 1, 2}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(iy == param->ih-1) { + size_t index[3] = {6, 7, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix ==0) { + size_t index[3] = {0, 3, 6}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix == param->iw-1) { + size_t index[3] = {2, 5, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +// Pixel (iy, ix+1) +if(ix+1 < param->iw) { + size_t oeIndex = (ob * param->ih * param->iw + (iy+0) * param->iw + (ix+1)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 3) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy ==0) { + size_t index[3] = {0, 1, 2}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(iy == param->ih-1) { + size_t index[3] = {6, 7, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix+1 == param->iw-1) { + size_t index[3] = {2, 5, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +// Pixel (iy+1, ix-1) +if(iy+1 < param->ih && ix-1 >=0) { + size_t oeIndex = (ob * param->ih * param->iw + (iy+1) * param->iw + (ix-1)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 2) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy+1 == param->ih-1) { + size_t index[3] = {6, 7, 8}; + 
for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix-1 ==0) { + size_t index[3] = {0, 3, 6}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +// Pixel (iy+1, ix) +if(iy+1 < param->ih) { + size_t oeIndex = (ob * param->ih * param->iw + (iy+1) * param->iw + (ix+0)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 1) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy+1 == param->ih-1) { + size_t index[3] = {6, 7, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix ==0) { + size_t index[3] = {0, 3, 6}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix == param->iw-1) { + size_t index[3] = {2, 5, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} + +//Pixel (iy+1, ix+1) +if(iy+1 < param->ih && ix+1 < param->iw) { + size_t oeIndex = (ob * param->ih * param->iw + (iy+1) * param->iw + (ix+1)); + size_t eU = oeIndex >> 4; + size_t eR = oeIndex & 15; + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 0) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = src; + + // Corner case + if(iy+1 == param->ih-1) { + size_t index[3] = {6, 7, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } + if(ix+1 == param->iw-1) { + size_t index[3] = {2, 5, 8}; + for(size_t i=0; i<3; i++) { + size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1); + int* dst = (int*)(AP + dstOffset); + *dst = 0; + } + } +} +} +} + + + +void Im2ColMain(CUDARuntime* runtime, const MatMulParam* cpuMatlMul, const MatMulParam* gpuMatMul, const ConvolutionCommon::Im2ColParameter* cpuIm2Col, const ConvolutionCommon::Im2ColParameter* gpuIm2Col,\ + const void* input_addr, __half* mIm2ColBuffer, int bytes) { + + size_t eAlign = cpuMatlMul->elhPack[0] * MATMULPACK; + size_t lAlign = cpuMatlMul->elhPack[1]; + + DivModFast eAlignD(eAlign); + DivModFast owD(cpuIm2Col->ow); + DivModFast ohD(cpuIm2Col->oh); + + if (cpuIm2Col->kernelX == 1 && cpuIm2Col->kernelY == 1 && \ + cpuMatlMul->elh[0] % 16 == 0 && \ + cpuIm2Col->strideX == 1 && cpuIm2Col->strideY == 1 && \ + cpuIm2Col->dilateX == 1 && cpuIm2Col->dilateY == 1 && \ + cpuIm2Col->padX == 0 && cpuIm2Col->padY == 0) { + + size_t maxCount = eAlign * lAlign * 8;//Align 2 + int block_num = runtime->blocks_num(maxCount); + int block_size = runtime->threads_num(); + if(bytes == 4) { + Im2Col1x1_OPT<<>>(gpuIm2Col, gpuMatMul, maxCount, + (const float*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD); + checkKernelErrors; + } else { + 
Im2Col1x1_half_OPT<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount, + (const half*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD); + checkKernelErrors; + } + } else if (cpuIm2Col->kernelX == 1 && cpuIm2Col->kernelY == 1) { + size_t maxCount = eAlign * lAlign * 2;//Align 8 + int block_num = runtime->blocks_num(maxCount); + int block_size = runtime->threads_num(); + if(bytes == 4) { + Im2Col1x1<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, (const float*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD); + checkKernelErrors; + } else { + Im2Col1x1_half<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, (const half*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD); + checkKernelErrors; + } + } else if(cpuIm2Col->kernelX == 3 && cpuIm2Col->kernelY == 3 && \ + cpuIm2Col->strideX == 1 && cpuIm2Col->strideY == 1 && \ + cpuIm2Col->dilateX == 1 && cpuIm2Col->dilateY == 1 && \ + cpuIm2Col->padX == 1 && cpuIm2Col->padY == 1 && \ + bytes == 2) { + + size_t maxCount = eAlign * (lAlign / 9) * 8; + size_t block_num = runtime->blocks_num(maxCount); + size_t block_size = runtime->threads_num(); + + //printf("%d-%d-%d-%d-%d, %d-%d\n", cpuIm2Col->icDiv4, cpuIm2Col->ih, cpuIm2Col->iw, cpuIm2Col->oh, cpuIm2Col->ow, eAlign, lAlign); + Im2Col_half_3x3S1D1P1_OPT2<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount, (const half*)input_addr, mIm2ColBuffer,\ + eAlignD, owD, ohD); + checkKernelErrors; + } else { + size_t maxCount = eAlign * lAlign * 2; + size_t block_num = runtime->blocks_num(maxCount); + size_t block_size = runtime->threads_num(); + if(bytes == 4) { + Im2Col<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, (const float*)input_addr, mIm2ColBuffer); + checkKernelErrors; + } else { + + DivModFast fxyD((cpuIm2Col->kernelX*cpuIm2Col->kernelY)); + DivModFast fxD(cpuIm2Col->kernelX); + maxCount = eAlign * lAlign * 4; + block_num = runtime->blocks_num(maxCount); + block_size = runtime->threads_num(); + + //Im2Col_half<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount, (const half*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD, fxyD, fxD); + + Im2Col_half_OPT<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount, (const half*)input_addr, mIm2ColBuffer, + eAlignD, owD, ohD, fxyD, fxD); + checkKernelErrors; + } + } +} + +} // namespace CUDA +} // namespace MNN \ No newline at end of file diff --git a/source/backend/cuda/execution/ImageColumn.cuh b/source/backend/cuda/execution/ImageColumn.cuh new file mode 100644 index 00000000..ec44a1b6 --- /dev/null +++ b/source/backend/cuda/execution/ImageColumn.cuh @@ -0,0 +1,24 @@ +// +// ImageColumn.cuh +// MNN +// +// Created by MNN on 2021/01/10. 
+// Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifndef IMAGE_COLUMN_CUH
+#define IMAGE_COLUMN_CUH
+
+#include "backend/cuda/core/runtime/CUDARuntime.hpp"
+#include "TensorCoreGemm.cuh"
+#include "backend/cuda/core/CUDABackend.hpp"
+
+namespace MNN {
+namespace CUDA {
+
+void Im2ColMain(CUDARuntime* runtime, const MatMulParam* cpuMatlMul, const MatMulParam* gpuMatMul, const ConvolutionCommon::Im2ColParameter* cpuIm2Col, const ConvolutionCommon::Im2ColParameter* gpuIm2Col, const void* input_addr, __half* mIm2ColBuffer, int bytes);
+
+} // namespace CUDA
+} // namespace MNN
+#endif
+
diff --git a/source/backend/cuda/execution/InterpExecution.cu b/source/backend/cuda/execution/InterpExecution.cu
index 11396309..7202b1f5 100644
--- a/source/backend/cuda/execution/InterpExecution.cu
+++ b/source/backend/cuda/execution/InterpExecution.cu
@@ -1,27 +1,51 @@
 #include "InterpExecution.hpp"
+#include "MNNCUDADefine.hpp"
+#include "MNNCUDAFunction.cuh"
+
 namespace MNN {
 namespace CUDA {
-
 #define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
 
 template <typename T>
-__global__ void INTERP(const int n, const int ih, const int iw, const int oh, const int ow,
+__global__ void INTERP_NERAEST(const int n, const int ih, const int iw, const int oh, const int ow,
     const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out) {
-    CUDA_KERNEL_LOOP(index, n) {
+    CUDA_KERNEL_LOOP(total, n) {
+        int index = total / PACK_NUMBER;
+        int remain = total % PACK_NUMBER;
         int x = index % ow;
         int tmp = index / ow;
         int y = tmp % oh;
         int z = tmp / oh;
         int ix = min(max(0, (int)floor((float)x*scalew+offsetw)), iw-1);
         int iy = min(max(0, (int)floor((float)y*scaleh+offseth)), ih-1);
-        out[z*oh*ow + y*ow + x] = in[z*ih*iw + iy*iw + ix];
+        out[(z*oh*ow + y*ow + x) * PACK_NUMBER + remain]
+            = in[(z*ih*iw + iy*iw + ix) * PACK_NUMBER + remain];
     }
 }
+
+template <typename T>
+__global__ void INTERP_NERAEST_ROUND(const int n, const int ih, const int iw, const int oh, const int ow,
+    const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out) {
+    CUDA_KERNEL_LOOP(total, n) {
+        int index = total / PACK_NUMBER;
+        int remain = total % PACK_NUMBER;
+        int x = index % ow;
+        int tmp = index / ow;
+        int y = tmp % oh;
+        int z = tmp / oh;
+        int ix = min(max(0, (int)floor((float)x*scalew+offsetw + 0.499f)), iw-1);
+        int iy = min(max(0, (int)floor((float)y*scaleh+offseth + 0.499f)), ih-1);
+        out[(z*oh*ow + y*ow + x) * PACK_NUMBER + remain]
+            = in[(z*ih*iw + iy*iw + ix) * PACK_NUMBER + remain];
+    }
+}
 
 template <typename T>
 __global__ void INTERP_BILINEAR(const int n, const int ih, const int iw, const int oh, const int ow,
     const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out) {
-    CUDA_KERNEL_LOOP(index, n) {
+    CUDA_KERNEL_LOOP(total, n) {
+        int index = total / PACK_NUMBER;
+        int remain = total % PACK_NUMBER;
         int x = index % ow;
         int tmp = index / ow;
         int y = tmp % oh;
@@ -37,11 +61,97 @@ __global__ void INTERP_BILINEAR(const int n, const int ih, const int iw, const i
         int index_01 = z*ih*iw + iy_0*iw + ix_1;
         int index_10 = z*ih*iw + iy_1*iw + ix_0;
         int index_11 = z*ih*iw + iy_1*iw + ix_1;
+        index_00 = index_00 * PACK_NUMBER + remain;
+        index_01 = index_01 * PACK_NUMBER + remain;
+        index_10 = index_10 * PACK_NUMBER + remain;
+        index_11 = index_11 * PACK_NUMBER + remain;
         float factor_x = fx-ix_0;
         float factor_y = fy-iy_0;
-        out[z*oh*ow + y*ow + x] = (1.0-factor_x)*(1.0-factor_y)*in[index_00] + factor_x*(1.0-factor_y)*in[index_01] +
-            (1.0-factor_x)*factor_y*in[index_10] + factor_x*factor_y*in[index_11];
+        out[(z*oh*ow + y*ow + x) * PACK_NUMBER + remain] =
+            (1.0-factor_x)*(1.0-factor_y)*(float)in[index_00] + factor_x*(1.0-factor_y)*(float)in[index_01]
+            + (1.0-factor_x)*factor_y*(float)in[index_10] + factor_x*factor_y*(float)in[index_11];
+    }
+}
+
+template <typename T>
+__global__ void INTERP_BILINEAR_OPT(const int n, const int ih, const int iw, const int oh, const int ow,
+    const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out,
+    DivModFast d_ow, DivModFast d_oh) {
+    CUDA_KERNEL_LOOP(total, n) {
+        int index = total >> 4;
+        int remain = total & 15;
+
+        int tmp, x_idx, y, z;
+        d_ow.divmod(index, tmp, x_idx);
+        d_oh.divmod(tmp, z, y);
+
+        size_t x = x_idx << 1;
+        float fx = x*scalew+offsetw;
+        int ix_0 = min(max(0, (int)floor(fx)), iw-1);
+        int ix_1 = min((int)ceil(fx), iw-1);
+
+        float fx_1 = fx + scalew;
+        int ix_2 = min(max(0, (int)floor(fx_1)), iw-1);
+        int ix_3 = min((int)ceil(fx_1), iw-1);
+
+        float fy = y*scaleh+offseth;
+        int iy_0 = min(max(0, (int)floor(fy)), ih-1);
+        int iy_1 = min((int)ceil(fy), ih-1);
+
+        int index_00 = (z*ih+ iy_0)*iw + ix_0;
+        int index_01 = index_00 - ix_0 + ix_1;
+        int index_10 = (z*ih+ iy_1)*iw + ix_0;
+        int index_11 = index_10 - ix_0 + ix_1;
+        index_00 = (index_00 << 4) + remain;
+        index_01 = (index_01 << 4) + remain;
+        index_10 = (index_10 << 4) + remain;
+        index_11 = (index_11 << 4) + remain;
+
+        float factor_x = fx-ix_0;
+        float factor_y = fy-iy_0;
+        float in_00 = (float)in[index_00];
+        float in_01 = (float)in[index_01];
+        float in_10 = (float)in[index_10];
+        float in_11 = (float)in[index_11];
+
+        float factor_00 = (1.0-factor_x)*(1.0-factor_y);
+        float factor_01 = factor_x*(1.0-factor_y);
+        float factor_10 = (1.0-factor_x)*factor_y;
+        float factor_11 = factor_x*factor_y;
+
+        size_t dstOffset = (((z*oh+ y)*ow + x) << 4) + remain;
+        out[dstOffset] = \
+            factor_00* in_00 + factor_01*in_01 + \
+            factor_10* in_10 + factor_11*in_11;
+
+        if(x+1 >= ow) {
+            continue;
+        }
+
+        if(ix_2 != ix_0) {
+            index_00 = index_00 + ((ix_2-ix_0) << 4);
+            index_10 = index_10 + ((ix_2-ix_0) << 4);
+            in_00 = (float)in[index_00];
+            in_10 = (float)in[index_10];
+        }
+        if(ix_3 != ix_1) {
+            index_01 = index_01 + ((ix_3-ix_1) << 4);
+            index_11 = index_11 + ((ix_3-ix_1) << 4);
+            in_01 = (float)in[index_01];
+            in_11 = (float)in[index_11];
+        }
+
+        if(factor_x != fx_1-ix_2) {
+            factor_x = fx_1-ix_2;
+            factor_00 = (1.0-factor_x)*(1.0-factor_y);
+            factor_01 = factor_x*(1.0-factor_y);
+            factor_10 = (1.0-factor_x)*factor_y;
+            factor_11 = factor_x*factor_y;
+        }
+        out[dstOffset+ PACK_NUMBER] = \
+            factor_00* in_00 + factor_01*in_01 + \
+            factor_10* in_10 + factor_11*in_11;
     }
 }
 
@@ -70,7 +180,7 @@ ErrorCode InterpExecution::onResize(const std::vector<Tensor *> &inputs, const s
     mOutputHeight = output->height();
     mOutputWidth = output->width();
 
-    mCount = mBatch*mChannel*mOutputHeight*mOutputWidth;
+    mCount = mBatch*UP_DIV(mChannel, PACK_NUMBER)*mOutputHeight*mOutputWidth * PACK_NUMBER;
     //printf("mBatch:%d-mChannel:%d-mInputHeight:%d- mInputWidth:%d- mOutputHeight:%d- mOutputWidth:%d, mScaleHeight:%f- mScaleWidth:%f %f %f\n", mBatch, mChannel, mInputHeight,mInputWidth,mOutputHeight, mOutputWidth, mScaleHeight, mScaleWidth, mWidthOffset, mHeightOffset);
     return NO_ERROR;
 }
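Editor's illustration, not part of the patch: the bilinear kernels above reduce to the usual clamped two-axis interpolation. A host-side reference of the same coordinate math (function name is this sketch's; the PACK_NUMBER channel packing is omitted):

    // Host-side reference for the source-coordinate math in INTERP_BILINEAR.
    // "scale"/"offset" follow the same convention as the kernel arguments.
    #include <algorithm>
    #include <cmath>

    float bilinearSample(const float* in, int ih, int iw, int x, int y,
                         float scaleh, float scalew, float offseth, float offsetw) {
        float fx = x * scalew + offsetw;
        float fy = y * scaleh + offseth;
        int ix0 = std::min(std::max(0, (int)std::floor(fx)), iw - 1);
        int ix1 = std::min((int)std::ceil(fx), iw - 1);
        int iy0 = std::min(std::max(0, (int)std::floor(fy)), ih - 1);
        int iy1 = std::min((int)std::ceil(fy), ih - 1);
        float tx = fx - ix0, ty = fy - iy0; // interpolation weights
        return (1 - tx) * (1 - ty) * in[iy0 * iw + ix0] + tx * (1 - ty) * in[iy0 * iw + ix1]
             + (1 - tx) * ty * in[iy1 * iw + ix0] + tx * ty * in[iy1 * iw + ix1];
    }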
@@ -82,13 +192,39 @@ ErrorCode InterpExecution::onExecute(const std::vector<Tensor *> &inputs, const
     int threads_num = runtime->threads_num();
     auto input_addr = (void*)inputs[0]->deviceId();
     auto output_addr = (void*)outputs[0]->deviceId();
+    if (static_cast<CUDABackend*>(backend())->useFp16()) {
+        if(mResizeType == 1){
+            INTERP_NERAEST<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
+                mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr);
+        } else if(mResizeType == 2) {
+            //INTERP_BILINEAR<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,\
+                mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr);
+
+            mCount = mBatch*UP_DIV(mChannel, PACK_NUMBER)*mOutputHeight*((mOutputWidth+1)/ 2) * PACK_NUMBER;
+            block_num = runtime->blocks_num(mCount);
+            threads_num = runtime->threads_num();
+
+            DivModFast d_ow((mOutputWidth+1)/2);
+            DivModFast d_oh(mOutputHeight);
+            INTERP_BILINEAR_OPT<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,\
+                mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr, d_ow, d_oh);
+
+        } else if (mResizeType == 4) {
+            INTERP_NERAEST_ROUND<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
+                mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr);
+        }
+        return NO_ERROR;
+    }
     if(mResizeType == 1){
-        INTERP<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
+        INTERP_NERAEST<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
             mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const float *)input_addr, (float *)output_addr);
     } else if(mResizeType == 2) {
         INTERP_BILINEAR<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
             mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const float *)input_addr, (float *)output_addr);
+    } else if (mResizeType == 4) {
+        INTERP_NERAEST_ROUND<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
+            mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const float *)input_addr, (float *)output_addr);
     }
     return NO_ERROR;
 }
@@ -98,7 +234,7 @@ public:
     virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op, Backend* backend) const override {
         auto param = op->main_as_Interp();
-        if(param->resizeType() != 1 && param->resizeType() != 2) {
+        if(param->resizeType() == 3) {
             MNN_PRINT("CUDA interp resize type:%d not support, back to CPU\n", param->resizeType());
             return nullptr;
         }
diff --git a/source/backend/cuda/execution/LayerNormExecution.cu b/source/backend/cuda/execution/LayerNormExecution.cu
index 1d9d2e03..b1da15a7 100644
--- a/source/backend/cuda/execution/LayerNormExecution.cu
+++ b/source/backend/cuda/execution/LayerNormExecution.cu
@@ -38,7 +38,7 @@ T blockReduceSum(T val)
 template <typename T>
 __global__
-void input_layernorm(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon, int sumPerKnl)
+void input_layernorm(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon, int sumPerKnl)
 {
     int tid = threadIdx.x;
@@ -60,7 +60,7 @@ void input_layernorm(T* out, const T* input, const T* gamma, const T* beta, int
     float var_tmp = 0.0f;
     for(int idx=0; idx(var_tmp);
     if(threadIdx.x == 0)
@@ -69,14 +69,14 @@ void input_layernorm(T* out, const T* input, const T* gamma, const T* beta, int
     for(int idx=0; idx
 __global__
-void input_layernorm_2048(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
+void input_layernorm_2048(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon)
 {
     int tid = threadIdx.x;
@@ -128,7 +128,7 @@ void input_layernorm_2048(T* out, const T* input, const T* gamma, const T* beta,
 template <typename T>
 __global__
-void input_layernorm_1024(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
+void input_layernorm_1024(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon)
 {
     int tid = threadIdx.x;
@@ -176,7 +176,7 @@ void input_layernorm_1024(T* out, const T* input, const T* gamma, const T* beta,
 template <typename T>
 __global__
-void input_layernorm_512(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
+void input_layernorm_512(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon)
 {
     int tid = threadIdx.x;
@@ -217,25 +217,25 @@ void input_layernorm_512(T* out, const T* input, const T* gamma, const T* beta,
 template <typename T>
 __global__ void LAYERNORM(const int count, const int outside, const int inside, const float epsilon,
-    const T* in, T* out, const T* gamma_data, const T* beta_data) {
+    const T* in, T* out, const float* gamma_data, const float* beta_data) {
     CUDA_KERNEL_LOOP(i, count) {
         const int o = i / inside;
         const int index = i % inside;
         const T* inner_input = in + o * inside;
         T* inner_output = out + o * inside;
-        T sum = 0.f;
+        float sum = 0.f;
         for (int j = 0; j < inside; ++j) {
-            sum += inner_input[j];
+            sum += (float)inner_input[j];
         }
-        T mean = sum / inside;
-        T square_sum = 0.f;
+        float mean = sum / inside;
+        float square_sum = 0.f;
         for (int j = 0; j < inside; ++j) {
-            square_sum += (inner_input[j] - mean) * (inner_input[j] - mean);
+            square_sum += ((float)inner_input[j] - mean) * ((float)inner_input[j] - mean);
         }
-        T variable = square_sum / inside;
+        float variable = square_sum / inside;
         variable = 1.f / sqrt(variable + epsilon);
-        inner_output[index] = (inner_input[index] - mean) * variable * gamma_data[index] + beta_data[index];
+        inner_output[index] = ((float)inner_input[index] - mean) * variable * gamma_data[index] + beta_data[index];
     }
 }
@@ -249,7 +249,7 @@ LayerNormExecution::LayerNormExecution(const LayerNorm* layer_norm_param, Backen
     mEps = layer_norm_param->epsilon();
     int size = layer_norm_param->gamma()->size();
-    mGammaTensor.reset(Tensor::createDevice({size}));
+    mGammaTensor.reset(Tensor::createDevice({size}));
     auto status = backend->onAcquireBuffer(mGammaTensor.get(), Backend::STATIC);
     if (!status) {
         MNN_ERROR("Out of memory when gamma is acquired in CudaLayerNorm.\n");
@@ -262,7 +262,7 @@ LayerNormExecution::LayerNormExecution(const LayerNorm* layer_norm_param, Backen
     if (layer_norm_param->beta()->size() != size) {
         MNN_ERROR("Size of gamma and beta are not match in CudaLayerNorm.\n");
     }
-    mBetaTensor.reset(Tensor::createDevice({size}));
+    mBetaTensor.reset(Tensor::createDevice({size}));
     status = backend->onAcquireBuffer(mBetaTensor.get(), Backend::STATIC);
     if (!status) {
         MNN_ERROR("Out of memory when beta is acquired in CudaLayerNorm.\n");
@@ -274,12 +274,7 @@
 }
 LayerNormExecution::~LayerNormExecution() {
-    if (nullptr != mGammaTensor) {
-        backend()->onReleaseBuffer(mGammaTensor.get(), Backend::STATIC);
-    }
-    if (nullptr != mBetaTensor) {
-        backend()->onReleaseBuffer(mBetaTensor.get(), Backend::STATIC);
-    }
+    // Do nothing
 }
 ErrorCode LayerNormExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
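Editor's illustration, not part of the patch: the hunks above move gamma/beta and every accumulator to float even when the tensor type T is half. A host-side sketch of the per-row normalization LAYERNORM performs; with fp16 accumulators a row sum over a few thousand elements can exceed half's maximum (65504), which is why the patch accumulates in float:

    // Host reference for one LayerNorm row; accumulation stays in float.
    #include <cmath>

    template <typename T>
    void layerNormRow(const T* in, T* out, const float* gamma, const float* beta,
                      int inside, float eps) {
        float sum = 0.f;
        for (int j = 0; j < inside; ++j) sum += (float)in[j];
        float mean = sum / inside;
        float var = 0.f;
        for (int j = 0; j < inside; ++j) {
            float d = (float)in[j] - mean;
            var += d * d;
        }
        float inv = 1.f / std::sqrt(var / inside + eps);
        for (int j = 0; j < inside; ++j)
            out[j] = (T)(((float)in[j] - mean) * inv * gamma[j] + beta[j]);
    }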
@@ -314,6 +309,28 @@ ErrorCode LayerNormExecution::onExecute(const std::vector<Tensor *> &inputs, con
     int threads_num = runtime->threads_num();
     auto input_addr = (void*)inputs[0]->deviceId();
     auto output_addr = (void*)outputs[0]->deviceId();
+    if (static_cast<CUDABackend*>(backend())->useFp16()) {
+        if(mInside < 128) {
+            LAYERNORM<<<block_num, threads_num>>>(mOutside*mInside, mOutside, mInside, mEps, (const half *)input_addr, (half *)output_addr,
+                (const float *)mDeviceGamma, (const float *)mDeviceBeta);
+        } else {
+            if(mInside == 2048) {
+                input_layernorm_2048<<<block_num, threads_num>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
+                    (const float *)mDeviceBeta, mOutside, mInside, mEps);
+            } else if(mInside == 1024) {
+                input_layernorm_1024<<<block_num, threads_num>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
+                    (const float *)mDeviceBeta, mOutside, mInside, mEps);
+            } else if(mInside == 512) {
+                input_layernorm_512<<<block_num, threads_num>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
+                    (const float *)mDeviceBeta, mOutside, mInside, mEps);
+            } else {
+                int sumPerKnl = (mInside+255) / 256;
+                input_layernorm<<<block_num, threads_num>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
+                    (const float *)mDeviceBeta, mOutside, mInside, mEps, sumPerKnl);
+            }
+        }
+        return NO_ERROR;
+    }
     if(mInside < 128) {
         LAYERNORM<<<block_num, threads_num>>>(mOutside*mInside, mOutside, mInside, mEps, (const float *)input_addr, (float *)output_addr,
diff --git a/source/backend/cuda/execution/CUDALoop.cpp b/source/backend/cuda/execution/LoopExecution.cpp
similarity index 88%
rename from source/backend/cuda/execution/CUDALoop.cpp
rename to source/backend/cuda/execution/LoopExecution.cpp
index cedd936e..bb66be80 100644
--- a/source/backend/cuda/execution/CUDALoop.cpp
+++ b/source/backend/cuda/execution/LoopExecution.cpp
@@ -6,7 +6,6 @@
 //  Copyright © 2018, Alibaba Group Holding Limited
 //
 #include
-#include "BatchMatMulExecution.hpp"
 #include "MatMulExecution.hpp"
 #include "backend/cuda/core/CUDABackend.hpp"
 #include "Raster.cuh"
@@ -34,18 +33,21 @@ public:
         auto cmd = mLoop->commands()->GetAs(0);
         auto op = cmd->op();
         if (OpType_MatMul == op->type() && mLoop->parallel() && mLoop->loopNumber() > 1) {
-            auto& unit = mExecutions[0];
-            unit.exe.reset(new BatchMatMulExecution(op->main_as_MatMul()->transposeA(), op->main_as_MatMul()->transposeB(), backend()));
-            if (nullptr == unit.exe) {
-                return OUT_OF_MEMORY;
-            }
-            unit.inputs = inputs;
-            unit.outputs = outputs;
-            auto code = unit.exe->onResize(unit.inputs, unit.outputs);
-            if (NO_ERROR != code) {
-                return code;
+            if (inputs.size() <= 3) {
+                auto& unit = mExecutions[0];
+                unit.exe.reset(new MatMulExecution(op->main_as_MatMul()->transposeA(), op->main_as_MatMul()->transposeB(), backend()));
+                if (nullptr == unit.exe) {
+                    return OUT_OF_MEMORY;
+                }
+                unit.inputs = inputs;
+                unit.outputs = outputs;
+                auto code = unit.exe->onResize(unit.inputs, unit.outputs);
+                if (NO_ERROR != code) {
+                    return code;
+                }
+                mSingleMatMul = true;
+                return NO_ERROR;
             }
-            return NO_ERROR;
         }
     }
@@ -134,21 +136,22 @@ public:
     virtual ErrorCode onExecute(const std::vector<Tensor *> &originInputs, const std::vector<Tensor *> &originOutputs) override {
         auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
+        if (mSingleMatMul) {
+            auto& unit = mExecutions[0];
+            unit.inputs = originInputs;
+            unit.outputs = originOutputs;
+
+            auto code = unit.exe->onExecute(unit.inputs, unit.outputs);
+            if (NO_ERROR != code) {
+                return code;
+            }
+            return NO_ERROR;
+        }
         if (1 == mLoop->commands()->size()) {
             auto cmd = mLoop->commands()->GetAs(0);
             auto op = cmd->op();
-            if (OpType_MatMul == op->type() && mLoop->parallel() && mLoop->loopNumber() > 1) {
-                auto& unit = mExecutions[0];
-                unit.inputs = originInputs;
-                unit.outputs = originOutputs;
-                auto code = unit.exe->onExecute(unit.inputs, unit.outputs);
-                if (NO_ERROR != code) {
-                    return code;
-                }
-                return NO_ERROR;
-            }
             if (OpType_UnaryOp == op->type() && nullptr == op->main()) {
                 Tensor::InsideDescribe::Region reg;
@@ -160,7 +163,7 @@
                 auto input = mStack[cmd->indexes()->data()[1]];
                 auto inputSize = input->elementSize();
                 auto output = mStack[cmd->indexes()->data()[0]];
-                auto bytes = input->getType().bytes();
+                auto bytes = static_cast<CUDABackend*>(backend())->getBytes(input);
                 auto step0 = cmd->steps()->data()[0];
                 auto step1 = cmd->steps()->data()[1];
                 auto loopNumber = mLoop->loopNumber();
@@ -189,7 +192,7 @@
         for (auto& iter : mIndiceCopy) {
             backend()->onCopyBuffer(iter.first, iter.second);
         }
-        auto bytes = sizeof(float);//TODO: Support Half
+        auto bytes = static_cast<CUDABackend*>(backend())->getBytes(originOutputs[0]);
         for (int iter=0; iter < mLoop->loopNumber(); ++iter) {
             for (int index=0; index < mLoop->commands()->size(); ++index) {
                 auto cmd = mLoop->commands()->GetAs(index);
@@ -205,7 +208,7 @@
                     }
                     auto view = cmd->view()->GetAs(v);
                     offset = offset * cmd->steps()->data()[v] + view->offset();
-                    mStackPtr[tensorIndex] = tensor->deviceId() + offset * bytes;
+                    mStackPtr[tensorIndex] = tensor->deviceId() + offset * static_cast<CUDABackend*>(backend())->getBytes(tensor);
                 }
                 if (OpType_UnaryOp == op->type()) {
                     auto src = (float*)mStackPtr[cmd->indexes()->data()[1]];
@@ -233,6 +236,10 @@
                     continue;
                 }
                 if (OpType_BinaryOp == op->type()) {
+                    auto type = halide_type_of<float>();
+                    if (static_cast<CUDABackend*>(backend())->useFp16()) {
+                        type.bits = 16;
+                    }
                     auto src0 = mStackPtr[cmd->indexes()->data()[1]];
                     auto src1 = mStackPtr[cmd->indexes()->data()[2]];
                     auto dst = mStackPtr[cmd->indexes()->data()[0]];
@@ -242,7 +249,7 @@
                     auto dstStride = cmd->view()->GetAs(0)->stride()->data();
                     BinaryBlit((uint8_t*)dst, (const uint8_t*)src0, (const uint8_t*)src1,
-                        cmd->size()->data(), srcStride0, srcStride1, dstStride, halide_type_of<float>(), runtime, opType);
+                        cmd->size()->data(), srcStride0, srcStride1, dstStride, type, runtime, opType);
                 }
             }
@@ -256,6 +263,7 @@ private:
     std::vector mExecutions;
     std::vector mStackPtr;
    std::map mIndiceCopy;
+    bool mSingleMatMul = false;
 };
 class LoopCreator : public CUDABackend::Creator {
diff --git a/source/backend/cuda/execution/MNNCUDADefine.hpp b/source/backend/cuda/execution/MNNCUDADefine.hpp
new file mode 100644
index 00000000..71992c39
--- /dev/null
+++ b/source/backend/cuda/execution/MNNCUDADefine.hpp
@@ -0,0 +1,18 @@
+#ifndef MNNCUDADEFINE_HPP
+#define MNNCUDADEFINE_HPP
+
+#define PACK_NUMBER 16
+
+#define MNN_CUDA_HALF2_MAX(a, b)                      \
+    do {                                              \
+        (a).x = __hgt((a).x, (b).x) ? (a).x : (b).x;  \
+        (a).y = __hgt((a).y, (b).y) ? (a).y : (b).y;  \
+    } while (0)
+
+#define MNN_CUDA_HALF2_MIN(a, b)                      \
+    do {                                              \
+        (a).x = __hlt((a).x, (b).x) ? (a).x : (b).x;  \
+        (a).y = __hlt((a).y, (b).y) ? (a).y : (b).y;  \
+    } while (0)
+
+#endif
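Editor's illustration, not part of the patch: the half2 macros above do a per-lane max/min with the __hgt/__hlt predicates rather than a branch. A minimal kernel showing the intended usage (kernel name and launch shape are this sketch's; assumes MNNCUDADefine.hpp is included):

    // Elementwise max over half2 pairs using MNN_CUDA_HALF2_MAX.
    #include <cuda_fp16.h>

    __global__ void elementwiseMaxHalf2(const half2* a, const half2* b, half2* out, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            half2 v = a[i];
            half2 w = b[i];
            MNN_CUDA_HALF2_MAX(v, w); // v.x/v.y updated in place to the per-lane max
            out[i] = v;
        }
    }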
diff --git a/source/backend/cuda/execution/MNNCUDAFunction.cuh b/source/backend/cuda/execution/MNNCUDAFunction.cuh
new file mode 100644
index 00000000..9585d60c
--- /dev/null
+++ b/source/backend/cuda/execution/MNNCUDAFunction.cuh
@@ -0,0 +1,38 @@
+#ifndef MNNCUDAFunction_cuh
+#define MNNCUDAFunction_cuh
+
+struct DivModFast {
+    DivModFast(int d = 1)
+    {
+        d_ = (d == 0) ? 1 : d;
+        for (l_ = 0;; ++l_) {
+            if ((1U << l_) >= d_)
+                break;
+        }
+        uint64_t one = 1;
+        uint64_t m = ((one << 32) * ((one << l_) - d_)) / d_ + 1;
+        m_ = static_cast<uint32_t>(m);
+    }
+
+    __device__ __inline__ int div(int idx) const
+    {
+        uint32_t tm = __umulhi(m_, idx); // get the high 32 bits of the product
+        return (tm + idx) >> l_;
+    }
+
+    __device__ __inline__ int mod(int idx) const
+    {
+        return idx - d_ * div(idx);
+    }
+
+    __device__ __inline__ void divmod(int idx, int &quo, int &rem)
+    {
+        quo = div(idx);
+        rem = idx - quo * d_;
+    }
+
+    uint32_t d_; // divisor
+    uint32_t l_; // ceil(log2(d_))
+    uint32_t m_; // m' in the paper
+};
+#endif
\ No newline at end of file
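Editor's illustration, not part of the patch: DivModFast precomputes a "magic" multiplier so kernels can replace integer / and % (slow on GPUs) with a multiply-high and a shift. A quick device-side self-check, verifying the fast path against the hardware operators (harness names are this sketch's):

    // Each thread checks DivModFast against / and % for its own index.
    __global__ void checkDivMod(DivModFast d, int divisor, int n, int* errors) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            int q, r;
            d.divmod(i, q, r);
            if (q != i / divisor || r != i % divisor) atomicAdd(errors, 1);
        }
    }
    // Usage: checkDivMod<<<256, 256>>>(DivModFast(7), 7, 65536, dErrors);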
diff --git a/source/backend/cuda/execution/MatMulExecution.cu b/source/backend/cuda/execution/MatMulExecution.cu
index f285af79..1bca5a98 100644
--- a/source/backend/cuda/execution/MatMulExecution.cu
+++ b/source/backend/cuda/execution/MatMulExecution.cu
@@ -15,12 +15,18 @@ MatMulExecution::~ MatMulExecution() {
 }
 ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
-    auto w0 = inputs[0]->length(1);
-    auto h0 = inputs[0]->length(0);
+    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
     auto C = outputs[0];
+    auto dimensions = C->dimensions();
+    int batch = 1;
+    for (int i = 0; i < dimensions - 2; ++i) {
+        batch *= C->length(i);
+    }
+    auto e = C->length(dimensions-2);
+    auto h = C->length(dimensions-1);
+    auto w0 = inputs[0]->length(dimensions-1);
+    auto h0 = inputs[0]->length(dimensions-2);
 
-    auto e = C->length(0);
-    auto h = C->length(1);
     auto l = w0;
     if (mTransposeA) {
         l = h0;
@@ -29,6 +35,7 @@
     param.elh[0] = e;
     param.elh[1] = l;
     param.elh[2] = h;
+    param.batch = batch;
     auto eU = UP_DIV(e, PACK_MATMUL);
     auto lU = UP_DIV(l, PACK_MATMUL);
     auto hU = UP_DIV(h, PACK_MATMUL);
@@ -58,15 +65,17 @@
     param.cStride[0] = h;
     param.cStride[1] = 0;
     param.cStride[2] = 1;
-    param.split[0] = 1;
-    param.split[1] = 1;
-    param.split[2] = 1;
-    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
+    param.aPStride[0] = 256 * lU;
+    param.aPStride[1] = 16;
+    param.aPStride[2] = 16 * lU;
+    param.bPStride[0] = 256 * lU;
+    param.bPStride[1] = 16;
+    param.bPStride[2] = 16 * lU;
     runtime->memcpy((uint8_t*)mParameters.first + mParameters.second, &param, sizeof(MatMulParam), MNNMemcpyHostToDevice);
     // Alloc for temp buffer
-    auto aPackSize = eU * lU * PACK_MATMUL * PACK_MATMUL;
-    auto bPackSize = lU * hU * PACK_MATMUL * PACK_MATMUL;
+    auto aPackSize = eU * lU * PACK_MATMUL * PACK_MATMUL * batch;
+    auto bPackSize = lU * hU * PACK_MATMUL * PACK_MATMUL * batch;
     auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
     mTempA = pool->alloc(aPackSize * sizeof(__half), false, 256);
@@ -85,6 +94,11 @@
     auto APtr = (const float*)A->deviceId();
     auto BPtr = (const float*)B->deviceId();
     auto CDestPtr = (float*)C->deviceId();
+    int e = mParam.elh[0];
+    int l = mParam.elh[1];
+    int h = mParam.elh[2];
+    int batch = mParam.batch;
+    auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);
     auto aP = (__half*)((uint8_t*)mTempA.first + mTempA.second);
     auto bP = (__half*)((uint8_t*)mTempB.first + mTempB.second);
@@ -93,53 +107,8 @@
         biasPtr = (const float*)inputs[2]->deviceId();
     }
     auto param = (MatMulParam*)((uint8_t*)mParameters.first + mParameters.second);
-    GemmPrepareRerange(runtime, &mParam, param, APtr, aP, BPtr, bP);
-    GemmPackedMain(runtime, &mParam, param, CDestPtr, aP, bP, biasPtr);
-    return NO_ERROR;
-
-    auto blasHandle = runtime->cublas_handle();
-    auto w0 = inputs[0]->length(1);
-    auto h0 = inputs[0]->length(0);
-
-    auto e = C->length(0);
-    auto h = C->length(1);
-    auto l = w0;
-    if (mTransposeA) {
-        l = h0;
-    }
-
-    float alpha = 1.0f;
-    float beta = 0.0f;
-
-    auto tranB = CUBLAS_OP_N;
-    auto ldB = h;
-    if (mTransposeB) {
-        ldB = l;
-        tranB = CUBLAS_OP_T;
-    }
-    auto tranA = CUBLAS_OP_N;
-    auto ldA = l;
-    if (mTransposeA) {
-        ldA = e;
-        tranA = CUBLAS_OP_T;
-    }
-    int block_num = runtime->blocks_num(e*h);
-    int threads_num = runtime->threads_num();
-
-    //[e, l] x [l, h] -> [e, h]
-    auto status = cublasSgemm(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, APtr, ldA, &beta, CDestPtr, h);
-    cublas_check(status);
-    //cudaThreadSynchronize();
-    // } else {
-    //     auto CPtr = (float*)mTempOutput->deviceId();
-    //     auto status = cublasSgemm(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, APtr, ldA, &beta, CPtr, h);
-    //     cublas_check(status);
-    //     //cudaThreadSynchronize();
-
-    //     //bias: [e, h] + [h] -> [e, h]
-    //     add_bias<<<block_num, threads_num>>>((float*)CPtr, (float*)CDestPtr, (const float*)inputs[2]->deviceId(), e, h);
-    // }
-
+    GemmPrepareRerange(runtime, &mParam, param, APtr, aP, BPtr, bP, bytes);
+    GemmPackedMain(runtime, &mParam, param, CDestPtr, aP, bP, biasPtr, bytes, false, false);
     return NO_ERROR;
 }
diff --git a/source/backend/cuda/execution/MatMulExecution.hpp b/source/backend/cuda/execution/MatMulExecution.hpp
index d1aa95b6..4c24b75f 100644
--- a/source/backend/cuda/execution/MatMulExecution.hpp
+++ b/source/backend/cuda/execution/MatMulExecution.hpp
@@ -28,6 +28,7 @@ private:
     std::pair mTempB;
     std::pair mParameters; // In GPU
     MatMulParam mParam; // In CPU
+    bool mUseBlas = false;
 };
 } // namespace CUDA
 } // namespace MNN
diff --git a/source/backend/cuda/execution/PReLUExecution.cu b/source/backend/cuda/execution/PReLUExecution.cu
index c0a80b49..8f3efb22 100644
--- a/source/backend/cuda/execution/PReLUExecution.cu
+++ b/source/backend/cuda/execution/PReLUExecution.cu
@@ -1,62 +1,71 @@
 #include "PReLUExecution.hpp"
+#include "MNNCUDADefine.hpp"
 namespace MNN {
 namespace CUDA {
-
 #define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
 
 template <typename T>
 __global__ void PRELU(const int n, const int channels, const int dim, const T* in, T* out,
-    const T* slopeData, int div_factor) {
-    CUDA_KERNEL_LOOP(index, n) {
+    const float* slopeData, int div_factor) {
+    CUDA_KERNEL_LOOP(t, n) {
+        int index = t / PACK_NUMBER;
+        int r = t % PACK_NUMBER;
         int c = (index / dim) % channels / div_factor;
-        out[index] = in[index] > 0 ? in[index] : in[index]*slopeData[c];
+        float iv = (float)in[t];
+        float ov = iv > 0.0 ? iv : iv * slopeData[c * PACK_NUMBER + r];
+        out[t] = (T)ov;
     }
 }
 PReLUExecution::PReLUExecution(const PRelu* prelu, Backend *backend) : Execution(backend) {
     int slopCount = prelu->slope()->size();
     auto alphaData = prelu->slope()->data();
-    preluTensor.reset(Tensor::createDevice({slopCount}));
-    backend->onAcquireBuffer(preluTensor.get(), Backend::STATIC);
-    mDeviceSlope = (void *)preluTensor.get()->buffer().device;
+    auto staticPool = static_cast<CUDABackend*>(backend)->getStaticBufferPool();
+    auto slopeSize = UP_DIV(slopCount, PACK_NUMBER) * PACK_NUMBER * sizeof(float);
+    mPreluStorage = staticPool->alloc(slopeSize);
+    mDeviceSlope = (uint8_t*)mPreluStorage.first + mPreluStorage.second;
     MNN_ASSERT(nullptr != mDeviceSlope);
+    cudaMemset(mDeviceSlope, 0, slopeSize);
     cudaMemcpy(mDeviceSlope, alphaData, slopCount * sizeof(float), cudaMemcpyHostToDevice);
     mIsChannelShared = slopCount == 1;
-
 }
 PReLUExecution::~PReLUExecution() {
-    if (nullptr != preluTensor) {
-        backend()->onReleaseBuffer(preluTensor.get(), Backend::STATIC);
-    }
+    auto staticPool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
+    staticPool->free(mPreluStorage);
 }
 ErrorCode PReLUExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
     MNN_ASSERT(inputs.size() == 1);
     MNN_ASSERT(outputs.size() == 1);
     auto input = inputs[0];
-    mBatch = input->length(0);
-    mChannel = input->length(1);
     MNN_ASSERT(input->dimensions() >= 2);
-    mArea = 1;
+    mArea = input->length(0);
     for (int i = 2; i < input->dimensions(); ++i) {
         mArea *= input->length(i);
     }
-    mCount = mBatch*mChannel*mArea;
+    mChannel = UP_DIV(input->length(1), PACK_NUMBER);
+    mCount = mChannel*mArea * PACK_NUMBER;
     //printf("mBatch:%d- mChannel:%d- mArea:%d- mCount:%d\n", mBatch,mChannel,mArea, mCount);
     return NO_ERROR;
 }
 ErrorCode PReLUExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
     auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
+    auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);
     int block_num = runtime->blocks_num(mCount);
     int threads_num = runtime->threads_num();
     auto input_addr = (void*)inputs[0]->deviceId();
     auto output_addr = (void*)outputs[0]->deviceId();
     int div_factor = mIsChannelShared ? mChannel : 1;
-    PRELU<<<block_num, threads_num>>>(mCount, mChannel, mArea, (const float *)input_addr, (float *)output_addr,
-        (const float *)mDeviceSlope, div_factor);
+    if (2 == bytes) {
+        PRELU<<<block_num, threads_num>>>(mCount, mChannel, mArea, (const half *)input_addr, (half *)output_addr,
+            (const float *)mDeviceSlope, div_factor);
+    } else {
+        PRELU<<<block_num, threads_num>>>(mCount, mChannel, mArea, (const float *)input_addr, (float *)output_addr,
+            (const float *)mDeviceSlope, div_factor);
+    }
     return NO_ERROR;
 }
diff --git a/source/backend/cuda/execution/PReLUExecution.hpp b/source/backend/cuda/execution/PReLUExecution.hpp
index 785db589..8f121187 100644
--- a/source/backend/cuda/execution/PReLUExecution.hpp
+++ b/source/backend/cuda/execution/PReLUExecution.hpp
@@ -29,11 +29,9 @@ private:
     CUDARuntime *mRuntime;
     void *mDeviceSlope = nullptr;
     int mCount;
-    int mBatch;
     int mChannel;
     int mArea;
-
-    std::shared_ptr<Tensor> preluTensor;
+    std::pair mPreluStorage;
     bool mIsChannelShared = false;
 };
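Editor's illustration, not part of the patch: the fp16 executions in this sync store tensors channel-packed, with groups of PACK_NUMBER (16) channels interleaved as the innermost dimension, which is why the kernels above split a flat thread index into index/PACK_NUMBER and a lane remainder. A sketch of the offset arithmetic (helper name is this sketch's):

    // NC16HW16-style offset: channel c splits into a block c/16 and a lane c%16;
    // the lane is the fastest-moving dimension.
    inline int packedOffset(int b, int c, int y, int x, int channel, int h, int w) {
        const int pack = 16; // PACK_NUMBER
        int cBlock = c / pack, cLane = c % pack;
        int blocks = (channel + pack - 1) / pack; // UP_DIV(channel, pack)
        return (((b * blocks + cBlock) * h + y) * w + x) * pack + cLane;
    }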
diff --git a/source/backend/cuda/execution/PoolExecution.cu b/source/backend/cuda/execution/PoolExecution.cu
index 2ea3bd1c..483c399c 100755
--- a/source/backend/cuda/execution/PoolExecution.cu
+++ b/source/backend/cuda/execution/PoolExecution.cu
@@ -1,90 +1,209 @@
+#include
 #include "PoolExecution.hpp"
+#include
+#include "MNNCUDADefine.hpp"
 namespace MNN {
 namespace CUDA {
-template <typename T>
-__global__ void avgpool(const T* uInput, T* uOutput,
-    int bc,
-    int ih, int iw,
-    int oh, int ow,
-    int padX, int padY,
-    int kernelX, int kernelY,
-    int strideX, int strideY
-    ) {
-    int total = bc * oh * ow;
+#define HALF_MIN half(-65504)
+#define HALF2_MIN half2(-65504, -65504)
+#define MNN_CUDA_HALF2_MAX(a, b)                      \
+    do {                                              \
+        (a).x = __hgt((a).x, (b).x) ? (a).x : (b).x;  \
+        (a).y = __hgt((a).y, (b).y) ? (a).y : (b).y;  \
+    } while (0)
+
+__global__ void maxpool_halfC16(const half* uInput, half* uOutput,
+    int bc,
+    int ih, int iw,
+    int oh, int ow,
+    int padX, int padY,
+    int kernelX, int kernelY,
+    int strideX, int strideY
+    ) {
+    int total = bc * oh * ow * 8;
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
         int x = i % ow;
         int tmp = i / ow;
         int y = tmp % oh;
         int z = tmp / oh;
+        int zC = z / 8;
+        int zR = z % 8;
         int ix = x * strideX - padX;
         int iy = y * strideY - padY;
         int sx = max(0, -ix);
         int sy = max(0, -iy);
         int ex = min(kernelX, iw - ix);
         int ey = min(kernelY, ih - iy);
-        T sumValue = (T)0;
+        float div = (float)(ey-sy)* (float)(ex-sx);
+        half2 sumValue = HALF2_MIN;
        for (int fy=sy; fy
-__global__ void maxpool(const T* uInput, T* uOutput,
-    int bc,
-    int ih, int iw,
-    int oh, int ow,
-    int padX, int padY,
-    int kernelX, int kernelY,
-    int strideX, int strideY
-    ) {
-    int total = bc * oh * ow;
+
+__global__ void avgpool_halfC16(const half* uInput, half* uOutput,
+    int bc,
+    int ih, int iw,
+    int oh, int ow,
+    int padX, int padY,
+    int kernelX, int kernelY,
+    int strideX, int strideY
+    ) {
+    int total = bc * oh * ow * 8;
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
         int x = i % ow;
         int tmp = i / ow;
         int y = tmp % oh;
         int z = tmp / oh;
+        int zC = z / 8;
+        int zR = z % 8;
         int ix = x * strideX - padX;
         int iy = y * strideY - padY;
         int sx = max(0, -ix);
         int sy = max(0, -iy);
         int ex = min(kernelX, iw - ix);
         int ey = min(kernelY, ih - iy);
-        T maxValue = (T)(-1000000);
+        float div = (float)(ey-sy)* (float)(ex-sx);
+        half2 sumValue = half2(0.0f, 0.0f);
+        half2 mulValue = half2(1.0f / div, 1.0f/div);
        for (int fy=sy; fy
 &inputs, const std::vector<Tensor *> &outputs) {
     auto layer = mParameter;
     int strideWidth = layer->strideX();
@@ -128,34 +247,62 @@ ErrorCode PoolExecution::onResize(const std::vector<Tensor *> &inputs, const std
 ErrorCode PoolExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
     auto iw = inputs[0]->width();
     auto ih = inputs[0]->height();
-    auto bc = inputs[0]->batch() * inputs[0]->channel();
+    auto bc = inputs[0]->batch() * UP_DIV(inputs[0]->channel(), PACK_NUMBER);
     auto ow = outputs[0]->width();
     auto oh = outputs[0]->height();
     auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
-    int block_num = runtime->blocks_num(bc * ow * oh);
-    int threads_num = runtime->threads_num();
+    auto& prop = runtime->prop();
+    int threads_num = prop.maxThreadsPerBlock;
+    int block_num = prop.multiProcessorCount;
+    if (static_cast<CUDABackend*>(backend())->useFp16()) {
+        auto inputPtr = (const half*)inputs[0]->deviceId();
+        auto outputPtr = (half*)outputs[0]->deviceId();
+        switch (mPoolType) {
+            case PoolType_AVEPOOL:
+                avgpool_halfC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
+                    bc,
+                    ih, iw,
+                    oh, ow,
+                    mPaddings[0], mPaddings[1],
+                    mKernels[0], mKernels[1],
+                    mStrides[0], mStrides[1]
+                    );
+                return NO_ERROR;
+            case PoolType_MAXPOOL:
+                maxpool_halfC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
+                    bc,
+                    ih, iw,
+                    oh, ow,
+                    mPaddings[0], mPaddings[1],
+                    mKernels[0], mKernels[1],
+                    mStrides[0], mStrides[1]
+                    );
+                return NO_ERROR;
+        }
+        return NO_ERROR;
+    }
     auto inputPtr = (const float*)inputs[0]->deviceId();
     auto outputPtr = (float*)outputs[0]->deviceId();
     switch (mPoolType) {
         case PoolType_AVEPOOL:
-            avgpool<<<block_num, threads_num>>>(inputPtr, outputPtr,
+            avgpool_floatC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
                 bc,
                 ih, iw,
                 oh, ow,
                 mPaddings[0], mPaddings[1],
                 mKernels[0], mKernels[1],
                 mStrides[0], mStrides[1]
-            );
+                );
             return NO_ERROR;
         case PoolType_MAXPOOL:
-            maxpool<<<block_num, threads_num>>>(inputPtr, outputPtr,
+            maxpool_floatC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
                 bc,
                 ih, iw,
                 oh, ow,
                 mPaddings[0], mPaddings[1],
                 mKernels[0], mKernels[1],
                 mStrides[0], mStrides[1]
-            );
+                );
             return NO_ERROR;
     }
     return NOT_SUPPORT;
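Editor's illustration, not part of the patch: the pooling launch above switches from a size-derived block count to a fixed grid (one block per SM, maxThreadsPerBlock threads) and relies on the grid-stride loop inside the kernels to cover the whole problem. The generic pattern, as a sketch:

    // A fixed-size grid walks an arbitrary-sized problem via a grid-stride loop,
    // so the launch shape can come from device properties, not the tensor size.
    __global__ void gridStrideScale(float* data, float s, int total) {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < total;
             i += blockDim.x * gridDim.x) {
            data[i] *= s;
        }
    }
    // e.g. gridStrideScale<<<prop.multiProcessorCount, prop.maxThreadsPerBlock>>>(d, 2.f, n);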
diff --git a/source/backend/cuda/execution/Raster.cu b/source/backend/cuda/execution/Raster.cu
index ec0b4d07..2fcd479c 100644
--- a/source/backend/cuda/execution/Raster.cu
+++ b/source/backend/cuda/execution/Raster.cu
@@ -1,89 +1,22 @@
 #include "Raster.cuh"
 #include "TensorflowOp_generated.h"
+#include
+#include "MNNCUDAFunction.cuh"
+
 namespace MNN {
 namespace CUDA {
-template <typename T>
-__global__ void pack_c4(const T *input, T *output, int inside, int axis, int outside, int axisC4) {
-    int total = inside * axis * outside;
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
-        int x = i % inside;
-        int tmp = i / inside;
-        int y = tmp % axis;
-        int z = tmp / axis;
-        int y4 = y / 4;
-        int yR = y % 4;
-        int dstOffset = 4 * (z * axisC4 * inside + y4 * inside + x) + yR;
-        output[dstOffset] = input[i];
-    }
-}
-
-void PackC4(uint8_t* output, const uint8_t* input, int inside, int axis, int outside, int bytes, CUDARuntime* runtime) {
-    auto packAxis = (axis + 3) / 4;
-    if (axis % 4 != 0) {
-        runtime->memset(output, 0, inside * packAxis * 4 * outside * bytes);
-    }
-    int block_num = runtime->blocks_num(inside * axis * outside);
-    int threads_num = runtime->threads_num();
-    switch (bytes) {
-        case 4:
-            pack_c4<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside, packAxis);
-            break;
-        case 2:
-            pack_c4<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output, inside, axis, outside, packAxis);
-            break;
-        case 1:
-            pack_c4<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output, inside, axis, outside, packAxis);
-            break;
-        default:
-            break;
-    }
-}
-
-template <typename T>
-__global__ void unpack_c4(const T *input, T *output, int inside, int axis, int outside, int axisC4) {
-    int total = inside * axis * outside;
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
-        int x = i % inside;
-        int tmp = i / inside;
-        int y = tmp % axis;
-        int z = tmp / axis;
-        int y4 = y / 4;
-        int yR = y % 4;
-        int srcOffset = 4 * (z * axisC4 * inside + y4 * inside + x) + yR;
-        output[i] = input[srcOffset];
-    }
-}
-void UnpackC4(uint8_t* output, const uint8_t* input, int inside, int axis, int outside, int bytes, CUDARuntime* runtime) {
-    auto packAxis = (axis + 3) / 4;
-    int block_num = runtime->blocks_num(inside * axis * outside);
-    int threads_num = runtime->threads_num();
-    switch (bytes) {
-        case 4:
-            unpack_c4<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside, packAxis);
-            break;
-        case 2:
-            unpack_c4<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output, inside, axis, outside, packAxis);
-            break;
-        case 1:
-            unpack_c4<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output, inside, axis, outside, packAxis);
-            break;
-        default:
-            break;
-    }
-}
-
 // Blit don't care offset
 template <typename T>
 __global__ void blitRegion(const T *inputO, T *outputO,
-    int loopCount,
-    const int32_t* dstIndice, const int32_t* srcIndice,
-    int dstUseIndice, int srcUseIndice,
-    int dstStep, int srcStep,int srcLimit,
-    int sizeZ, int sizeY, int sizeX,
-    int strideZ, int strideY, int strideX,
-    int dstStrideZ, int dstStrideY, int dstStrideX
-    ) {
+        int loopCount,
+        const int32_t* dstIndice, const int32_t* srcIndice,
+        int dstUseIndice, int srcUseIndice,
+        int dstStep, int srcStep,int srcLimit,
+        int sizeZ, int sizeY, int sizeX,
+        int strideZ, int strideY, int strideX,
+        int dstStrideZ, int dstStrideY, int dstStrideX
+        ) {
     int total = loopCount;
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
         int srcOffsetO = i * srcStep;
@@ -162,29 +95,66 @@ void BlitWithIndice(uint8_t* output, const uint8_t* input, const int32_t* dstInd
 #define UNARY_FUNC(Name, Func)\
 template<typename T>\
 __global__ void Name(const T *input, T *output,\
-    int sizeZ, int sizeY, int sizeX,\
+    int count,\
+    DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,\
     int strideZ, int strideY, int strideX,\
     int dstStrideZ, int dstStrideY, int dstStrideX\
     ) { \
-  int count = sizeZ * sizeY * sizeX;\
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
-    int total = sizeZ * sizeY * sizeX;\
-    int ix = i % sizeX;\
-    int tmp = i / sizeX;\
-    int iy = tmp % sizeY;\
-    int iz = tmp / sizeY;\
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {\
+    int ix, tmp, iy, iz;\
+    sizeX.divmod(i, tmp, ix);\
+    sizeY.divmod(tmp, iz, iy);\
    int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
    int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
    T x = input[srcOffset];\
    output[dstOffset] = Func;\
  }\
}\
+template<typename T>\
+__global__ void FLOAT##Name(const T *input, T *output,\
+    int count,\
+    DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,\
+    int strideZ, int strideY, int strideX,\
+    int dstStrideZ, int dstStrideY, int dstStrideX\
+    ) { \
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {\
+    int ix, tmp, iy, iz;\
+    sizeX.divmod(i, tmp, ix);\
+    sizeY.divmod(tmp, iz, iy);\
+    int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
+    int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
+    float x = (float)input[srcOffset];\
+    output[dstOffset] = (float)(Func);\
+  }\
+}\
+
+template <typename T>
+__global__ void blit_2(const T *input, T *output,
+    int count,
+    DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
+    int strideZ, int strideY,
+    int dstStrideZ, int dstStrideY
+    ) {
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
        int ix, tmp, iy, iz;
        sizeX.divmod(i, tmp, ix);
        sizeY.divmod(tmp, iz, iy);
        int srcOffset = iz * strideZ + iy * strideY + (ix << 1);
        int dstOffset = iz * dstStrideZ + iy * dstStrideY + (ix << 1);
        int2 * dstF = (int2 *)(output+dstOffset);
        dstF[0] = ((int2 *)(input+srcOffset))[0];
    }
}

+struct Bytes512 {
+    int4 x[4];
+};
 UNARY_FUNC(blit, x);
 UNARY_FUNC(ABS, abs(x));
 UNARY_FUNC(EXP, exp(x));
 UNARY_FUNC(NEG, -x);
-UNARY_FUNC(RECIPROCAL, (T)(1.0)/x);
+UNARY_FUNC(RECIPROCAL, (1.0)/x);
 UNARY_FUNC(FLOOR, floor(x));
 UNARY_FUNC(CEIL, ceil(x));
 UNARY_FUNC(SQUARE, x*x);
@@ -212,27 +182,68 @@
 UNARY_FUNC(HARDSWISH, 1.0/6.0 * x * min(max(x+3.0, 0.0), 6.0));
 UNARY_FUNC(ERF, erf(x));
 UNARY_FUNC(ERFC, erfc(x));
 UNARY_FUNC(ERFINV, erfinv(x));
+UNARY_FUNC(GELU, (1.0f + tanh(0.79788458f * (0.044715f * x * x * x + x))) * x * 0.5f);
+UNARY_FUNC(GELU_STANDARD, (erf(x*0.7071067932881648f)+1.f)*x*0.5);
 
 void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime) {
     int count = size[0] * size[1] * size[2];
+
+    DivModFast sz(size[0]);
+    DivModFast sy(size[1]);
+    DivModFast sx(size[2]);
+
+    //printf("%d-%d-%d, %d-%d-%d,-%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
+    if(bytes == 4 && count > 16384 && size[2] % 2 == 0 && srcStride[2] == 1 && dstStride[2] == 1) {
+        //printf("%d-%d-%d, %d-%d-%d,-%d-%d-%d\n\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
+        count /= 2;
+        int block_num = runtime->blocks_num(count);
+        int threads_num = runtime->threads_num();
+        DivModFast sx_2((size[2]/2));
+
+        blit_2<<<block_num, threads_num>>>((const float*)input, (float*)output,
+            count,
+            sz, sy, sx_2,
+            srcStride[0], srcStride[1],
+            dstStride[0], dstStride[1]);
+        return;
+    }
+
     int block_num = runtime->blocks_num(count);
     int threads_num = runtime->threads_num();
+
     switch (bytes) {
+        case 64:
+            blit<<<block_num, threads_num>>>((const Bytes512*)input, (Bytes512*)output,
+                count,
+                sz, sy, sx,
+                srcStride[0], srcStride[1], srcStride[2],
+                dstStride[0], dstStride[1], dstStride[2]);
+            break;
+        case 32:
+            blit<<<block_num, threads_num>>>((const double4*)input, (double4*)output,
+                count,
+                sz, sy, sx,
+                srcStride[0], srcStride[1], srcStride[2],
+                dstStride[0], dstStride[1], dstStride[2]);
+            break;
         case 4:
             blit<<<block_num, threads_num>>>((const float*)input, (float*)output,
-                size[0], size[1], size[2],
+                count,
+                sz, sy, sx,
                 srcStride[0], srcStride[1], srcStride[2],
                 dstStride[0], dstStride[1], dstStride[2]);
             break;
         case 2:
            blit<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output,
-                size[0], size[1], size[2],
+                count,
+                sz, sy, sx,
                 srcStride[0], srcStride[1], srcStride[2],
                 dstStride[0], dstStride[1], dstStride[2]);
             break;
         case 1:
             blit<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output,
-                size[0], size[1], size[2],
+                count,
+                sz, sy, sx,
                 srcStride[0], srcStride[1], srcStride[2],
                 dstStride[0], dstStride[1], dstStride[2]);
             break;
@@ -241,59 +252,131 @@
     }
 }
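Editor's illustration, not part of the patch: blit_2, the double4 case, and Bytes512 above all widen the element type so each memory instruction moves more bytes. The trick in isolation, with its alignment requirement spelled out (kernel name is this sketch's):

    // Copying through int4 moves 16 bytes per transaction, but both pointers
    // must be 16-byte aligned and the float count divisible by 4.
    __global__ void copyFloat4(const float* src, float* dst, int count4) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < count4) {
            ((int4*)dst)[i] = ((const int4*)src)[i]; // 4 floats at once
        }
    }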
-template <typename T>
-__global__ void fuseblit(const T *input, T *output,
-    int fuseNum, const int32_t* sliceOffset,
-    int sizeZ, int sizeY, int sizeX,
-    int strideZ, int strideY, int strideX,
-    int dstStrideZ, int dstStrideY, int dstStrideX
-    ) {
-    int count = fuseNum*sizeZ * sizeY * sizeX;
-
-    for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < (count); c += blockDim.x * gridDim.x) {
-        int j = c / (sizeZ * sizeY * sizeX);
-        int i = c % (sizeZ * sizeY * sizeX);
-        int ix = i % sizeX;
-        int tmp = i / sizeX;
-        int iy = tmp % sizeY;
-        int iz = tmp / sizeY;
+template <typename T0, typename T1>
+__global__ void fuseblit(const T0 *input, T1 *output,
+    int fuseNum, int count, const int32_t* sliceOffset,
+    DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
+    int strideZ, int strideY, int strideX,
+    int dstStrideZ, int dstStrideY, int dstStrideX
+    ) {
+    for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < count; c += blockDim.x * gridDim.x) {
+        int ix, tmp, iy, tmp2, iz, j;
+        sizeX.divmod(c, tmp, ix);
+        sizeY.divmod(tmp, tmp2, iy);
+        sizeZ.divmod(tmp2, j, iz);
         int src_offset = sliceOffset[j] + iz * strideZ + iy * strideY + ix * strideX;
         int dst_offset = sliceOffset[fuseNum+j] + iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;
         output[dst_offset] = input[src_offset];
     }
+}
+template <typename T0, typename T1>
+__global__ void fuseblit_4(const T0 *input, T1 *output,
+    int fuseNum, int count, const int32_t* sliceOffset,
+    DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
+    int strideZ, int strideY,
+    int dstStrideZ, int dstStrideY
+    ) {
+    for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < count; c += blockDim.x * gridDim.x) {
+        int ix, tmp, iy, tmp2, iz, j;
+        sizeX.divmod(c, tmp, ix);
+        sizeY.divmod(tmp, tmp2, iy);
+        sizeZ.divmod(tmp2, j, iz);
+        int src_offset = sliceOffset[j] + iz * strideZ + iy * strideY + (ix << 2);
+        int dst_offset = sliceOffset[fuseNum+j] + iz * dstStrideZ + iy * dstStrideY + (ix << 2);
+        int4* srcF = (int4 *)(input + src_offset);
+        int4* dstF = (int4 *)(output + dst_offset);
+        dstF[0] = srcF[0];
+    }
+}
+template <typename T0, typename T1>
+__global__ void fuseblit_half_4(const T0 *input, T1 *output,
+    int fuseNum, int count, const int32_t* sliceOffset,
+    DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
+    int strideZ, int strideY,
+    int dstStrideZ, int dstStrideY
+    ) {
+    for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < count; c += blockDim.x * gridDim.x) {
+        int ix, tmp, iy, tmp2, iz, j;
+        sizeX.divmod(c, tmp, ix);
+        sizeY.divmod(tmp, tmp2, iy);
+        sizeZ.divmod(tmp2, j, iz);
+        int src_offset = sliceOffset[j] + iz * strideZ + iy * strideY + (ix << 2);
+        int dst_offset = sliceOffset[fuseNum+j] + iz * dstStrideZ + iy * dstStrideY + (ix << 2);
+        int2* srcF = (int2 *)(input + src_offset);
+        int2* dstF = (int2 *)(output + dst_offset);
+        dstF[0] = srcF[0];
+    }
 }
 void FuseRasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int fuseNum, void* sliceOffset, int bytes, CUDARuntime* runtime) {
-    int count = size[0] * size[1] * size[2];
+    DivModFast sz(size[0]);
+    DivModFast sy(size[1]);
+    DivModFast sx(size[2]);
+
+    int count = fuseNum * size[0] * size[1] * size[2];
+    if(size[2] % 4 == 0 && count > 16384 && srcStride[2] == 1 && dstStride[2] == 1) {
+        //printf("%d-%d-%d, %d-%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], dstStride[0], dstStride[1]);
+        int count = fuseNum * size[0] * size[1] * size[2] / 4;
+        int numBlocks = runtime->blocks_num(count);
+        int threadsPerBlock = runtime->threads_num();
+        DivModFast sx_4((size[2]/4));
+
+        if(bytes == 4) {
+            fuseblit_4<<<numBlocks, threadsPerBlock>>>((const float*)input, (float*)output,
+                fuseNum, count, (const int32_t*)sliceOffset,
+                sz, sy, sx_4,
+                srcStride[0], srcStride[1],
+                dstStride[0], dstStride[1]);
+            return;
+        } else if(bytes == 2){
+            fuseblit_half_4<<<numBlocks, threadsPerBlock>>>((const half*)input, (half*)output,
+                fuseNum, count, (const int32_t*)sliceOffset,
+                sz, sy, sx_4,
+                srcStride[0], srcStride[1],
+                dstStride[0], dstStride[1]);
+            return;
+        }
+    }
+
     int block_num = runtime->blocks_num(count);
     int threads_num = runtime->threads_num();
-    int numBlocks = block_num;
-    int threadsPerBlock = threads_num;
-    // dim3 numBlocks(block_num, fuseNum);
-    // dim3 threadsPerBlock(threads_num, 1);
-
     switch (bytes) {
+        case 64:
+            fuseblit<<<block_num, threads_num>>>((const Bytes512*)input, (Bytes512*)output,
+                fuseNum, count, (const int32_t*)sliceOffset,
+                sz, sy, sx,
+                srcStride[0], srcStride[1], srcStride[2],
+                dstStride[0], dstStride[1], dstStride[2]);
+            break;
+        case 16:
+            fuseblit<<<block_num, threads_num>>>((const int4*)input, (int4*)output,
+                fuseNum, count, (const int32_t*)sliceOffset,
+                sz, sy, sx,
+                srcStride[0], srcStride[1], srcStride[2],
+                dstStride[0], dstStride[1], dstStride[2]);
+            break;
         case 4:
-            fuseblit<<<numBlocks, threadsPerBlock>>>((const float*)input, (float*)output,
-                fuseNum, (const int32_t*)sliceOffset,
-                size[0], size[1], size[2],
+            fuseblit<<<block_num, threads_num>>>((const float*)input, (float*)output,
+                fuseNum, count, (const int32_t*)sliceOffset,
+                sz, sy, sx,
                 srcStride[0], srcStride[1], srcStride[2],
                 dstStride[0], dstStride[1], dstStride[2]);
             break;
         case 2:
-            fuseblit<<<numBlocks, threadsPerBlock>>>((const int16_t*)input, (int16_t*)output,
-                fuseNum, (const int32_t*)sliceOffset,
-                size[0], size[1], size[2],
+            fuseblit<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output,
+                fuseNum, count, (const int32_t*)sliceOffset,
+                sz, sy, sx,
                 srcStride[0], srcStride[1], srcStride[2],
                 dstStride[0], dstStride[1], dstStride[2]);
             break;
         case 1:
-            fuseblit<<<numBlocks, threadsPerBlock>>>((const int8_t*)input, (int8_t*)output,
-                fuseNum, (const int32_t*)sliceOffset,
-                size[0], size[1], size[2],
+            fuseblit<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output,
+                fuseNum, count, (const int32_t*)sliceOffset,
+                sz, sy, sx,
                 srcStride[0], srcStride[1], srcStride[2],
                 dstStride[0], dstStride[1], dstStride[2]);
            break;
@@ -303,18 +386,112 @@ void FuseRasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size,
     //printf("%s, %d-%d-%d-%d\n", cudaGetErrorString(cudaGetLastError()), numBlocks.x, numBlocks.y, threadsPerBlock.x, threadsPerBlock.y);
 }
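Editor's illustration, not part of the patch: FuseRasterBlit batches many raster regions into a single launch by indexing a per-region offset table. Per the fuseblit indexing above, entry j is region j's source offset and entry fuseNum + j its destination offset. A host-side sketch of building that table (variable names are this sketch's):

    #include <cstdint>
    #include <vector>

    std::vector<int32_t> buildSliceOffsets(const std::vector<int32_t>& srcOffsets,
                                           const std::vector<int32_t>& dstOffsets) {
        int fuseNum = (int)srcOffsets.size();
        std::vector<int32_t> table(2 * fuseNum);
        for (int j = 0; j < fuseNum; ++j) {
            table[j] = srcOffsets[j];           // read side for region j
            table[fuseNum + j] = dstOffsets[j]; // write side for region j
        }
        return table; // upload with cudaMemcpy before launching fuseblit
    }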
+template <typename T0, typename T1>
+__global__ void fuseblitLimit(const T0 *input, T1 *output,
+    const FuseRegion* info, const int32_t* sliceOffset
+    ) {
+    int sizeZ = info->size[0];
+    int sizeY = info->size[1];
+    int sizeX = info->size[2];
+    int strideZ = info->srcStride[0];
+    int strideY = info->srcStride[1];
+    int strideX = info->srcStride[2];
+    int dstStrideZ = info->dstStride[0];
+    int dstStrideY = info->dstStride[1];
+    int dstStrideX = info->dstStride[2];
+    int fuseNum = info->fuseNumber;
+
+    int count = fuseNum*sizeZ * sizeY * sizeX;
+
+    for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < (count); c += blockDim.x * gridDim.x) {
+        int j = c / (sizeZ * sizeY * sizeX);
+        int i = c % (sizeZ * sizeY * sizeX);
+        int ix = i % sizeX;
+        int tmp = i / sizeX;
+        int iy = tmp % sizeY;
+        int iz = tmp / sizeY;
+        const int* srcOffsetPtr = sliceOffset + 8 * j;
+        const int* dstOffsetPtr = sliceOffset + 8 * j + 4;
+        T0 srcValue = (T0)0;
+        int src_offset = srcOffsetPtr[3] + iz * strideZ + iy * strideY + ix * strideX;
+        if (srcOffsetPtr[0] > iz && srcOffsetPtr[1] > iy && srcOffsetPtr[2] > ix) {
+            srcValue = input[src_offset];
+        }
+        int dst_offset = dstOffsetPtr[3] + iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;
+        //printf("%d -> %d - %f\n", src_offset, dst_offset, srcValue);
+        if (dstOffsetPtr[0] > iz && dstOffsetPtr[1] > iy && dstOffsetPtr[2] > ix) {
+            output[dst_offset] = srcValue;
+        }
+    }
+}
+void FuseRasterBlitFloatToHalf(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime) {
+    auto& prop = runtime->prop();
+    int threads_num = prop.maxThreadsPerBlock;
+    int block_num = prop.multiProcessorCount;
+    fuseblitLimit<<<block_num, threads_num>>>((const float*)input, (half*)output,
+        info, (const int32_t*)sliceOffset);
+}
+void FuseRasterBlitHalfToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime) {
+    auto& prop = runtime->prop();
+    int threads_num = prop.maxThreadsPerBlock;
+    int block_num = prop.multiProcessorCount;
+    fuseblitLimit<<<block_num, threads_num>>>((const half*)input, (float*)output,
+        info, (const int32_t*)sliceOffset);
+}
+void FuseRasterBlitFloatToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime) {
+    auto& prop = runtime->prop();
+    int threads_num = prop.maxThreadsPerBlock;
+    int block_num = prop.multiProcessorCount;
+    fuseblitLimit<<<block_num, threads_num>>>((const float*)input, (float*)output,
+        info, (const int32_t*)sliceOffset);
+}
+
+void FuseRasterBlitCommon(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime, int bytes) {
+    auto& prop = runtime->prop();
+    int threads_num = prop.maxThreadsPerBlock;
+    int block_num = prop.multiProcessorCount;
+    switch (bytes) {
+        case 4:
+            fuseblitLimit<<<block_num, threads_num>>>((const float*)input, (float*)output,
+                info, (const int32_t*)sliceOffset);
+            break;
+        case 2:
+            fuseblitLimit<<<block_num, threads_num>>>((const half*)input, (half*)output,
+                info, (const int32_t*)sliceOffset);
+            break;
+        case 1:
+            fuseblitLimit<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output,
+                info, (const int32_t*)sliceOffset);
+            break;
+        default:
+            break;
+    }
+}
+
 void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
     int count = size[0] * size[1] * size[2];
     int block_num = runtime->blocks_num(count);
     int threads_num = runtime->threads_num();
+    DivModFast sz(size[0]);
+    DivModFast sy(size[1]);
+    DivModFast sx(size[2]);
     // TODO: Support FP16
-    MNN_ASSERT(bytes==4);
 #define COMPUTE(TYPE)\
     if (opType == MNN::UnaryOpOperation_##TYPE ) {\
+        if(bytes==2) {\
+            FLOAT##TYPE<<<block_num, threads_num>>>((const half*)input, (half*)output,\
+                count, \
+                sz, sy, sx,\
                srcStride[0], srcStride[1], srcStride[2],\
                dstStride[0], dstStride[1], dstStride[2]);\
+        } else {\
+            TYPE<<<block_num, threads_num>>>((const float*)input, (float*)output,\
+                count, \
+                sz, sy, sx,\
+                srcStride[0], srcStride[1], srcStride[2],\
+                dstStride[0], dstStride[1], dstStride[2]);\
+        }\
         return;\
     }\
 
@@ -330,6 +507,8 @@ void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const
     COMPUTE(SIN);
     COMPUTE(COS);
     COMPUTE(TAN);
+    COMPUTE(GELU);
+    COMPUTE(GELU_STANDARD);
     COMPUTE(ASIN);
     COMPUTE(ACOS);
     COMPUTE(ATAN);
@@ -356,26 +535,126 @@ void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const
 #define BINARY_FUNC(Name, Func)\
 template<typename TIn, typename TOut>\
 __global__ void Binary##Name(\
-    const TIn *input0, const TIn* input1, TOut *output,\
-    int sizeZ, int sizeY, int sizeX,\
-    int strideZ, int strideY, int strideX,\
-    int strideZ1, int strideY1, int strideX1,\
-    int dstStrideZ, int dstStrideY, int dstStrideX\
-    ) { \
-  int count = sizeZ * sizeY * sizeX;\
-  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
-    int total = sizeZ * sizeY * sizeX;\
-    int ix = i % sizeX;\
-    int tmp = i / sizeX;\
-    int iy = tmp % sizeY;\
-    int iz = tmp / sizeY;\
-    int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
-    int srcOffset1 = iz * strideZ1 + iy * strideY1 + ix * strideX1;\
-    int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
-    TIn x = input0[srcOffset];\
-    TIn y = input1[srcOffset1];\
-    output[dstOffset] = (TOut)Func;\
-  }\
+    const TIn *input0, const TIn* input1, TOut *output,\
+    int sizeZ, int sizeY, int sizeX,\
+    int strideZ, int strideY, int strideX,\
+    int strideZ1, int strideY1, int strideX1,\
+    int dstStrideZ, int dstStrideY, int dstStrideX\
+    ) { \
+    int count = sizeZ * sizeY * sizeX;\
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
+        int total = sizeZ * sizeY * sizeX;\
+        int ix = i % sizeX;\
+        int tmp = i / sizeX;\
+        int iy = tmp % sizeY;\
+        int iz = tmp / sizeY;\
+        int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
+        int srcOffset1 = iz * strideZ1 + iy * strideY1 + ix * strideX1;\
+        int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
+        TIn x = input0[srcOffset];\
+        TIn y = input1[srcOffset1];\
+        output[dstOffset] = (TOut)Func;\
+    }\
+}\
+
+#define BINARY_FUNC_FLOATMID(Name, Func)\
+template<typename TIn, typename TOut>\
+__global__ void BinaryMid##Name(\
+    const TIn *input0, const TIn* input1, TOut *output,\
+    int sizeZ, int sizeY, int sizeX,\
+    int strideZ, int strideY, int strideX,\
+    int strideZ1, int strideY1, int strideX1,\
+    int dstStrideZ, int dstStrideY, int dstStrideX\
+    ) { \
+    int count = sizeZ * sizeY * sizeX;\
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
+        int total = sizeZ * sizeY * sizeX;\
+        int ix = i % sizeX;\
+        int tmp = i / sizeX;\
+        int iy = tmp % sizeY;\
+        int iz = tmp / sizeY;\
+        int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
+        int srcOffset1 = iz * strideZ1 + iy * strideY1 + ix * strideX1;\
+        int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
+        float x = input0[srcOffset];\
+        float y = input1[srcOffset1];\
+        output[dstOffset] = (TOut)(Func);\
+    }\
+}\
+template<typename TIn, typename TOut>\
+__global__ void BinaryMidLinear##Name(\
+    const TIn *input0, const TIn* input1, TOut *output,\
+    int sizeZ,\
+    int strideZ,\
+    int strideZ1,\
+    int dstStrideZ\
+    ) { \
+    int count = sizeZ;\
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
+        int iz = i;\
+        int srcOffset = iz * strideZ;\
+        int srcOffset1 = iz * strideZ1;\
+        int dstOffset = iz * dstStrideZ;\
+        float x = input0[srcOffset];\
+        float y = input1[srcOffset1];\
+        output[dstOffset] = (TOut)(Func);\
+    }\
+}\
+
+#define BINARY_FUNC_FLOATMID4(Name, Func)\
+template<typename TIn, typename TOut>\
+__global__ void BinaryMidLinear4_##Name(\
+    const TIn *input0, const TIn* input1, TOut *output,\
+    int count_4\
+    ) { \
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count_4); i += blockDim.x * gridDim.x) {\
+        int iz = i;\
+        int srcOffset = iz << 2;\
+        int srcOffset1 = iz << 2;\
+        int dstOffset = iz << 2;\
+        float4 xx = ((float4 *)(input0+srcOffset))[0];\
+        float4 yy = ((float4 *)(input1+srcOffset1))[0];\
+        float x = xx.x;\
+        float y = yy.x;\
+        output[dstOffset] = (TOut)(Func);\
+        x = xx.y;\
+        y = yy.y;\
+        output[dstOffset+1] = (TOut)(Func);\
+        x = xx.z;\
+        y = yy.z;\
+        output[dstOffset+2] = (TOut)(Func);\
+        x = xx.w;\
+        y = yy.w;\
+        output[dstOffset+3] = (TOut)(Func);\
+    }\
+}\
+template<typename TIn, typename TOut>\
+__global__ void BinaryMidLinearHalf4_##Name(\
+    const TIn *input0, const TIn* input1, TOut *output,\
+    int count_4\
+    ) { \
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count_4); i += blockDim.x * gridDim.x) {\
+        int iz = i;\
+        int srcOffset = iz << 2;\
+        int srcOffset1 = iz << 2;\
+        int dstOffset = iz << 2;\
+        half2 xx = ((half2 *)(input0+srcOffset))[0];\
+        half2 yy = ((half2 *)(input1+srcOffset1))[0];\
+        float x = (float)xx.x;\
+        float y = (float)yy.x;\
+        output[dstOffset] = (TOut)(Func);\
+        x = (float)xx.y;\
+        y = (float)yy.y;\
+        output[dstOffset+1] = (TOut)(Func);\
+        xx = ((half2 *)(input0+srcOffset))[1];\
+        yy = ((half2 *)(input1+srcOffset1))[1];\
+        x = (float)xx.x;\
+        y = (float)yy.x;\
+        output[dstOffset+2] = (TOut)(Func);\
+        x = (float)xx.y;\
+        y = (float)yy.y;\
+        output[dstOffset+3] = (TOut)(Func);\
+    }\
}\
 
 #define sign(y) ((y) > 0 ? 1 : ((y) < 0 ? -1 : 0))
@@ -398,44 +677,107 @@
 BINARY_FUNC(FLOORMOD, x - floor(x / y) * y);
 BINARY_FUNC(SquaredDifference, (x-y)*(x-y));
 BINARY_FUNC(POW, pow(x, y));
 BINARY_FUNC(ATAN2, atan2(x, y));
-BINARY_FUNC(MOD, x - x / y);
+BINARY_FUNC(MOD, (x % y));
 BINARY_FUNC(LOGICALOR, (x || y) ? 1 : 0);
 
-void BinaryBlitTemplateFloat(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
+BINARY_FUNC_FLOATMID(ADD, x+y);
+BINARY_FUNC_FLOATMID(SUB, x-y);
+BINARY_FUNC_FLOATMID(MUL, x*y);
+BINARY_FUNC_FLOATMID(DIV, x/y);
+BINARY_FUNC_FLOATMID(REALDIV, (float)sign(y) * x / max(abs(y), 0.0000001));
+BINARY_FUNC_FLOATMID(MINIMUM, min(x, y));
+BINARY_FUNC_FLOATMID(MAXIMUM, max(x, y));
+BINARY_FUNC_FLOATMID(GREATER, x > y ? 1 : 0);
+BINARY_FUNC_FLOATMID(LESS, x < y ? 1 : 0);
+BINARY_FUNC_FLOATMID(LESS_EQUAL, x <= y ? 1 : 0);
+BINARY_FUNC_FLOATMID(GREATER_EQUAL, x >= y ? 1 : 0);
+BINARY_FUNC_FLOATMID(EQUAL, x == y ? 1 : 0);
+BINARY_FUNC_FLOATMID(NOTEQUAL, x != y ? 1 : 0);
+BINARY_FUNC_FLOATMID(FLOORDIV, floor(x / y));
+BINARY_FUNC_FLOATMID(FLOORMOD, x - floor(x / y) * y);
+BINARY_FUNC_FLOATMID(SquaredDifference, (x-y)*(x-y));
+BINARY_FUNC_FLOATMID(POW, pow(x, y));
+BINARY_FUNC_FLOATMID(ATAN2, atan2(x, y));
+BINARY_FUNC_FLOATMID(MOD, fmod(x, y));
+BINARY_FUNC_FLOATMID(LOGICALOR, (x || y) ? 1 : 0);
+
+BINARY_FUNC_FLOATMID4(ADD, x+y);
+BINARY_FUNC_FLOATMID4(SUB, x-y);
+BINARY_FUNC_FLOATMID4(MUL, x*y);
+BINARY_FUNC_FLOATMID4(DIV, x/y);
+BINARY_FUNC_FLOATMID4(REALDIV, (float)sign(y) * x / max(abs(y), 0.0000001));
+BINARY_FUNC_FLOATMID4(MINIMUM, min(x, y));
+BINARY_FUNC_FLOATMID4(MAXIMUM, max(x, y));
+BINARY_FUNC_FLOATMID4(GREATER, x > y ? 1 : 0);
+BINARY_FUNC_FLOATMID4(LESS, x < y ? 1 : 0);
+BINARY_FUNC_FLOATMID4(LESS_EQUAL, x <= y ? 1 : 0);
+BINARY_FUNC_FLOATMID4(GREATER_EQUAL, x >= y ? 1 : 0);
+BINARY_FUNC_FLOATMID4(EQUAL, x == y ? 1 : 0);
+BINARY_FUNC_FLOATMID4(NOTEQUAL, x != y ? 1 : 0);
+BINARY_FUNC_FLOATMID4(FLOORDIV, floor(x / y));
+BINARY_FUNC_FLOATMID4(FLOORMOD, x - floor(x / y) * y);
+BINARY_FUNC_FLOATMID4(SquaredDifference, (x-y)*(x-y));
+BINARY_FUNC_FLOATMID4(POW, pow(x, y));
+BINARY_FUNC_FLOATMID4(ATAN2, atan2(x, y));
+BINARY_FUNC_FLOATMID4(MOD, fmod(x, y));
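Editor's illustration, not part of the patch: the *_FLOATMID variants above load half operands, compute in float, and cast back on store. A scalar reference for the REALDIV expression, and why the float intermediate matters (function name is this sketch's): the 1e-7 guard is below half's smallest normal (about 6.1e-5), so the same guard evaluated in fp16 would round differently.

    #include <algorithm>
    #include <cmath>

    // Scalar reference for REALDIV: sign(y) * x / max(|y|, 1e-7), in float.
    inline float realDiv(float x, float y) {
        float s = (float)((y > 0.f) - (y < 0.f)); // sign(y)
        return s * x / std::max(std::fabs(y), 0.0000001f);
    }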
1 : 0); + +template +void BinaryBlitTemplateFloat(T* output, const T* input, const T* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) { int count = size[0] * size[1] * size[2]; int block_num = runtime->blocks_num(count); int threads_num = runtime->threads_num(); - // TODO: Support FP16 - MNN_ASSERT(bytes==4); #define COMPUTE_FLOAT(TYPE, TOut)\ - if (opType == MNN::BinaryOpOperation_##TYPE ) {\ - Binary##TYPE<<>>((const float*)input, (const float*)(input1), (TOut*)output,\ - size[0], size[1], size[2],\ - srcStride[0], srcStride[1], srcStride[2],\ - srcStride1[0], srcStride1[1], srcStride1[2],\ - dstStride[0], dstStride[1], dstStride[2]);\ - return;\ - }\ + if (opType == MNN::BinaryOpOperation_##TYPE ) {\ + if (size[2] == count) {\ + if(count % 4 == 0 && count > 16384 && srcStride[2] == 1 && srcStride1[2] == 1 && dstStride[2] == 1) {\ + block_num = runtime->blocks_num(count/4);\ + threads_num = runtime->threads_num();\ + if(bytes == 4) {\ + BinaryMidLinear4_##TYPE<<>>((const T*)input, (const T*)(input1), (TOut*)output,\ + count/4);\ + } else {\ + BinaryMidLinearHalf4_##TYPE<<>>((const T*)input, (const T*)(input1), (TOut*)output,\ + count/4);\ + }\ + } else {\ + BinaryMidLinear##TYPE<<>>((const T*)input, (const T*)(input1), (TOut*)output,\ + size[2],\ + srcStride[2],\ + srcStride1[2],\ + dstStride[2]);\ + }\ + } else {\ + BinaryMid##TYPE<<>>((const T*)input, (const T*)(input1), (TOut*)output,\ + size[0], size[1], size[2],\ + srcStride[0], srcStride[1], srcStride[2],\ + srcStride1[0], srcStride1[1], srcStride1[2],\ + dstStride[0], dstStride[1], dstStride[2]);\ + }\ + return;\ + }\ - COMPUTE_FLOAT(ADD, float); - COMPUTE_FLOAT(SUB, float); - COMPUTE_FLOAT(MUL, float); - COMPUTE_FLOAT(DIV, float); - COMPUTE_FLOAT(REALDIV, float); - COMPUTE_FLOAT(MINIMUM, float); - COMPUTE_FLOAT(MAXIMUM, float); + COMPUTE_FLOAT(ADD, T); + COMPUTE_FLOAT(SUB, T); + COMPUTE_FLOAT(MUL, T); + COMPUTE_FLOAT(DIV, T); + COMPUTE_FLOAT(REALDIV, T); + COMPUTE_FLOAT(MINIMUM, T); + COMPUTE_FLOAT(MAXIMUM, T); COMPUTE_FLOAT(GREATER, int); COMPUTE_FLOAT(LESS, int); COMPUTE_FLOAT(LESS_EQUAL, int); COMPUTE_FLOAT(GREATER_EQUAL, int); COMPUTE_FLOAT(EQUAL, int); COMPUTE_FLOAT(NOTEQUAL, int); - COMPUTE_FLOAT(FLOORDIV, float); - COMPUTE_FLOAT(FLOORMOD, float); - COMPUTE_FLOAT(POW, float); - COMPUTE_FLOAT(SquaredDifference, float); - COMPUTE_FLOAT(ATAN2, float); - COMPUTE_FLOAT(MOD, float); + COMPUTE_FLOAT(FLOORDIV, T); + COMPUTE_FLOAT(FLOORMOD, T); + COMPUTE_FLOAT(POW, T); + COMPUTE_FLOAT(SquaredDifference, T); + COMPUTE_FLOAT(ATAN2, T); + COMPUTE_FLOAT(MOD, T); + + #undef COMPUTE_FLOAT } void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) { @@ -472,12 +814,15 @@ void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_ void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType) { if (type.code == halide_type_float) { - BinaryBlitTemplateFloat(output, input, input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType); + if (type.bits == 32) { + BinaryBlitTemplateFloat((float*)output, (float*)input, (float*)input1, size, srcStride, srcStride1, 
dstStride, type.bytes(), runtime, opType);
+        } else if (type.bits == 16) {
+            BinaryBlitTemplateFloat((half*)output, (half*)input, (half*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
+        }
     } else if (type.code == halide_type_int) {
         BinaryBlitTemplateInt32(output, input, input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
     }
 }
-
 }// namespace CUDA
 }// namespace MNN
diff --git a/source/backend/cuda/execution/Raster.cuh b/source/backend/cuda/execution/Raster.cuh
index 701aee72..b03be095 100644
--- a/source/backend/cuda/execution/Raster.cuh
+++ b/source/backend/cuda/execution/Raster.cuh
@@ -6,11 +6,22 @@ namespace MNN {
 namespace CUDA {
     void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime);
     void FuseRasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int fuseNum, void* sliceOffset, int bytes, CUDARuntime* runtime);
-    void PackC4(uint8_t* dest, const uint8_t* src, int inside, int axis, int outside, int bytes, CUDARuntime* runtime);
-    void UnpackC4(uint8_t* dest, const uint8_t* src, int inside, int axis, int outside, int bytes, CUDARuntime* runtime);
     void BlitWithIndice(uint8_t* dest, const uint8_t* src, const int32_t* dstIndices, const int32_t* srcIndices, int dstUseIndice, int srcUseIndice, int loopCount, int dstStep, int srcStep, int srcLimit, const Tensor::InsideDescribe::Region& reg, int bytes, CUDARuntime* runtime);
     void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType);
     void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType);
+
+    // Offset: 8 * fuseNum, first 4 for src: limitX, limitY, limitZ, offset, second 4 for dst
+    struct FuseRegion {
+        int32_t size[3] = {1, 1, 1};
+        int32_t srcStride[3] = {0, 0, 0};
+        int32_t dstStride[3] = {0, 0, 0};
+        int fuseNumber = 0;
+    };
+    void FuseRasterBlitFloatToHalf(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime);
+    void FuseRasterBlitHalfToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime);
+    void FuseRasterBlitFloatToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime);
+    void FuseRasterBlitCommon(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime, int bytes);
+
 }
 }
diff --git a/source/backend/cuda/execution/RasterExecution.cpp b/source/backend/cuda/execution/RasterExecution.cpp
index 92fba702..91946914 100644
--- a/source/backend/cuda/execution/RasterExecution.cpp
+++ b/source/backend/cuda/execution/RasterExecution.cpp
@@ -2,35 +2,305 @@
 //  RasterExecution.cpp
 //  MNN
 //
-//  Created by MNN on 2020/07/30.
+//  Created by MNN on 2020/04/02.
// Copyright © 2018, Alibaba Group Holding Limited // #include "RasterExecution.hpp" -#include "Raster.cuh" -#include "core/Concurrency.h" #include "core/OpCommonUtils.hpp" +#include "core/BufferAllocator.hpp" +#include "Raster.cuh" +#include "Transpose.cuh" +#include "MNNCUDADefine.hpp" namespace MNN { namespace CUDA { -ErrorCode RasterExecution::onResize(const std::vector& inputs, const std::vector& outputs) { +static void getBatchChannelArea(const Tensor* t, int& batch, int& channel, int& area) { + batch = t->batch(); + if (t->dimensions() == 4) { + channel = t->channel(); + area = t->width() * t->height(); + } else if (t->dimensions() == 3) { + auto format = TensorUtils::getDescribe(t)->dimensionFormat; + if (format == MNN_DATA_FORMAT_NHWC) { + channel = t->length(2); + area = t->length(1); + } else { + channel = t->length(1); + area = t->length(2); + } + } else { + auto format = TensorUtils::getDescribe(t)->dimensionFormat; + if (format == MNN_DATA_FORMAT_NHWC) { + for (int i = t->dimensions() - 1; i > 0; i--) { + int len = t->length(i); + if (len > 1) { + if (channel == 1) { + channel = len; + } else { + area *= len; + } + } + } + } else { + for (int i = 1; i < t->dimensions(); i++) { + int len = t->length(i); + if (len > 1) { + if (channel == 1) { + channel = len; + } else { + area *= len; + } + } + } + } + } +} +// Detect if the region is a transpose +static bool _transpose(const Tensor::InsideDescribe::Region& region) { + int srcOne = -1, dstOne = -1; + for (int i = 0; i < 3; i++) { + if (region.src.stride[i] == 1 && region.size[i] != 1) { + if (srcOne >= 0 || region.size[i] < 4) { + return false; + } + srcOne = i; + } + if (region.dst.stride[i] == 1 && region.size[i] != 1) { + if (dstOne >= 0 || region.size[i] < 4) { + return false; + } + dstOne = i; + } + } + return srcOne >= 0 && dstOne >= 0 && srcOne != dstOne; +} + +static int _singleConvert(const Tensor::InsideDescribe::Region& region, const Tensor* dest) { + auto origin = region.origin; + auto srcFormat = TensorUtils::getDescribe(origin)->dimensionFormat; + auto dstFormat = TensorUtils::getDescribe(dest)->dimensionFormat; + if (srcFormat == dstFormat) { + return 0; + } + if (0 != region.src.offset || 0 != region.dst.offset) { + return 0; + } + int dstBatch = 1, dstChannel = 1, dstArea = 1, + srcBatch = 1, srcChannel = 1, srcArea = 1; + getBatchChannelArea(origin, srcBatch, srcChannel, srcArea); + getBatchChannelArea(dest, dstBatch, dstChannel, dstArea); + if (dstBatch != srcBatch) { + return 0; + } + if (dstChannel != srcChannel) { + return 0; + } + if (dstArea != srcArea) { + return 0; + } + auto totalSize = dstBatch * dstChannel * dstArea; + int srcSize = 1; + int dstSize = 1; + int res = 1; + for (int i=0; i<3; ++i) { + if (region.size[i] == 1) { + continue; + } + if (region.src.stride[i] != region.dst.stride[i]) { + if (dstArea == 1) { + // Batch / Channel transpose + return 0; + } + res = 2; + } + srcSize += (region.size[i] - 1) * region.src.stride[i]; + dstSize += (region.size[i] - 1) * region.dst.stride[i]; + } + if (srcSize != totalSize || dstSize != totalSize ) { + return 0; + } + // Check If it can be described as NHWC <-> NC4HW4 transpose + if (2 == res) { + int srcChannelStride; + int dstChannelStride; + int srcAreaStride; + int dstAreaStride; + if (MNN_DATA_FORMAT_NC4HW4 == srcFormat) { + srcChannelStride = srcArea; + srcAreaStride = 1; + dstChannelStride = 1; + dstAreaStride = srcChannel; + } else { + srcChannelStride = 1; + srcAreaStride = srcChannel; + dstAreaStride = 1; + dstChannelStride = srcArea; + } + 
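+        // Illustrative example (values are assumptions, not from the patch):
+        // converting NC4HW4 -> NHWC with batch=1, channel=8, area=4 gives
+        // srcChannelStride=4, srcAreaStride=1, dstChannelStride=1, dstAreaStride=8.
+        // The loop below checks that each region extent (batch, channel, area)
+        // carries exactly that stride pair; any mismatch means the region is not a
+        // pure layout convert, so we return 0 and fall back to the generic blit.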
for (int i=0; i<3; ++i) { + if (region.size[i] == 1) { + continue; + } + if (region.size[i] == dstBatch) { + if (region.src.stride[i] != region.dst.stride[i]) { + return 0; + } + continue; + } + if (region.size[i] == srcChannel) { + if (region.src.stride[i] != srcChannelStride || region.dst.stride[i] != dstChannelStride) { + return 0; + } + } + if (region.size[i] == srcArea) { + if (region.src.stride[i] != srcAreaStride || region.dst.stride[i] != dstAreaStride) { + return 0; + } + } + } + return 2; + } + return 1; +} + +ErrorCode RasterExecution::onResize(const std::vector &inputs, const std::vector &outputs) { MNN_ASSERT(inputs.size() == 1); MNN_ASSERT(outputs.size() == 1); - auto input = inputs[0]; - auto output = outputs[0]; - auto des = TensorUtils::getDescribe(input); + auto input = inputs[0]; + auto output = outputs[0]; + auto des = TensorUtils::getDescribe(input); auto outputDes = TensorUtils::getDescribe(output); - mNeedZero = !TensorUtils::regionIsFull(input); - mTempInputCopy.clear(); - + mNeedZero = !TensorUtils::regionIsFull(input); + mZeroPoint = 0; + mTempInput.clear(); + mFastBlit.clear(); mFuseRaster.first = false; - if(des->regions.size() > 1) { - mFuseRaster.first = true; - mFuseRaster.second = des->regions.size(); - auto& slice0 = des->regions[0]; - for (int i = 1; i < des->regions.size(); ++i) { + mTempOutput = nullptr; + auto midFormat = MNN_DATA_FORMAT_NCHW; + mTempInputCopy.clear(); + mOutputPtr = output; + mFast = false; + int pack = PACK_NUMBER; + // all_srcFormat == dstFormat == NC4HW4 : Fast Exe + if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + mFast = true; + for (int i=0; i< des->regions.size(); ++i) { auto& slice = des->regions[i]; - if (slice0.origin->deviceId() != slice.origin->deviceId()) { + if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) { + mFast = false; + break; + } + if (!OpCommonUtils::canBlitFast(slice, output, pack, true)) { + mFast = false; + break; + } + } + if (mFast) { + for (int i=0; i< des->regions.size(); ++i) { + auto& slice = des->regions[i]; + if (slice.origin == nullptr) { + continue; + } + Tensor::InsideDescribe::Region newRegion; + OpCommonUtils::turnToPackRegion(slice, newRegion, output, pack, true); + mFastBlit.emplace_back(std::make_pair(slice.origin, std::move(newRegion))); + } + return NO_ERROR; + } + } + mSingleConvert = 0; + // srcNum == 1 && srcFormat != dstFormat : Single Convert + if (des->regions.size() == 1) { + mSingleConvert = _singleConvert(des->regions[0], output); + if (mSingleConvert > 0) { + return NO_ERROR; + } + } + // Acquire Buffer for temp output + // TODO: optimize it + if (MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat) { + mTempOutput.reset(new Tensor); + TensorUtils::setupTensorInfo(output, mTempOutput.get(), midFormat); + } + if (nullptr != mTempOutput) { + auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC); + if (!res) { + return OUT_OF_MEMORY; + } + mOutputPtr = mTempOutput.get(); + } + // input is NC4HW4 add Convert + std::vector forRelease; + for (int i=0; i< des->regions.size(); ++i) { + auto& slice = des->regions[i]; + auto origin = slice.origin; + if (slice.mask != 0) { + mTempInputCopy.emplace_back(std::make_pair(origin, &slice)); + continue; + } + // if tensor is not NC4HW4 or has been merged, don't need deal + if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) { + mTempInputCopy.emplace_back(std::make_pair(origin, &slice)); + continue; + } + // if NC4HW4's C%4 == 0, change convert to 
transpose and fuse it + if (origin->batch() == 1 && origin->channel() % pack == 0) { + int channel = origin->channel(); + int area = 1; + // conv3d/pool3d will has 5 dims, area = depth * width * height, otherwise area = width * height + for (int d = 2; d < origin->dimensions(); d++) { + area *= origin->length(d); + } + Tensor::InsideDescribe::Region regionTmp; + regionTmp.src.offset = 0; + regionTmp.src.stride[0] = area * pack; + regionTmp.src.stride[1] = 1; + regionTmp.src.stride[2] = pack; + regionTmp.dst.offset = 0; + regionTmp.dst.stride[0] = area * pack; + regionTmp.dst.stride[1] = area; + regionTmp.dst.stride[2] = 1; + regionTmp.size[0] = channel / pack; + regionTmp.size[1] = pack; + regionTmp.size[2] = area; + regionTmp.origin = slice.origin; + bool merge = TensorUtils::fuseRegion(regionTmp, slice); + if (merge) { + // cache the merged tensor + slice.mask = 1; + mTempInputCopy.emplace_back(std::make_pair(origin, &slice)); + continue; + } + } + auto cache = static_cast(backend())->getCache(); + auto tempTensor = cache->findCacheTensor(origin, midFormat); + if (nullptr == tempTensor) { + std::shared_ptr newTensor(new Tensor); + TensorUtils::copyShape(origin, newTensor.get()); + TensorUtils::getDescribe(newTensor.get())->dimensionFormat = midFormat; + newTensor->buffer().type = origin->getType(); + TensorUtils::setLinearLayout(newTensor.get()); + mTempInput.insert(std::make_pair(origin, newTensor.get())); + auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC); + if (!res) { + return OUT_OF_MEMORY; + } + tempTensor = newTensor.get(); + TensorUtils::getDescribe(tempTensor)->useCount = TensorUtils::getDescribe(origin)->useCount; + cache->pushCacheTensor(newTensor, origin, midFormat); + } + if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) { + forRelease.emplace_back(tempTensor); + } + mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice)); + } + if(mTempInputCopy.size() > 1) { + mFuseRaster.first = true; + mFuseRaster.second = mTempInputCopy.size(); + auto& slice0 = *mTempInputCopy[0].second; + for (int i = 1; i < mTempInputCopy.size(); ++i) { + auto& slice = *mTempInputCopy[i].second; + if (mTempInputCopy[i].first != mTempInputCopy[0].first) { mFuseRaster.first = false; break; } @@ -52,81 +322,141 @@ ErrorCode RasterExecution::onResize(const std::vector& inputs, const st } } } - //mFuseRaster.first = false; - if(!mFuseRaster.first) { - for (int i = 0; i < des->regions.size(); ++i) { - auto& slice = des->regions[i]; - if (nullptr == slice.origin) { - continue; - } - mTempInputCopy.emplace_back(std::make_pair((void*)slice.origin->deviceId(), &slice)); - } - } else { - auto& slice0 = des->regions[0]; - if (nullptr != slice0.origin) { - mTempInputCopy.emplace_back(std::make_pair((void*)slice0.origin->deviceId(), &slice0)); - } - - int regionSize = des->regions.size(); + if(mFuseRaster.first) { + auto& slice0 = *mTempInputCopy[0].second; + auto tensor = mTempInputCopy[0].first; + int regionSize = mTempInputCopy.size(); std::vector temp(2*regionSize, 0); for (int i = 0; i < regionSize; ++i) { - auto& slice = des->regions[i]; + auto& slice = *mTempInputCopy[i].second; temp[i] = slice.src.offset; temp[regionSize+i] = slice.dst.offset; - //printf("%d-", tmpSrc[i]); + //printf("%d-%d-%d\n", regionSize, temp[i], temp[regionSize+i]); } //save srcOffset/dstOffset to Device offsetTensor.reset(Tensor::createDevice({2*regionSize})); backend()->onAcquireBuffer(offsetTensor.get(), Backend::STATIC); mOffset = (void *)offsetTensor.get()->buffer().device; 
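    // Layout of the staged offsets built in `temp` above: entries [0, regionSize)
    // hold each region's src.offset and entries [regionSize, 2*regionSize) the
    // matching dst.offset, so the single host-to-device copy below publishes every
    // fused region's offsets at once for the fused blit to look up per region.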
cuda_check(cudaMemcpy(mOffset, temp.data(), 2*regionSize*sizeof(int32_t), cudaMemcpyHostToDevice)); + mTempInputCopy.clear(); + mTempInputCopy.emplace_back(std::make_pair(tensor, &slice0)); + } + + for (auto t : forRelease) { + backend()->onReleaseBuffer(t, Backend::DYNAMIC); + } + if (nullptr != mTempOutput) { + backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC); } return NO_ERROR; } -ErrorCode RasterExecution::onExecute(const std::vector& inputs, const std::vector& outputs) { +void RasterExecution::executeFaster(const std::vector &inputs, const std::vector &outputs) const { + auto bn = static_cast(backend()); + auto input = inputs[0]; + auto output = outputs[0]; + auto bytes = bn->getBytes(output); auto runtime = static_cast(backend())->getCUDARuntime(); - auto input = inputs[0]; - auto output = outputs[0]; - auto bytes = input->getType().bytes(); if (mNeedZero) { - runtime->memset((void*)output->deviceId(), 0, output->size()); + auto size = static_cast(backend())->realSize(output) * bytes; + cudaMemset((uint8_t*)output->deviceId(), 0, size); + } + // Use mFastBlit + for (auto& iter : mFastBlit) { + auto srcPtr = (uint8_t*)iter.first->deviceId() + iter.second.src.offset * bytes; + auto dstPtr = (uint8_t*)output->deviceId() + iter.second.dst.offset * bytes; + RasterBlit(dstPtr, srcPtr, iter.second.size, iter.second.src.stride, iter.second.dst.stride, bytes * PACK_NUMBER, runtime); + } +} + + +ErrorCode RasterExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { + if (mFast) { + executeFaster(inputs, outputs); + return NO_ERROR; + } + auto bn = static_cast(backend()); + auto input = inputs[0]; + auto output = outputs[0]; + auto bytes = bn->getBytes(output); + auto runtime = static_cast(backend())->getCUDARuntime(); + if (mSingleConvert > 0) { + auto realInput = TensorUtils::getDescribe(input)->regions[0].origin; + int srcBatch = 1, srcChannel = 1, srcArea = 1; + getBatchChannelArea(realInput, srcBatch, srcChannel, srcArea); + auto sourceFormat = TensorUtils::getDescribe(realInput)->dimensionFormat; + auto destFormat = TensorUtils::getDescribe(output)->dimensionFormat; + int batchStride = srcChannel * srcArea * bytes; + int inputBatchStride = batchStride; + int outputBatchStride = batchStride; + PackInfo pack; + pack.inside = srcArea; + pack.axis = srcChannel; + pack.unit = PACK_NUMBER; + pack.outside = srcBatch; + if (mSingleConvert == 1) { + pack.axisStride = srcArea; + pack.insideStride = 1; + } else if (mSingleConvert == 2) { + pack.axisStride = 1; + pack.insideStride = srcChannel; + } + auto srcPtr = (void*)realInput->deviceId(); + auto dstPtr = (void*)output->deviceId(); + if (MNN_DATA_FORMAT_NC4HW4 == sourceFormat) { + if (realInput->dimensions() <= 1) { + cudaMemcpy(dstPtr, srcPtr, bn->realSize(realInput) * bytes, cudaMemcpyDeviceToDevice); + return NO_ERROR; + } + UnpackBuffer(dstPtr, srcPtr, &pack, bytes, runtime); + } else { + if (output->dimensions() <= 1) { + cudaMemcpy(dstPtr, srcPtr, bn->realSize(realInput) * bytes, cudaMemcpyDeviceToDevice); + return NO_ERROR; + } + PackBuffer(dstPtr, srcPtr, &pack, bytes, runtime); + } + return NO_ERROR; + } + if (mNeedZero) { + auto size = static_cast(backend())->realSize(mOutputPtr) * bytes; + cudaMemset((uint8_t*)mOutputPtr->deviceId(), 0, size); + } + for (auto& iter : mTempInput) { + backend()->onCopyBuffer(iter.first, iter.second); } if(mFuseRaster.first) { MNN_ASSERT(mTempInputCopy.size() == 1); auto& iter = mTempInputCopy[0]; auto& slice = *(iter.second); - auto srcPtr = (uint8_t*)iter.first; - 
auto dstPtr = (uint8_t*)output->deviceId();
+        auto srcPtr = (uint8_t*)iter.first->deviceId();
+        auto dstPtr = (uint8_t*)mOutputPtr->deviceId();
         //printf("fuseRaster:%p-%p\n", mSrcOffset, mDstOffset);
         FuseRasterBlit(dstPtr, srcPtr, slice.size, slice.src.stride, slice.dst.stride, mFuseRaster.second, mOffset, bytes, runtime);
-        return NO_ERROR;
+    } else {
+        for (auto& iter : mTempInputCopy) {
+            auto srcPtr = (uint8_t*)iter.first->deviceId() + iter.second->src.offset * bytes;
+            auto dstPtr = (uint8_t*)mOutputPtr->deviceId() + iter.second->dst.offset * bytes;
+            RasterBlit(dstPtr, srcPtr, iter.second->size, iter.second->src.stride, iter.second->dst.stride, bytes, runtime);
+        }
     }
-    for (int u = 0; u < mTempInputCopy.size(); ++u) {
-        auto& iter = mTempInputCopy[u];
-        auto& slice = *(iter.second);
-        auto srcPtr = (uint8_t*)iter.first + slice.src.offset * bytes;
-        auto dstPtr = (uint8_t*)output->deviceId() + slice.dst.offset * bytes;
-        RasterBlit(dstPtr, srcPtr, slice.size, slice.src.stride, slice.dst.stride, bytes, runtime);
+
+    if (nullptr != mTempOutput) {
+        backend()->onCopyBuffer(mTempOutput.get(), output);
     }
     return NO_ERROR;
 }
-RasterExecution::RasterExecution(Backend* backend) : Execution(backend) {
-    // Do nothing
-}
-RasterExecution::~RasterExecution() {
-    // Do nothing
-}
-class RasterCreator : public CUDABackend::Creator {
+class RasterExecutionFactory : public CUDABackend::Creator {
 public:
     virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                 const MNN::Op* op, Backend* backend) const override {
         return new RasterExecution(backend);
     }
 };
-static CUDACreatorRegister<RasterCreator> __init(OpType_Raster);
-} // namespace CUDA
-} // namespace MNN
\ No newline at end of file
+static CUDACreatorRegister<RasterExecutionFactory> __init(OpType_Raster);
+
+}
+}
\ No newline at end of file
diff --git a/source/backend/cuda/execution/RasterExecution.hpp b/source/backend/cuda/execution/RasterExecution.hpp
index 5ef27c49..ed464b40 100644
--- a/source/backend/cuda/execution/RasterExecution.hpp
+++ b/source/backend/cuda/execution/RasterExecution.hpp
@@ -2,37 +2,43 @@
 //  RasterExecution.hpp
 //  MNN
 //
-//  Created by MNN on 2020/07/30.
+//  Created by MNN on 2020/04/02.
// Copyright © 2018, Alibaba Group Holding Limited // - #ifndef RasterExecution_hpp #define RasterExecution_hpp -#include -#include -#include #include "backend/cuda/core/CUDABackend.hpp" -#include "core/Execution.hpp" +#include +#include #include "core/TensorUtils.hpp" - namespace MNN { namespace CUDA { class RasterExecution : public Execution { public: - RasterExecution(Backend *backend); - virtual ~RasterExecution(); + RasterExecution(Backend* bn) : Execution(bn) { + // Do nothing + } + virtual ~ RasterExecution() { + // Do nothing + } + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - + void executeFaster(const std::vector &inputs, const std::vector &outputs) const; private: - std::vector> mTempInputCopy; + std::map mTempInput; + std::vector> mTempInputCopy; + std::vector> mFastBlit; + std::shared_ptr mTempOutput; + Tensor* mOutputPtr; bool mNeedZero = false; + bool mFast = false; + int mSingleConvert = 0; + int32_t mZeroPoint = 0; std::pair mFuseRaster; - void *mOffset; std::shared_ptr offsetTensor; }; -} // namespace CUDA -} // namespace MNN - +} +} #endif diff --git a/source/backend/cuda/execution/ReductionExecution.cu b/source/backend/cuda/execution/ReductionExecution.cu index 6d895a70..75ffb0fa 100755 --- a/source/backend/cuda/execution/ReductionExecution.cu +++ b/source/backend/cuda/execution/ReductionExecution.cu @@ -1,99 +1,19 @@ #include "ReductionExecution.hpp" - namespace MNN { namespace CUDA { ReductionExecution::ReductionExecution(ReductionType opType, int axis, Backend *backend) : Execution(backend) { mType = opType; mAxis = axis; + auto staticPool = static_cast(backend)->getStaticBufferPool(); + mParam = staticPool->alloc(sizeof(ReduceParam)); } ReductionExecution::~ ReductionExecution() { - // Do nothing + auto staticPool = static_cast(backend())->getStaticBufferPool(); + staticPool->free(mParam); } -template -__global__ void SUM(const T *input, T *output, int inside, int axis, int outside) { - int count = inside * outside; - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - int y = i / inside; - int x = i % inside; - T sumValue = (T)0; - const T* basicInput = input + y * axis * inside + x; - for (int v=0; v -__global__ void MEAN(const T *input, T *output, int inside, int axis, int outside) { - int count = inside * outside; - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - int y = i / inside; - int x = i % inside; - T sumValue = (T)0; - const T* basicInput = input + y * axis * inside + x; - for (int v=0; v -__global__ void MINIMUM(const T *input, T *output, int inside, int axis, int outside) { - int count = inside * outside; - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - int y = i / inside; - int x = i % inside; - const T* basicInput = input + y * axis * inside + x; - T res = basicInput[0]; - for (int v=1; v -__global__ void MAXIMUM(const T *input, T *output, int inside, int axis, int outside) { - int count = inside * outside; - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { - int y = i / inside; - int x = i % inside; - const T* basicInput = input + y * axis * inside + x; - T res = basicInput[0]; - for (int v=1; v -__global__ void PROD(const T *input, T *output, int inside, int axis, int outside) { - int count = inside * 
outside;
-    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
-        int y = i / inside;
-        int x = i % inside;
-        const T* basicInput = input + y * axis * inside + x;
-        T res = basicInput[0];
-        for (int v=1; v<axis; ++v) {
-            res = res * basicInput[v * inside];
-        }
-        output[y * inside + x] = res;
-    }
-}
-
-ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
-    auto input = (void*)inputs[0]->deviceId();
-    auto output = (void*)outputs[0]->deviceId();
+ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
     auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
     int inside = 1;
     int outside = 1;
@@ -104,52 +24,88 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
     for (int i=mAxis+1; i<inputs[0]->dimensions(); ++i) {
         inside *= inputs[0]->length(i);
     }
+    mCpuParam.inside = inside;
+    mCpuParam.outside = outside;
+    mCpuParam.axis = axis;
+    cuda_check(cudaMemcpy((uint8_t*)mParam.first + mParam.second, &mCpuParam, sizeof(ReduceParam), cudaMemcpyHostToDevice));
+
+    return NO_ERROR;
+}
+
+ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
+    auto input = (void*)inputs[0]->deviceId();
+    auto output = (void*)outputs[0]->deviceId();
+    auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
+    int inside = mCpuParam.inside;
+    int outside = mCpuParam.outside;
     int count = inside * outside;
     int block_num = runtime->blocks_num(count);
     int threads_num = runtime->threads_num();
+    auto param = (ReduceParam*)((uint8_t*)mParam.first + mParam.second);
     if (inputs[0]->getType() == halide_type_of<float>()) {
-        switch (mType) {
-            case ReductionType_MEAN:
-                MEAN<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
-                return NO_ERROR;
-            case ReductionType_SUM:
-                SUM<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
-                return NO_ERROR;
-            case ReductionType_MINIMUM:
-                MINIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
-                return NO_ERROR;
-            case ReductionType_MAXIMUM:
-                MAXIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
-                return NO_ERROR;
-            case ReductionType_PROD:
-                PROD<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
-                return NO_ERROR;
+        if (static_cast<CUDABackend*>(backend())->useFp16()) {
+            switch (mType) {
+                case ReductionType_MEAN:
+                    MEAN<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
+                    return NO_ERROR;
+                case ReductionType_SUM:
+                    SUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
+                    return NO_ERROR;
+                case ReductionType_MINIMUM:
+                    MINIMUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
+                    return NO_ERROR;
+                case ReductionType_MAXIMUM:
+                    MAXIMUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
+                    return NO_ERROR;
+                case ReductionType_PROD:
+                    PROD<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
+                    return NO_ERROR;
+            }
+        } else {
+            switch (mType) {
+                case ReductionType_MEAN:
+                    MEAN<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
+                    return NO_ERROR;
+                case ReductionType_SUM:
+                    SUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
+                    return NO_ERROR;
+                case ReductionType_MINIMUM:
+                    MINIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
+                    return NO_ERROR;
+                case ReductionType_MAXIMUM:
+                    MAXIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
+                    return NO_ERROR;
+                case ReductionType_PROD:
+                    PROD<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
+                    return NO_ERROR;
+            }
         }
         MNN_ASSERT(false);
         return NOT_SUPPORT;
     }
+    MNN_ASSERT(inputs[0]->getType() == halide_type_of<int32_t>());
     switch (mType) {
         case ReductionType_MEAN:
-            MEAN<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
+            MEAN<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
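+            // Kernel launches here are asynchronous; `param` points at device
+            // memory from the static buffer pool, filled once in onResize, so it
+            // stays valid for the lifetime of these launches.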
return NO_ERROR; case ReductionType_SUM: - SUM<<>>((const int32_t*)input, (int32_t*)output, inside, axis, outside); + SUM<<>>((const int32_t*)input, (int32_t*)output, param); return NO_ERROR; case ReductionType_MINIMUM: - MINIMUM<<>>((const int32_t*)input, (int32_t*)output, inside, axis, outside); + MINIMUM<<>>((const int32_t*)input, (int32_t*)output, param); return NO_ERROR; case ReductionType_MAXIMUM: - MAXIMUM<<>>((const int32_t*)input, (int32_t*)output, inside, axis, outside); + MAXIMUM<<>>((const int32_t*)input, (int32_t*)output, param); return NO_ERROR; case ReductionType_PROD: - PROD<<>>((const int32_t*)input, (int32_t*)output, inside, axis, outside); + PROD<<>>((const int32_t*)input, (int32_t*)output, param); return NO_ERROR; case ReductionType_ANY: - MAXIMUM<<>>((const int32_t*)input, (int32_t*)output, inside, axis, outside); + MAXIMUM<<>>((const int32_t*)input, (int32_t*)output, param); return NO_ERROR; case ReductionType_ALL: - MINIMUM<<>>((const int32_t*)input, (int32_t*)output, inside, axis, outside); + MINIMUM<<>>((const int32_t*)input, (int32_t*)output, param); return NO_ERROR; } MNN_ASSERT(false); diff --git a/source/backend/cuda/execution/ReductionExecution.hpp b/source/backend/cuda/execution/ReductionExecution.hpp index a9699de7..a281e9ee 100644 --- a/source/backend/cuda/execution/ReductionExecution.hpp +++ b/source/backend/cuda/execution/ReductionExecution.hpp @@ -11,6 +11,7 @@ #include #include "backend/cuda/core/CUDABackend.hpp" #include "core/Execution.hpp" +#include "ReductionTemplate.cuh" namespace MNN { namespace CUDA { class ReductionExecution : public Execution { @@ -18,10 +19,13 @@ public: ReductionExecution(ReductionType opType, int axis, Backend *backend); virtual ~ReductionExecution(); virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; + virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; private: ReductionType mType; int mAxis; + ReduceParam mCpuParam; + std::pair mParam; }; } // namespace CUDA } // namespace MNN diff --git a/source/backend/cuda/execution/ReductionTemplate.cuh b/source/backend/cuda/execution/ReductionTemplate.cuh new file mode 100644 index 00000000..3586e83c --- /dev/null +++ b/source/backend/cuda/execution/ReductionTemplate.cuh @@ -0,0 +1,93 @@ +#ifndef ReductionTemplate_cuh +#define ReductionTemplate_cuh +struct ReduceParam { + int inside; + int axis; + int outside; +}; +template +__global__ void SUM(const T *input, T *output, const ReduceParam* param) { + int count = param->inside * param->outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int y = i / param->inside; + int x = i % param->inside; + float sumValue = 0.0; + int axis = param->axis; + const T* basicInput = input + y * param->axis * param->inside + x; + for (int v=0; vinside]; + } + output[y * param->inside + x] = (T)sumValue; + } + return; +} + +template +__global__ void MEAN(const T *input, T *output, const ReduceParam* param) { + int count = param->inside * param->outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int y = i / param->inside; + int x = i % param->inside; + float sumValue = 0.0; + int axis = param->axis; + const T* basicInput = input + y * param->axis * param->inside + x; + for (int v=0; vinside]; + } + output[y * param->inside + x] = (T)(sumValue / (float)param->axis); + } + return; +} + +template +__global__ void MINIMUM(const T *input, T *output, const 
ReduceParam* param) { + int count = param->inside * param->outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int y = i / param->inside; + int x = i % param->inside; + int axis = param->axis; + const T* basicInput = input + y * param->axis * param->inside + x; + float res = (float)basicInput[0]; + for (int v=1; vinside], res); + } + output[y * param->inside + x] = (T)res; + } + return; +} + +template +__global__ void MAXIMUM(const T *input, T *output, const ReduceParam* param) { + int count = param->inside * param->outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int y = i / param->inside; + int x = i % param->inside; + const T* basicInput = input + y * param->axis * param->inside + x; + int axis = param->axis; + float res = (float)basicInput[0]; + for (int v=1; vinside], res); + } + output[y * param->inside + x] = (T)res; + } + return; +} + +template +__global__ void PROD(const T *input, T *output, const ReduceParam* param) { + int count = param->inside * param->outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int y = i / param->inside; + int x = i % param->inside; + int axis = param->axis; + float sumValue = 1.0; + const T* basicInput = input + y * param->axis * param->inside + x; + for (int v=0; vinside]; + } + output[y * param->inside + x] = (T)sumValue; + } + return; +} + +#endif \ No newline at end of file diff --git a/source/backend/cuda/execution/ScaleExecution.cu b/source/backend/cuda/execution/ScaleExecution.cu index e90ba55d..a0eb25a0 100644 --- a/source/backend/cuda/execution/ScaleExecution.cu +++ b/source/backend/cuda/execution/ScaleExecution.cu @@ -1,4 +1,5 @@ #include "ScaleExecution.hpp" +#include "MNNCUDADefine.hpp" namespace MNN { namespace CUDA { @@ -6,61 +7,50 @@ namespace CUDA { template __global__ void SCALE(const int n, const int channels, const int dim, const T* in, T* out, - const T* scaleData, const T* biasData) { - CUDA_KERNEL_LOOP(index, n) { - int c = (index / dim) % channels; - out[index] = in[index] * scaleData[c] + biasData[c]; + const float* scaleData, const float* biasData) { + CUDA_KERNEL_LOOP(count, n) { + int index = count / PACK_NUMBER; + int r = count % PACK_NUMBER; + int c = (index / dim) * PACK_NUMBER + r; + out[count] = (T)((float)in[count] * scaleData[c] + biasData[c]); } } ScaleExecution::ScaleExecution(const Scale* scale, Backend *backend) : Execution(backend) { - mChannel = scale->scaleData()->size(); - - scaleTensor.reset(Tensor::createDevice({mChannel})); - backend->onAcquireBuffer(scaleTensor.get(), Backend::STATIC); - mDeviceScale = (void *)scaleTensor.get()->buffer().device; - - biasTensor.reset(Tensor::createDevice({mChannel})); - backend->onAcquireBuffer(biasTensor.get(), Backend::STATIC); - mDeviceBias = (void *)biasTensor.get()->buffer().device; - - MNN_ASSERT(nullptr != mDeviceScale); - MNN_ASSERT(nullptr != mDeviceBias); + int channel = scale->scaleData()->size(); + mChannel = UP_DIV(channel, PACK_NUMBER); + auto scaleBiasStorageSize = 2 * mChannel * PACK_NUMBER * sizeof(float); + auto staticPool = static_cast(backend)->getStaticBufferPool(); + mScaleBiasStorage = staticPool->alloc(scaleBiasStorageSize); + mDeviceScale = (uint8_t*)mScaleBiasStorage.first + mScaleBiasStorage.second; + mDeviceBias = (uint8_t*)mDeviceScale + scaleBiasStorageSize / 2; + cudaMemset(mDeviceScale, 0, scaleBiasStorageSize); { auto alphaData = scale->scaleData()->data(); - 
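+    // Packed layout (inferred from the sizes above, not stated in the patch):
+    // one static-pool block of 2 * mChannel * PACK_NUMBER floats holds scale
+    // first, then bias. E.g. channel = 3 with PACK_NUMBER = 16 gives mChannel = 1,
+    // i.e. 16 floats per half; the cudaMemset above zero-fills the whole block so
+    // padded channels multiply by zero and add a zero bias.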
cudaMemcpy(mDeviceScale, alphaData, mChannel * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(mDeviceScale, alphaData, channel * sizeof(float), cudaMemcpyHostToDevice); } { auto biasData = scale->biasData()->data(); if (nullptr != biasData) { - MNN_ASSERT(mChannel == scale->biasData()->size()); - cudaMemcpy(mDeviceBias, biasData, mChannel * sizeof(float), cudaMemcpyHostToDevice); - } else { - cudaMemset(mDeviceBias, 0, mChannel * sizeof(float)); + cudaMemcpy(mDeviceBias, biasData, channel * sizeof(float), cudaMemcpyHostToDevice); } } } ScaleExecution::~ScaleExecution() { - if (nullptr != scaleTensor) { - backend()->onReleaseBuffer(scaleTensor.get(), Backend::STATIC); - } - if (nullptr != biasTensor) { - backend()->onReleaseBuffer(biasTensor.get(), Backend::STATIC); - } + auto staticPool = static_cast(backend())->getStaticBufferPool(); + staticPool->free(mScaleBiasStorage); } ErrorCode ScaleExecution::onResize(const std::vector &inputs, const std::vector &outputs) { MNN_ASSERT(inputs.size() == 1); MNN_ASSERT(outputs.size() == 1); auto input = inputs[0]; - mBatch = input->length(0); - MNN_ASSERT(mChannel == input->length(1)); MNN_ASSERT(input->dimensions() >= 2); - mArea = 1; + mArea = input->length(0); for (int i = 2; i < input->dimensions(); ++i) { mArea *= input->length(i); } - mCount = mBatch*mChannel*mArea; + mCount = mChannel*mArea*PACK_NUMBER; //printf("mBatch:%d- mChannel:%d- mArea:%d- mCount:%d\n", mBatch,mChannel,mArea, mCount); return NO_ERROR; } @@ -72,9 +62,13 @@ ErrorCode ScaleExecution::onExecute(const std::vector &inputs, const s int threads_num = runtime->threads_num(); auto input_addr = (void*)inputs[0]->deviceId(); auto output_addr = (void*)outputs[0]->deviceId(); - + if (static_cast(backend())->useFp16()) { + SCALE<<>>(mCount, mChannel, mArea, (const half *)input_addr, (half *)output_addr, + (const float *)mDeviceScale, (const float *)mDeviceBias); + return NO_ERROR; + } SCALE<<>>(mCount, mChannel, mArea, (const float *)input_addr, (float *)output_addr, - (const float *)mDeviceScale, (const float *)mDeviceBias); + (const float *)mDeviceScale, (const float *)mDeviceBias); return NO_ERROR; } diff --git a/source/backend/cuda/execution/ScaleExecution.hpp b/source/backend/cuda/execution/ScaleExecution.hpp index 6b0168b2..f9bd829a 100644 --- a/source/backend/cuda/execution/ScaleExecution.hpp +++ b/source/backend/cuda/execution/ScaleExecution.hpp @@ -30,13 +30,9 @@ private: void *mDeviceBias = nullptr; void *mDeviceScale = nullptr; int mCount; - int mBatch; int mChannel; int mArea; - - std::shared_ptr scaleTensor; - std::shared_ptr biasTensor; - + std::pair mScaleBiasStorage; }; } // namespace CUDA diff --git a/source/backend/cuda/execution/SelectExecution.cu b/source/backend/cuda/execution/SelectExecution.cu index daa03687..6e5e47e5 100644 --- a/source/backend/cuda/execution/SelectExecution.cu +++ b/source/backend/cuda/execution/SelectExecution.cu @@ -41,8 +41,11 @@ ErrorCode SelectExecution::onExecute(const std::vector& inputs, const s auto count = CUDABackend::realSize(inputs[0]); int block_num = runtime->blocks_num(count); int threads_num = runtime->threads_num(); - SELECT<<>>(count, (const int*)(inputs[0]->deviceId()), (const int*)(inputs[1]->deviceId()), (const int*)(inputs[2]->deviceId()), (int*)outputs[0]->deviceId()); - + if (static_cast(backend())->useFp16()) { + SELECT<<>>(count, (const int*)(inputs[0]->deviceId()), (const half*)(inputs[1]->deviceId()), (const half*)(inputs[2]->deviceId()), (half*)outputs[0]->deviceId()); + } else { + SELECT<<>>(count, (const 
int*)(inputs[0]->deviceId()), (const float*)(inputs[1]->deviceId()), (const float*)(inputs[2]->deviceId()), (float*)outputs[0]->deviceId()); + } #ifdef LOG_VERBOSE MNN_PRINT("end SelectExecution onExecute..."); #endif diff --git a/source/backend/cuda/execution/SoftmaxExecution.cu b/source/backend/cuda/execution/SoftmaxExecution.cu index b57957ac..e55149ef 100644 --- a/source/backend/cuda/execution/SoftmaxExecution.cu +++ b/source/backend/cuda/execution/SoftmaxExecution.cu @@ -1,44 +1,120 @@ #include "SoftmaxExecution.hpp" - +#include "core/TensorUtils.hpp" namespace MNN { namespace CUDA { +template +__global__ void SOFTMAX(const T *input, T *output, const ReduceParam* param) { + int inside = param->inside; + int axis = param->axis; + int outside = param->outside; + int count = inside * outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int y = i / inside; + int x = i % inside; + const T* src = input + y * axis * inside + x; + T* dst = output + y * axis * inside + x; + float maxValue = (float)src[0]; + for (int z=1; z +__global__ void EXPSUB(const T *input, const T* maxV, T *output, const ReduceParam* param) { + int inside = param->inside; + int axis = param->axis; + int outside = param->outside; + int count = inside * axis * outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int tmp = i / inside; + int x = i % inside; + int y = tmp / axis; + int c = tmp % axis; + float sumValue = 0.0; + const float basicInput = input[i]; + const float maxValue = maxV[x + y * inside]; + output[i] = (T)(exp(basicInput - maxValue)); + } + return; +} + +template +__global__ void DIVSUM(const T *input, const T* maxV, T *output, const ReduceParam* param) { + int inside = param->inside; + int axis = param->axis; + int outside = param->outside; + int count = inside * axis * outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + int tmp = i / inside; + int x = i % inside; + int y = tmp / axis; + int c = tmp % axis; + float sumValue = 0.0; + const float basicInput = input[i]; + const float value = maxV[x + y * inside]; + output[i] = (T)(basicInput / value); + } + return; +} SoftmaxExecution::SoftmaxExecution(int axis, Backend *backend) : Execution(backend) { - auto runtime = static_cast(backend)->getCUDARuntime(); - cudnn_handle_ = runtime->cudnn_handle(); - - cudnn_check(cudnnCreateTensorDescriptor(&input_desc_)); - cudnn_check(cudnnCreateTensorDescriptor(&output_desc_)); - - cudnn_data_type_ = CUDNN_DATA_FLOAT; mAxis = axis; + auto staticPool = static_cast(backend)->getStaticBufferPool(); + mParam = staticPool->alloc(sizeof(ReduceParam)); } SoftmaxExecution::~SoftmaxExecution() { - cudnnDestroyTensorDescriptor(input_desc_); - cudnnDestroyTensorDescriptor(output_desc_); + auto staticPool = static_cast(backend())->getStaticBufferPool(); + staticPool->free(mParam); } ErrorCode SoftmaxExecution::onResize(const std::vector &inputs, const std::vector &outputs) { - inside = 1; - outside = 1; - if(mAxis < 0) { - mAxis += inputs[0]->dimensions(); - } - axis = inputs[0]->length(mAxis); - for (int i=0; ilength(i); - } - for (int i=mAxis+1; idimensions(); ++i) { - inside *= inputs[0]->length(i); + auto input = inputs[0]; + const int dimensions = input->buffer().dimensions; + int axis = mAxis; + if (axis < 0) { + axis += dimensions; } - std::vector tensor_shape = {outside, axis, inside, 1}; - cudnn_check(cudnnSetTensor4dDescriptor(input_desc_, 
CUDNN_TENSOR_NCHW, cudnn_data_type_, tensor_shape[0], - tensor_shape[1], tensor_shape[2], tensor_shape[3])); + const auto layout = TensorUtils::getDescribe(input)->dimensionFormat; + mNeedUnpackC4 = layout == MNN_DATA_FORMAT_NC4HW4; + if (mNeedUnpackC4) { + for (int i=0; i < dimensions; ++i) { + mStorage.buffer().dim[i].extent = input->length(i); + } + TensorUtils::getDescribe(&mStorage)->dimensionFormat = MNN_DATA_FORMAT_NCHW; + mStorage.buffer().dimensions = dimensions; + mStorage.buffer().type = input->getType(); + backend()->onAcquireBuffer(&mStorage, Backend::DYNAMIC); + } - cudnn_check(cudnnSetTensor4dDescriptor(output_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, tensor_shape[0], - tensor_shape[1], tensor_shape[2], tensor_shape[3])); + int inside = 1; + int outside = 1; + int dims = input->buffer().dimensions; + for (int i = 0; i < axis; ++i) { + outside *= input->length(i); + } + for (int i = axis + 1; i < dims; ++i) { + inside *= input->length(i); + } + + if (mNeedUnpackC4) { + backend()->onReleaseBuffer(&mStorage, Backend::DYNAMIC); + } + + mCpuParam.inside = inside; + mCpuParam.outside = outside; + mCpuParam.axis = input->length(axis); + cuda_check(cudaMemcpy((uint8_t*)mParam.first + mParam.second, &mCpuParam, sizeof(ReduceParam), cudaMemcpyHostToDevice)); return NO_ERROR; } @@ -46,15 +122,28 @@ ErrorCode SoftmaxExecution::onResize(const std::vector &inputs, const ErrorCode SoftmaxExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { auto input = (void*)inputs[0]->deviceId(); auto output = (void*)outputs[0]->deviceId(); - - const float alpha = 1; - const float beta = 0; - cudnn_check(cudnnSoftmaxForward(cudnn_handle_, CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - input_desc_, input, - &beta, - output_desc_, output)); + auto dst = output; + auto param = (ReduceParam*)((uint8_t*)mParam.first + mParam.second); + if (mNeedUnpackC4) { + backend()->onCopyBuffer(inputs[0], &mStorage); + input = (void*)mStorage.deviceId(); + dst = (void*)mStorage.deviceId(); + } + auto runtime = static_cast(backend())->getCUDARuntime(); + int inside = mCpuParam.inside; + int outside = mCpuParam.outside; + int axis = mCpuParam.axis; + int count = inside * outside; + int block_num = runtime->blocks_num(count); + int threads_num = runtime->threads_num(); + if (static_cast(backend())->useFp16()) { + SOFTMAX<<>>((const half*)input, (half*)dst, param); + } else { + SOFTMAX<<>>((const float*)input, (float*)dst, param); + } + if (mNeedUnpackC4) { + backend()->onCopyBuffer(&mStorage, outputs[0]); + } return NO_ERROR; } diff --git a/source/backend/cuda/execution/SoftmaxExecution.hpp b/source/backend/cuda/execution/SoftmaxExecution.hpp index df0661d7..40876d44 100644 --- a/source/backend/cuda/execution/SoftmaxExecution.hpp +++ b/source/backend/cuda/execution/SoftmaxExecution.hpp @@ -9,11 +9,9 @@ #ifndef SoftmaxExecution_hpp #define SoftmaxExecution_hpp -#include "core/Execution.hpp" - #include +#include "ReductionTemplate.cuh" #include "backend/cuda/core/CUDABackend.hpp" - namespace MNN { namespace CUDA { @@ -26,15 +24,11 @@ public: virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; private: - cudnnHandle_t cudnn_handle_; - cudnnTensorDescriptor_t input_desc_; - cudnnTensorDescriptor_t output_desc_; - cudnnDataType_t cudnn_data_type_; - int mAxis; - int axis; - int inside; - int outside; + Tensor mStorage; + bool mNeedUnpackC4; + ReduceParam mCpuParam; + std::pair mParam; }; } // namespace CUDA diff --git 
a/source/backend/cuda/execution/TensorCoreGemm.cu b/source/backend/cuda/execution/TensorCoreGemm.cu index 4974e3a4..4b167670 100644 --- a/source/backend/cuda/execution/TensorCoreGemm.cu +++ b/source/backend/cuda/execution/TensorCoreGemm.cu @@ -3,6 +3,7 @@ #include #include #include "TensorCoreGemm.cuh" +#include "MNNCUDAFunction.cuh" #define BLOCK_ROW_WARPS 2 #define BLOCK_COL_WARPS 4 @@ -12,127 +13,237 @@ #define BLOCK_ROW_TILES (WARP_ROW_TILES * BLOCK_ROW_WARPS) #define BLOCK_COL_TILES (WARP_COL_TILES * BLOCK_COL_WARPS) -#define CHUNK_K 4 +#define CHUNK_L 4 +#define CHUNK_E 4 +#define CHUNK_H 4 +#define PACK_NUMBER 16 +#define PACK_NUMBER_C2 (PACK_NUMBER/2) using namespace nvcuda; namespace MNN { namespace CUDA { -__global__ void GemmPrearrange(const MatMulParam* param, - const float* A, - __half* AP, - const float* B, - __half* BP +template +__global__ void GemmPrearrange(MatMulParam paramV, + const T* OA, + __half* OAP, + const T* OB, + __half* OBP, + DivModFast lA ) { - int i = blockIdx.x * blockDim.x + threadIdx.x; + int b = blockIdx.x; + auto param = ¶mV; + int lAlign = param->elhPack[1] * 16; + int eAlign = param->elhPack[0] * 16; + int hAlign = param->elhPack[2] * 16; + __half* BP = OBP + b * param->elhPack[1] * param->elhPack[2] * 16 * 16; + __half* AP = OAP + b * param->elhPack[1] * param->elhPack[0] * 16 * 16; + const T* A = OA + b * param->elh[0] * param->elh[1]; + const T* B = OB + b * param->elh[2] * param->elh[1]; + int mc = param->elhPack[0] * param->elhPack[1] * 256; int e = param->elh[0]; int l = param->elh[1]; int h = param->elh[2]; - int lIndex = i % l; - int oIndex = i / l; - int lU = lIndex / 16; - int lR = lIndex % 16; - int eU = oIndex / 16; - int eR = oIndex % 16; + for (size_t index = threadIdx.x; index < mc && OA != nullptr; index += blockDim.x) { + int lIndex, oIndex; + lA.divmod(index, oIndex, lIndex); - if (i < e * l) { - float value = A[oIndex * param->aStride[0] + lIndex * param->aStride[1]]; - __half* dst = AP + eU * param->elhPack[1] * 16 * 16 + lU * 16 * 16 + lR + eR * 16; - dst[0] = value; + half value = 0.0; + if (oIndex < e && lIndex < l) { + value = A[oIndex * param->aStride[0] + lIndex * param->aStride[1]]; + } + AP[index] = value; } - if (i < h * l) { - float value = B[oIndex * param->bStride[2] + lIndex * param->bStride[1]]; - int hU = eU; - int hR = eR; - __half* dst = BP + hU * param->elhPack[1] * 16 * 16 + lU * 16 * 16 + lR + hR * 16; - dst[0] = value; + mc = param->elhPack[2] * param->elhPack[1] * 256; + for (size_t index = threadIdx.x; index < mc && OB != nullptr; index += blockDim.x) { + int lIndex, oIndex; + lA.divmod(index, oIndex, lIndex); + half value = 0.0; + if (oIndex < h && lIndex < l) { + value = B[oIndex * param->bStride[2] + lIndex * param->bStride[1]]; + } + BP[index] = value; } } -void GemmPrepareRerange(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, const float* A, __half* AP, const float* B, __half* BP) { - int maxCount = std::max(cpuParam->elh[0] * cpuParam->elh[1], cpuParam->elh[1] * cpuParam->elh[2]); +template +__global__ void GemmPrearrange_OPT(MatMulParam paramV, const int maxCount, + const int AreaPackA, const int AreaPackB, const int AreaA, const int AreaB, + const T* OA, + __half* OAP, + const T* OB, + __half* OBP, + DivModFast lA, + DivModFast pM + ) { + int index, b; + size_t indexT = blockIdx.x*blockDim.x+threadIdx.x; + pM.divmod(indexT, b, index); + int indexCopy = index; + + auto param = ¶mV; + int e = param->elh[0]; + int l = param->elh[1]; + int h = param->elh[2]; + for (; index < 
AreaPackA && OA != nullptr; index += blockDim.x*gridDim.x) { + int lIndex, oIndex; + lA.divmod(index, oIndex, lIndex); + + __half* AP = OAP + b * AreaPackA; + const T* A = OA + b * AreaA; + half value = 0.0; + if (oIndex < e && lIndex < l) { + value = A[oIndex * param->aStride[0] + lIndex * param->aStride[1]]; + } + AP[index] = value; + } + + index = indexCopy; + for (; index < AreaPackB && OB != nullptr; index += blockDim.x*gridDim.x) { + int lIndex, oIndex; + lA.divmod(index, oIndex, lIndex); + + __half* BP = OBP + b * AreaPackB; + const T* B = OB + b * AreaB; + half value = 0.0; + if (oIndex < h && lIndex < l) { + value = B[oIndex * param->bStride[2] + lIndex * param->bStride[1]]; + } + BP[index] = value; + } +} + +void GemmPrepareRerange(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, const void* A, __half* AP, const void* B, __half* BP, int bytes) { + auto& prop = runtime->prop(); + int threads_num = prop.maxThreadsPerBlock; + int unit_threads_num = ALIMAX(cpuParam->elhPack[0], cpuParam->elhPack[2]) * cpuParam->elhPack[1] * 256; + threads_num = ALIMIN(threads_num, unit_threads_num); + + const int AreaPackA = cpuParam->elhPack[0] * cpuParam->elhPack[1] * 256; + const int AreaPackB = cpuParam->elhPack[1] * cpuParam->elhPack[2] * 256; + const int AreaA = cpuParam->elh[0] * cpuParam->elh[1]; + const int AreaB = cpuParam->elh[1] * cpuParam->elh[2]; + + const int maxPack = ALIMAX(AreaPackA, AreaPackB); + const int maxCount = cpuParam->batch * maxPack; + DivModFast pM(maxPack); int block_num = runtime->blocks_num(maxCount); - int threads_num = runtime->threads_num(); - if (nullptr != AP) { - runtime->memset(AP, 0, cpuParam->elhPack[0] * cpuParam->elhPack[1] * 256 * sizeof(__half)); + int block_size = runtime->threads_num(); + DivModFast lA(cpuParam->elhPack[1] * 16); + if (bytes == 4) { + //GemmPrearrange<<batch, threads_num>>>(*cpuParam, (float*)A, AP, (float*)B, BP, lA); + GemmPrearrange_OPT<<>>(*cpuParam, maxCount, AreaPackA, AreaPackB, AreaA, AreaB, (float*)A, AP, (float*)B, BP, lA, pM); + checkKernelErrors; + } else { + MNN_ASSERT(bytes == 2); + //GemmPrearrange<<batch, threads_num>>>(*cpuParam, (half*)A, AP, (half*)B, BP, lA); + GemmPrearrange_OPT<<>>(*cpuParam, maxCount, AreaPackA, AreaPackB, AreaA, AreaB, (half*)A, AP, (half*)B, BP, lA, pM); + checkKernelErrors; } - if (nullptr != BP) { - runtime->memset(BP, 0, cpuParam->elhPack[2] * cpuParam->elhPack[1] * 256 * sizeof(__half)); - } - GemmPrearrange<<>>(param, A, AP, B, BP); } -__global__ void GemmPacked(const MatMulParam* param, float *c, const half *a, const half *b, const float* biasPtr) { +template +__global__ void GemmPacked(const MatMulParam* param, T *bc, const half *ba, const half *bb, const T* biasPtr) { int eU = param->elhPack[0]; int lU = param->elhPack[1]; int hU = param->elhPack[2]; - int maxCount = eU * hU * warpSize; + int maxCount = eU * hU * warpSize * param->batch; extern __shared__ float sharedMemory[]; for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { - int subIndex = index / warpSize; + int oIndex = index / warpSize; + int subIndex = oIndex % (eU * hU); + int bIndex = oIndex / (eU * hU); int wrapId = threadIdx.x / warpSize; int laneId = threadIdx.x % warpSize; int warpM = subIndex % eU; int warpN = subIndex / eU; + T* c = bc + bIndex * param->elh[0] * param->elh[2]; + const half* a = ba + bIndex * param->elhPack[1] * param->elhPack[0] * 16 * 16; + const half* b = bb + bIndex * param->elhPack[1] * param->elhPack[2] * 16 * 
16; float* cache = sharedMemory + wrapId * 16 * 16; // Declare the fragments - wmma::fragment + wmma::fragment a_frag; - wmma::fragment + wmma::fragment b_frag; wmma::fragment acc_frag; wmma::fill_fragment(acc_frag, 0.0f); - const half* aStart = a + warpM * lU * 16 * 16; - const half* bStart = b + warpN * lU * 16 * 16; + const half* aStart = a + warpM * param->aPStride[0]; + const half* bStart = b + warpN * param->bPStride[0]; //printf("GemmPacked: %d - %d - %d, numele: %d, %d\n", eU, lU, hU, a_frag.num_elements, b_frag.num_elements); // MLA for (int i = 0; i < lU; ++i) { // Load the inputs - wmma::load_matrix_sync(a_frag, aStart + i * 256, 16); - wmma::load_matrix_sync(b_frag, bStart + i * 256, 16); + wmma::load_matrix_sync(a_frag, aStart + i * param->aPStride[1], param->aPStride[2]); + wmma::load_matrix_sync(b_frag, bStart + i * param->bPStride[1], param->bPStride[2]); // Perform the matrix multiplication wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); } wmma::store_matrix_sync(cache, acc_frag, 16, wmma::mem_row_major); - //wmma::store_matrix_sync(c + warpM * 16 * param->elh[2] + 16 * warpN, acc_frag, param->elh[2], wmma::mem_row_major); int eSta = warpM * 16; int eEnd = min(eSta + 16, param->elh[0]); int hSta = warpN * 16; int hEnd = min(hSta + 16, param->elh[2]); int eC = eEnd - eSta; int hC = hEnd - hSta; - float* dstStart = c + hSta * param->cStride[2]; + T* dstStart = c + hSta * param->cStride[2]; if (nullptr != biasPtr) { for (int tId = laneId; tId < eC * hC; tId += warpSize) { int y = tId % eC; int x = tId / eC; int ye = y + eSta; - int yi = ye % param->split[2]; - int yc = ye / param->split[2]; - dstStart[yc * param->cStride[0] + yi * param->cStride[1] + x * param->cStride[2]] = min(max(cache[16 * y + x] + biasPtr[hSta + x], param->minValue), param->maxValue); + float value = cache[16 * y + x]; + float biasValue = biasPtr[hSta + x]; + dstStart[ye * param->cStride[0] + x * param->cStride[2]] = value + biasValue; } } else { for (int tId = laneId; tId < eC * hC; tId += warpSize) { int y = tId % eC; int x = tId / eC; int ye = y + eSta; - int yi = ye % param->split[2]; - int yc = ye / param->split[2]; - dstStart[yc * param->cStride[0] + yi * param->cStride[1] + x * param->cStride[2]] = min(max(cache[16 * y + x], param->minValue), param->maxValue); + float value = cache[16 * y + x]; + dstStart[ye * param->cStride[0] + x * param->cStride[2]] = value; } } } } -void GemmPackedMain(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, float *c, const half *a, const half *b, const float* biasPtr) { +void GemmPackedMain(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, void *c, const half *a, const half *b, const void* biasPtr, int bytes, bool transposeA, bool transposeB) { auto& prop = runtime->prop(); int threads_num = prop.maxThreadsPerBlock; int cores = prop.multiProcessorCount; int sharedMemorySize = 16 * 16 * sizeof(float) * threads_num / prop.warpSize; - cudaFuncSetAttribute(GemmPacked, cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemorySize); - GemmPacked<<>>(param, c, a, b, biasPtr); -} - + if (bytes == 4) { + if (transposeA) { + if (transposeB) { + GemmPacked<<>>(param, (float*)c, a, b, (float*)biasPtr); + } else { + GemmPacked<<>>(param, (float*)c, a, b, (float*)biasPtr); + } + } else { + if (transposeB) { + GemmPacked<<>>(param, (float*)c, a, b, (float*)biasPtr); + } else { + GemmPacked<<>>(param, (float*)c, a, b, (float*)biasPtr); + } + } + } else { + if (transposeA) { + if (transposeB) { + GemmPacked<<>>(param, 
(half*)c, a, b, (half*)biasPtr); + } else { + GemmPacked<<>>(param, (half*)c, a, b, (half*)biasPtr); + } + } else { + if (transposeB) { + GemmPacked<<>>(param, (half*)c, a, b, (half*)biasPtr); + } else { + GemmPacked<<>>(param, (half*)c, a, b, (half*)biasPtr); + } + } + } + checkKernelErrors; +} } } diff --git a/source/backend/cuda/execution/TensorCoreGemm.cuh b/source/backend/cuda/execution/TensorCoreGemm.cuh index bd690739..fe196b58 100644 --- a/source/backend/cuda/execution/TensorCoreGemm.cuh +++ b/source/backend/cuda/execution/TensorCoreGemm.cuh @@ -7,6 +7,7 @@ #include "backend/cuda/core/runtime/CUDARuntime.hpp" #include #define MATMULPACK 16 +#define MATMULPACK2 (MATMULPACK * MATMULPACK) namespace MNN { namespace CUDA { @@ -16,12 +17,20 @@ struct MatMulParam { int aStride[3]; int bStride[3]; int cStride[3]; - int split[3];// a, b, c can split e / h in l + + // Outside E, Outside L, Inside + int aPStride[3]; + + // Outside H, Outside L, Inside + int bPStride[3]; + + int batch = 1; float minValue = -FLT_MAX; float maxValue = FLT_MAX; }; -void GemmPrepareRerange(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, const float* A, __half* AP, const float* B, __half* BP); -void GemmPackedMain(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, float *c, const half *a, const half *b, const float* biasPtr); +void GemmPrepareRerange(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, const void* A, __half* AP, const void* B, __half* BP, int bytes); +void GemmPackedMain(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, void *c, const half *a, const half *b, const void* biasPtr, int bytes, bool transposeA, bool transposeB); + } } #endif \ No newline at end of file diff --git a/source/backend/cuda/execution/TensorCoreGemmPacked.cu b/source/backend/cuda/execution/TensorCoreGemmPacked.cu new file mode 100644 index 00000000..c0342507 --- /dev/null +++ b/source/backend/cuda/execution/TensorCoreGemmPacked.cu @@ -0,0 +1,184 @@ + +#include +#include +#include +#include +#include "TensorCoreGemm.cuh" + +using namespace nvcuda; +namespace MNN { +namespace CUDA { + +template +__global__ void GemmPackedFull(const MatMulParam* param, T *c, const half *a, const half *b, const T* biasPtr) { + int eU = param->elhPack[0]; + int lU = param->elhPack[1]; + int hU = param->elhPack[2]; + int maxCount = eU * hU * warpSize; + int wrapId = threadIdx.x / warpSize; + int laneId = threadIdx.x % warpSize; + extern __shared__ float sharedMemory[]; + for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) { + size_t subIndex = index / warpSize; + size_t warpM = subIndex % eU; + size_t warpN = subIndex / eU; + T* cache = (T*)(sharedMemory + wrapId * 16 * 16); + // Declare the fragments + wmma::fragment + a_frag; + wmma::fragment + b_frag; + wmma::fragment acc_frag; + + wmma::load_matrix_sync(acc_frag, biasPtr + 16 * warpN, 0, wmma::mem_row_major); + const half* aStart = a + warpM * lU * 16 * 16; + const half* bStart = b + warpN * lU * 16 * 16; + //printf("GemmPacked: %d - %d - %d, numele: %d, %d\n", eU, lU, hU, a_frag.num_elements, b_frag.num_elements); + // MLA + for (int i = 0; i < lU; ++i) { + wmma::load_matrix_sync(a_frag, aStart + i * 256, 16); + wmma::load_matrix_sync(b_frag, bStart + i * 256, 16); + wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); + } + for(int t=0; tminValue); + acc_frag.x[t] = min(acc_frag.x[t], param->maxValue); + } + int eSta = warpM * 16; + 
int eEnd = min(eSta + 16, param->elh[0]); + int eC = eEnd - eSta; + T* dstStart = (T*)(c + warpN * 16 * param->elh[0] + eSta * 16); + wmma::store_matrix_sync(cache, acc_frag, 16, wmma::mem_row_major); + if (warpSize % 16 == 0) { + int r = warpSize / 16; + int x = laneId / r; + int ysta = laneId % r; + for (int y = ysta; y < eC; y+=r) { + float value = *((T*)(cache + 16 * y + x)); + dstStart[y * 16 + x] = value; + } + } else { + for (int tId = laneId; tId < eC * 16; tId += warpSize) { + int y = tId % eC; + int x = tId / eC; + float value = *((T*)(cache + 16 * y + x)); + dstStart[y * 16 + x] = value; + } + } + } +} + +template +__global__ void GemmPackedFull16x32(const MatMulParam* param, T *c, const half *a, const half *b, const T* biasPtr) { + size_t eU = param->elhPack[0]; + size_t lU = param->elhPack[1]; + size_t hU = param->elhPack[2]; + size_t threadCount = blockDim.x / warpSize; + size_t maxCount = eU * (hU / 2); + size_t wrapId = threadIdx.x / warpSize; + size_t laneId = threadIdx.x % warpSize; + extern __shared__ float sharedMemory[]; + T* cache = (T*)(sharedMemory + wrapId * 16 * 32); + for (size_t index = blockIdx.x * threadCount + wrapId; index < maxCount; index += gridDim.x * threadCount) { + size_t warpM = index % eU; + size_t warpN = index / eU; + // Declare the fragments + wmma::fragment + MA0; + wmma::fragment + MB0; + wmma::fragment + MB1; + wmma::fragment MC0; + wmma::fragment MC1; + + wmma::load_matrix_sync(MC0, biasPtr + 32 * warpN + 0, 0, wmma::mem_row_major); + wmma::load_matrix_sync(MC1, biasPtr + 32 * warpN + 16, 0, wmma::mem_row_major); + const half* aStart = a + warpM * lU * 16 * 16; + const half* bStart = b + warpN * lU * 16 * 32; + //printf("GemmPacked: %d - %d - %d, numele: %d, %d\n", eU, lU, hU, a_frag.num_elements, b_frag.num_elements); + // MLA + for (int i = 0; i < lU; ++i) { + wmma::load_matrix_sync(MA0, aStart + i * 256 + 0, 16); + wmma::load_matrix_sync(MB0, bStart + i * 512, 16); + wmma::load_matrix_sync(MB1, bStart + i * 512 + 256, 16); + wmma::mma_sync(MC0, MA0, MB0, MC0); + wmma::mma_sync(MC1, MA0, MB1, MC1); + } + for(int t=0; tminValue); + MC0.x[t] = min(MC0.x[t], param->maxValue); + } + for(int t=0; tminValue); + MC1.x[t] = min(MC1.x[t], param->maxValue); + } + size_t eSta = warpM * 16; + size_t eEnd = ((eSta + (size_t)16) > (size_t)param->elh[0]) ? 
(size_t)param->elh[0] : (eSta + (size_t)16); + size_t eC = eEnd - eSta; + T* dst0 = (T*)(c + warpN * 32 * param->elh[0] + eSta * 16); + T* dst1 = (T*)(c + (warpN * 32 + 16) * param->elh[0] + eSta * 16); + // First 8x32 + wmma::store_matrix_sync(cache, MC0, 16, wmma::mem_row_major); + // Second 8x32 + wmma::store_matrix_sync(cache + 256, MC1, 16, wmma::mem_row_major); + auto dst = dst0; + auto src = cache; + if (laneId >= 16) { + dst = dst1; + src = cache + 256; + } + int x = laneId % 16; + for (size_t y = 0; y < eC; ++y) { + dst[y * 16 + x] = src[y * 16 + x]; + } + } +} + +void GemmPackedFullMain(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, void *c, const half *a, const half *b, const half* biasPtr, int bytes) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + //MNN_PRINT("%d - %d - %d - %d\n", cpuParam->elhPack[0], cpuParam->elhPack[1], cpuParam->elhPack[2], cpuParam->elh[2]); + { + int maxThreadInWarp = UP_DIV(cpuParam->elhPack[0] * cpuParam->elhPack[2], cores); + int threads_num = std::min(prop.maxThreadsPerBlock, maxThreadInWarp * prop.warpSize); + int basicMemory = 16 * 16 * sizeof(float) * prop.maxThreadsPerBlock / prop.warpSize; + if (4 == bytes) { + cudaFuncSetAttribute(GemmPackedFull, cudaFuncAttributeMaxDynamicSharedMemorySize, prop.sharedMemPerMultiprocessor); + GemmPackedFull<<>>(param, (float*)c, a, b, (float*)biasPtr); + checkKernelErrors; + } else { + //MNN_PRINT("%d - %d, %d- %d\n", cpuParam->elhPack[0], cpuParam->elhPack[2], cpuParam->elh[0], cpuParam->elh[2]); + cudaFuncSetAttribute(GemmPackedFull, cudaFuncAttributeMaxDynamicSharedMemorySize, prop.sharedMemPerMultiprocessor); + GemmPackedFull<<>>(param, (half*)c, a, b, (half*)biasPtr); + checkKernelErrors; + } + } +} + + +void GemmPacked16x32(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, void *c, const half *a, const half *b, const half* biasPtr, int bytes) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + // MNN_PRINT("%d - %d - %d\n", cpuParam->elhPack[0], cpuParam->elhPack[1], cpuParam->elhPack[2]); + { + int hUP = cpuParam->elhPack[2] / 2; + int maxThreadInWarp = UP_DIV(cpuParam->elhPack[0] * hUP, cores); + int threads_num = ALIMIN(512, maxThreadInWarp * prop.warpSize); + //MNN_PRINT("GemmPacked16x32:%d-%d-%d-%d-%d\n\n", hUP, cpuParam->elhPack[0], cpuParam->elhPack[2], cpuParam->elhPack[0]*cpuParam->elhPack[2], threads_num); + threads_num = ALIMIN(prop.maxThreadsPerBlock, threads_num); + int basicMemory = 32 * 16 * sizeof(float) * (threads_num / prop.warpSize); + if (4 == bytes) { + cudaFuncSetAttribute(GemmPackedFull16x32, cudaFuncAttributeMaxDynamicSharedMemorySize, basicMemory); + GemmPackedFull16x32<<>>(param, (float*)c, a, b, (float*)biasPtr); + checkKernelErrors; + } else { + cudaFuncSetAttribute(GemmPackedFull16x32, cudaFuncAttributeMaxDynamicSharedMemorySize, basicMemory); + GemmPackedFull16x32<<>>(param, (half*)c, a, b, (half*)biasPtr); + checkKernelErrors; + } + } +} + +} +} \ No newline at end of file diff --git a/source/backend/cuda/execution/TensorCoreGemmPacked.cuh b/source/backend/cuda/execution/TensorCoreGemmPacked.cuh new file mode 100644 index 00000000..637c3715 --- /dev/null +++ b/source/backend/cuda/execution/TensorCoreGemmPacked.cuh @@ -0,0 +1,8 @@ +#include "TensorCoreGemm.cuh" +namespace MNN { +namespace CUDA { + +void GemmPackedFullMain(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, void *c, const half *a, const half *b, const half* biasPtr, int bytes); 
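// Editorial sketch (not part of the original patch): both launchers declared
// here give every warp one packed output tile and derive the block size from
// how many tiles each SM has to cover. A compilable model of that sizing rule;
// eU, hU, smCount, warpSize and maxThreadsPerBlock are hypothetical stand-ins
// for elhPack[0], elhPack[2] and the CUDA device properties.
static inline int upDiv(int a, int b) { return (a + b - 1) / b; }
static inline int pickThreadsPerBlock(int eU, int hU, int smCount,
                                      int warpSize, int maxThreadsPerBlock) {
    int warpsPerSm = upDiv(eU * hU, smCount); // one warp per 16x16 output tile
    int threads = warpsPerSm * warpSize;      // warps -> threads per block
    return threads < maxThreadsPerBlock ? threads : maxThreadsPerBlock;
}
// Dynamic shared memory then provides a 16x16 float cache per warp
// (16x32 per warp for the GemmPacked16x32 variant below).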
+void GemmPacked16x32(CUDARuntime* runtime, const MatMulParam* cpuParam, const MatMulParam* param, void *c, const half *a, const half *b, const half* biasPtr, int bytes); +} +} \ No newline at end of file diff --git a/source/backend/cuda/execution/Transpose.cu b/source/backend/cuda/execution/Transpose.cu new file mode 100644 index 00000000..84930b40 --- /dev/null +++ b/source/backend/cuda/execution/Transpose.cu @@ -0,0 +1,291 @@ +// +// Transpose.cu +// MNN +// +// Created by MNN on b'2021/12/09'. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "Transpose.cuh" +#include "core/Macro.h" +#include "MNNCUDADefine.hpp" +#include "MNNCUDAFunction.cuh" +namespace MNN { +namespace CUDA { + +template +__global__ void UNPACKCOMMON_4(const T0 *input, T1 *output, + const int total, int inside, int axis, int outside, + int insideStride, int axisStride, + DivModFast is, DivModFast os + ) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) { + int tmpI = i >> 2; + int yR = i & 3; + int x, tmp, yC, z; + is.divmod(tmpI, tmp, x); + os.divmod(tmp, yC, z); + int y = (yC << 2) + yR; + int srcOffset = ((z * inside + yC * inside * outside + x) << 2) + yR; + int dstOffset = x * insideStride + y * axisStride + z * inside * axis; + if (y < axis) { + output[dstOffset] = input[srcOffset]; + } + } +} + +template +__global__ void UNPACKCOMMON(const T0 *input, T1 *output, + int inside, int axis, int outside, + int insideStride, int axisStride + ) { + int axisAlign = UP_DIV(axis, PACK_NUMBER) * PACK_NUMBER;; + int total = axisAlign * inside * outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) { + int tmpI = i >> 4; + int yR = i & 15; + int x = tmpI % inside; + int tmp = tmpI / inside; + int yC = tmp / outside; + int z = tmp % outside; + int y = yC * PACK_NUMBER + yR; + int srcOffset = PACK_NUMBER * (z * inside + yC * inside * outside + x) + yR; + int dstOffset = x * insideStride + y * axisStride + z * inside * axis; + if (y < axis) { + output[dstOffset] = input[srcOffset]; + } + } +} + +template +__global__ void PACKCOMMON_4(const T0 *input, T1 *output, + int inside, int axis, int outside, + int insideStride, int axisStride + ) { + int axisAlign = UP_DIV(axis, 4) * 4;; + int total = axisAlign * inside * outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) { + int tmpI = i >> 2; + int yR = i & 3; + int x = tmpI % inside; + int tmp = tmpI / inside; + int yC = tmp / outside; + int z = tmp % outside; + int y = yC * 4 + yR; + int dstOffset = 4 * (z * inside + yC * inside * outside + x) + yR; + int srcOffset = x * insideStride + y * axisStride + z * inside * axis; + if (y < axis) { + output[dstOffset] = input[srcOffset]; + } else { + output[dstOffset] = {0, 0, 0, 0}; + } + } +} +template +__global__ void PACKCOMMON(const T0 *input, T1 *output, + int inside, int axis, int outside, + int insideStride, int axisStride + ) { + int axisAlign = UP_DIV(axis, PACK_NUMBER) * PACK_NUMBER;; + int total = axisAlign * inside * outside; + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) { + int tmpI = i >> 4; + int yR = i & 15; + int x = tmpI % inside; + int tmp = tmpI / inside; + int yC = tmp / outside; + int z = tmp % outside; + int y = yC * PACK_NUMBER + yR; + int dstOffset = PACK_NUMBER * (z * inside + yC * inside * outside + x) + yR; + int srcOffset = x * insideStride + y * axisStride + z * inside * axis; + if (y < 
axis) { + output[dstOffset] = input[srcOffset]; + } else { + output[dstOffset] = 0.0; + } + } +} + +void PackBuffer(void* output, const void* input, const PackInfo* info, int bytes, CUDARuntime* runtime) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + int threadNumbers = prop.maxThreadsPerBlock; + if (info->axis % 4 == 0 && info->axisStride == 1 && \ + bytes == 4 && info->insideStride == info->axis) { + PACKCOMMON_4<<>>((const int4*)input, (int4*)output, + info->inside, info->axis / 4, info->outside, + info->insideStride / 4, info->axisStride); + return; + } + switch (bytes) { + case 4: + PACKCOMMON<<>>((const float*)input, (float*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); + break; + case 2: + PACKCOMMON<<>>((const half*)input, (half*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); + break; + case 1: + PACKCOMMON<<>>((const int8_t*)input, (int8_t*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); + break; + default: + break; + } +} +void UnpackBuffer(void* output, const void* input, const PackInfo* info, int bytes, CUDARuntime* runtime) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + int threadNumbers = prop.maxThreadsPerBlock; + + if (info->axis % 4 == 0 && info->axisStride == 1 && bytes == 4 && info->insideStride == info->axis) { + DivModFast is(info->inside); + DivModFast os(info->outside); + const int maxCount = info->inside * UP_DIV(info->axis / 4, 4) * 4 * info->outside; + int block_num = runtime->blocks_num(maxCount); + int block_size = runtime->threads_num(); + UNPACKCOMMON_4<<>>((const int4*)input, (int4*)output, + maxCount, info->inside, info->axis / 4, info->outside, + info->insideStride / 4, info->axisStride, is, os); + return; + } + switch (bytes) { + case 4: + UNPACKCOMMON<<>>((const float*)input, (float*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); + break; + case 2: + UNPACKCOMMON<<>>((const half*)input, (half*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); + break; + case 1: + UNPACKCOMMON<<>>((const int8_t*)input, (int8_t*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); + break; + default: + break; + } +} + +void PackFP32ToFP16(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + int threadNumbers = prop.maxThreadsPerBlock; + PACKCOMMON<<>>((const float*)input, (half*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); +} +void PackFP16ToFP32(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + int threadNumbers = prop.maxThreadsPerBlock; + PACKCOMMON<<>>((const half*)input, (float*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); +} + +void UnpackFP16ToFP32(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + int threadNumbers = prop.maxThreadsPerBlock; + UNPACKCOMMON<<>>((const half*)input, (float*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); +} +void UnpackFP32ToFP16(void* output, const void* input, const PackInfo* info, 
CUDARuntime* runtime) { + auto& prop = runtime->prop(); + int cores = prop.multiProcessorCount; + int threadNumbers = prop.maxThreadsPerBlock; + UNPACKCOMMON<<>>((const float*)input, (half*)output, + info->inside, info->axis, info->outside, + info->insideStride, info->axisStride); +} + + + +template +__global__ void TRANSPOSE(const T *input, T *output, const TransposeParam* param) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < param->total) { + int x = i % param->dims[0]; + int tmp = i / param->dims[0]; + int y = tmp % param->dims[1]; + int z = tmp / param->dims[1]; + int srcOffset = param->srcStride * z + y + x * param->dims[2]; + int dstOffset = param->dstStride * z + x + y * param->dims[3]; + output[dstOffset] = input[srcOffset]; + } +} +#define LOCAL_DIM 8 + +template +__global__ void TRANSPOSE_LOCAL(const T* input, T *output, const TransposeParam* param) { + __shared__ T localM[LOCAL_DIM][LOCAL_DIM + 1]; + int num = blockIdx.z; + for (int n = num; n < param->size; n += gridDim.z) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x < param->dims[0] && y < param->dims[1]) { + int offset = n * param->srcStride + x * param->dims[2] + y; + localM[threadIdx.y][threadIdx.x] = input[offset]; + } + __syncthreads(); + x = blockIdx.y * blockDim.y + threadIdx.x; + y = blockIdx.x * blockDim.x + threadIdx.y; + if (x < param->dims[1] && y < param->dims[0]) { + int offset = n * param->dstStride + x * param->dims[3] + y; + output[offset] = localM[threadIdx.x][threadIdx.y]; + } + } +} + +void Transpose(uint8_t* output, const uint8_t* input, const TransposeParam* cpuParam, const TransposeParam* gpuRegion, int bytes, CUDARuntime* runtime) { + int count = cpuParam->total; + int block_num = runtime->blocks_num(count); + int threads_num = runtime->threads_num(); + auto out = output + bytes * cpuParam->dstOffset; + auto inp = input + bytes * cpuParam->srcOffset; + if (runtime->prop().maxThreadsPerBlock >= LOCAL_DIM * LOCAL_DIM && (cpuParam->dims[0] >= LOCAL_DIM || cpuParam->dims[1] >= LOCAL_DIM)) { + dim3 localSize(LOCAL_DIM, LOCAL_DIM, 1); + //printf("%d, %d - %d, %d - %d\n", cpuParam->size, cpuParam->dims[0], cpuParam->dims[1], cpuParam->dims[2], cpuParam->dims[3]); + int globalZ = ALIMIN(runtime->prop().multiProcessorCount, cpuParam->size); + dim3 globalSize(UP_DIV(cpuParam->dims[0], LOCAL_DIM), UP_DIV(cpuParam->dims[1], LOCAL_DIM), globalZ); + switch (bytes) { + case 4: + TRANSPOSE_LOCAL<<>>((const float *)inp, (float *)out, gpuRegion); + break; + case 2: + TRANSPOSE_LOCAL<<>>((const half *)inp, (half *)out, gpuRegion); + break; + case 1: + TRANSPOSE_LOCAL<<>>((const int8_t *)inp, (int8_t *)out, gpuRegion); + break; + default: + break; + } + return; + } + switch (bytes) { + case 4: + TRANSPOSE<<>>((int*)inp, (int*)out, gpuRegion); + break; + case 2: + TRANSPOSE<<>>((int16_t*)inp, (int16_t*)out, gpuRegion); + break; + case 1: + TRANSPOSE<<>>((int8_t*)inp, (int8_t*)out, gpuRegion); + break; + default: + break; + } +} + +}; +}; \ No newline at end of file diff --git a/source/backend/cuda/execution/Transpose.cuh b/source/backend/cuda/execution/Transpose.cuh new file mode 100644 index 00000000..480369e6 --- /dev/null +++ b/source/backend/cuda/execution/Transpose.cuh @@ -0,0 +1,44 @@ +// +// Transpose.cuh +// MNN +// +// Created by MNN on b'2021/12/09'. 
+// Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifndef Transpose_cuh
+#define Transpose_cuh
+#include "backend/cuda/core/runtime/CUDARuntime.hpp"
+namespace MNN {
+namespace CUDA {
+
+struct PackInfo {
+    int outside;
+    int inside;
+    int axis;
+    int unit;
+    int insideStride;
+    int axisStride;
+};
+void UnpackBuffer(void* output, const void* input, const PackInfo* info, int bytes, CUDARuntime* runtime);
+void PackBuffer(void* output, const void* input, const PackInfo* info, int bytes, CUDARuntime* runtime);
+void PackFP16ToFP32(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime);
+void PackFP32ToFP16(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime);
+void UnpackFP16ToFP32(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime);
+void UnpackFP32ToFP16(void* output, const void* input, const PackInfo* info, CUDARuntime* runtime);
+
+struct TransposeParam {
+    int dims[4];
+    int srcOffset;
+    int srcStride;
+    int dstOffset;
+    int dstStride;
+    int size;
+    int total;
+};
+void Transpose(uint8_t* output, const uint8_t* input, const TransposeParam* cpuParam, const TransposeParam* gpuRegion, int bytes, CUDARuntime* runtime);
+
+}
+}
+
+#endif
diff --git a/source/backend/cuda/execution/UnaryExecution.cu b/source/backend/cuda/execution/UnaryExecution.cu
index 42e0ea5a..6d071d8b 100644
--- a/source/backend/cuda/execution/UnaryExecution.cu
+++ b/source/backend/cuda/execution/UnaryExecution.cu
@@ -21,7 +21,7 @@ void callUnary(void *input, void *output, size_t count, MNN::CUDARuntime* runtim
 {
     Tensor::InsideDescribe::Region reg;
     reg.size[2] = count;
-    UnaryBlit((uint8_t*)output, (const uint8_t*)input, reg.size, reg.src.stride, reg.dst.stride, 4, runtime, op_type);
+    UnaryBlit((uint8_t*)output, (const uint8_t*)input, reg.size, reg.src.stride, reg.dst.stride, data_type.bytes(), runtime, op_type);
     return;
 }
@@ -41,6 +41,9 @@ ErrorCode UnaryExecution::onExecute(const std::vector& inputs, const st
     MNN_PRINT("start UnaryExecution onExecute...");
 #endif
     auto type = inputs[0]->getType();
+    if (static_cast<CUDABackend*>(backend())->useFp16()) {
+        type.bits = 16;
+    }
     callUnary((void*)inputs[0]->deviceId(), (void*)outputs[0]->deviceId(), mCount, mRuntime, type, mOpType);
 #ifdef LOG_VERBOSE
     MNN_PRINT("end UnaryExecution onExecute...");
@@ -58,6 +61,15 @@ __global__ void RELU(const float *input, float *output, size_t count, float slop
     return;
 }
+__global__ void RELU_Half(const half *input, half *output, size_t count, float slope) {
+    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
+        float x = input[i];
+        float y = x > 0 ?
x : x * slope; + output[i] = (half)y; + } + return; +} + class ReluExecution : public Execution { public: ReluExecution(Backend* bn, float slope) : Execution(bn) { @@ -71,7 +83,11 @@ public: int threads_num = runtime->threads_num(); auto input = inputs[0]->deviceId(); auto output = outputs[0]->deviceId(); - RELU<<>>((float*)input, (float*)output, count, mSlope); + if (static_cast(backend())->useFp16()) { + RELU_Half<<>>((half*)input, (half*)output, count, mSlope); + } else { + RELU<<>>((float*)input, (float*)output, count, mSlope); + } return NO_ERROR; } private: @@ -79,7 +95,8 @@ private: }; -__global__ void CLAMP(const float *input, float *output, size_t count, float minV, float maxV) { +template +__global__ void CLAMP(const T *input, T *output, size_t count, float minV, float maxV) { for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { float x = input[i]; float y = min(max(x, minV), maxV); @@ -101,7 +118,11 @@ public: int threads_num = runtime->threads_num(); auto input = inputs[0]->deviceId(); auto output = outputs[0]->deviceId(); - CLAMP<<>>((float*)input, (float*)output, count, mMinV, mMaxV); + if (static_cast(backend())->useFp16()) { + CLAMP<<>>((half*)input, (half*)output, count, mMinV, mMaxV); + } else { + CLAMP<<>>((float*)input, (float*)output, count, mMinV, mMaxV); + } return NO_ERROR; } private: @@ -117,6 +138,14 @@ __global__ void CAST(T1 *input, T2 *output, size_t count) { return; } +template +__global__ void CASTMIDFLOAT(T1 *input, T2 *output, size_t count) { + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { + output[i] = (T2)((float)input[i]); + } + return; +} + __global__ void CASTBOOL(int32_t *input, int32_t *output, size_t count) { for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) { output[i] = input[i] > 0 ? 
1 : 0; @@ -152,29 +181,52 @@ public: auto dstT = _mapDataType(mDst); const auto &inputDataType = inputs[0]->getType(); - + if (inputs[0]->buffer().type == outputs[0]->buffer().type) { + runtime->memcpy((void*)output, (void*)input, count * static_cast(backend())->getBytes(inputs[0]), MNNMemcpyDeviceToDevice, true); + return NO_ERROR; + } if (inputDataType.bytes() == 4 && mDst == MNN::DataType_DT_BOOL) { CASTBOOL<<>>((int32_t*)input, (int32_t*)output, count); - } else if (inputs[0]->buffer().type == outputs[0]->buffer().type) { - runtime->memcpy((void*)output, (void*)input, count * inputDataType.bytes(), MNNMemcpyDeviceToDevice, true); - } else if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { - CAST<<>>((float*)input, (int*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CAST<<>>((int*)input, (float*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CAST<<>>((uint8_t*)input, (float*)output, count); - } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { - CAST<<>>((int8_t*)input, (float*)output, count); - } else if (dstT == MNN::DataType_DT_INT8 && halide_type_of() == inputDataType) { - CAST<<>>((float*)input, (int8_t*)output, count); - } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { - CAST<<>>((float*)input, (uint8_t*)output, count); + return NO_ERROR; + } + if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { + CAST<<>>((int8_t*)input, (int32_t*)output, count); + return NO_ERROR; } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { CAST<<>>((int32_t*)input, (uint8_t*)output, count); + return NO_ERROR; } else if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { CAST<<>>((uint8_t*)input, (int32_t*)output, count); - } else if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { - CAST<<>>((int8_t*)input, (int32_t*)output, count); + return NO_ERROR; + } + if (static_cast(backend())->useFp16()) { + if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((half*)input, (int*)output, count); + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int*)input, (half*)output, count); + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((uint8_t*)input, (half*)output, count); + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int8_t*)input, (half*)output, count); + } else if (dstT == MNN::DataType_DT_INT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((half*)input, (int8_t*)output, count); + } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((half*)input, (uint8_t*)output, count); + } + } else { + if (dstT == MNN::DataType_DT_INT32 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((float*)input, (int*)output, count); + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int*)input, (float*)output, count); + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((uint8_t*)input, (float*)output, count); + } else if (dstT == MNN::DataType_DT_FLOAT && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((int8_t*)input, (float*)output, count); + } else if (dstT == 
MNN::DataType_DT_INT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((float*)input, (int8_t*)output, count); + } else if (dstT == MNN::DataType_DT_UINT8 && halide_type_of() == inputDataType) { + CASTMIDFLOAT<<>>((float*)input, (uint8_t*)output, count); + } } return NO_ERROR; } diff --git a/source/backend/metal/MetalBackend.hpp b/source/backend/metal/MetalBackend.hpp index 8fc6495e..d2cca918 100644 --- a/source/backend/metal/MetalBackend.hpp +++ b/source/backend/metal/MetalBackend.hpp @@ -95,8 +95,8 @@ public: // Do nothing } virtual ~ MetalRuntimeAllocator() = default; - virtual std::pair onAlloc(int size, int align) override; - virtual void onRelease(std::pair ptr) override; + virtual std::pair onAlloc(size_t size, size_t align) override; + virtual void onRelease(std::pair ptr) override; private: id mDevice; diff --git a/source/backend/metal/MetalBackend.mm b/source/backend/metal/MetalBackend.mm index 96a49870..ba697a8d 100644 --- a/source/backend/metal/MetalBackend.mm +++ b/source/backend/metal/MetalBackend.mm @@ -841,12 +841,12 @@ bool MetalRuntime::onSetCache(const void* buffer, size_t size) {//set Cache return setCache(std::make_pair(buffer, size)); } -std::pair MetalRuntimeAllocator::onAlloc(int size, int align) { +std::pair MetalRuntimeAllocator::onAlloc(size_t size, size_t align) { auto buffer = [mDevice newBufferWithLength:size options:MTLCPUCacheModeDefaultCache]; auto mMetalBufferAlloc = new MetalBufferAlloc(buffer); return std::make_pair((void *)mMetalBufferAlloc, 0); } -void MetalRuntimeAllocator::onRelease(std::pair ptr) { +void MetalRuntimeAllocator::onRelease(std::pair ptr) { delete (MetalBufferAlloc *)ptr.first; } diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp index dc402dd9..dd557294 100644 --- a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp +++ b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp @@ -579,21 +579,21 @@ cl_int CL_API_CALL clEnqueueCopyImage(cl_command_queue queue, //} // clSVMAlloc wrapper, use OpenCLWrapper function. -void *clSVMAlloc(cl_context context, cl_mem_flags flags, size_t size, cl_uint align) { +void* CL_API_CALL clSVMAlloc(cl_context context, cl_mem_flags flags, size_t size, cl_uint align) { auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clSVMAlloc; MNN_CHECK_NOTNULL(func); return func(context, flags, size, align); } // clSVMFree wrapper, use OpenCLWrapper function. -void clSVMFree(cl_context context, void *buffer) { +void CL_API_CALL clSVMFree(cl_context context, void *buffer) { auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clSVMFree; MNN_CHECK_NOTNULL(func); func(context, buffer); } // clEnqueueSVMMap wrapper, use OpenCLWrapper function. -cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking, cl_map_flags flags, void *host_ptr, +cl_int CL_API_CALL clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking, cl_map_flags flags, void *host_ptr, size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) { auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clEnqueueSVMMap; MNN_CHECK_NOTNULL(func); @@ -601,7 +601,7 @@ cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking, cl_map_ } // clEnqueueSVMUnmap wrapper, use OpenCLWrapper function. 
-cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *host_ptr, cl_uint num_events_in_wait_list, +cl_int CL_API_CALL clEnqueueSVMUnmap(cl_command_queue command_queue, void *host_ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) { auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clEnqueueSVMUnmap; MNN_CHECK_NOTNULL(func); @@ -609,7 +609,7 @@ cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *host_ptr, cl_uint } // clSetKernelArgSVMPointer wrapper, use OpenCLWrapper function. -cl_int clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint index, const void *host_ptr) { +cl_int CL_API_CALL clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint index, const void *host_ptr) { auto func = MNN::OpenCLSymbolsOperator::getOpenclSymbolsPtr()->clSetKernelArgSVMPointer; MNN_CHECK_NOTNULL(func); return func(kernel, index, host_ptr); diff --git a/source/backend/vulkan/component/VulkanMemoryPool.cpp b/source/backend/vulkan/component/VulkanMemoryPool.cpp index 993fc69b..2d97e118 100644 --- a/source/backend/vulkan/component/VulkanMemoryPool.cpp +++ b/source/backend/vulkan/component/VulkanMemoryPool.cpp @@ -25,7 +25,7 @@ public: virtual ~ VulkanAllocator() { // Do nothing } - virtual std::pair onAlloc(int size, int align) override { + virtual std::pair onAlloc(size_t size, size_t align) override { VkMemoryAllocateInfo info; info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; info.pNext = nullptr; @@ -34,7 +34,7 @@ public: auto mem = new VulkanMemory(mDevice, info); return std::make_pair(mem, 0); } - virtual void onRelease(std::pair ptr) override { + virtual void onRelease(std::pair ptr) override { auto p = (VulkanMemory*)ptr.first; delete p; } diff --git a/source/common/WinogradInt8Helper.hpp b/source/common/WinogradInt8Helper.hpp index 3a370c50..6cc92ec4 100644 --- a/source/common/WinogradInt8Helper.hpp +++ b/source/common/WinogradInt8Helper.hpp @@ -13,7 +13,7 @@ #include "core/Macro.h" namespace MNN { -class MNN_PUBLIC WinogradInt8Helper { +class WinogradInt8Helper { public: static void transformWeight(const std::vector& weight, std::vector& transWeight, std::vector& attrs, int oc, int ic, int kernelY, int kernelX) { diff --git a/source/core/BufferAllocator.cpp b/source/core/BufferAllocator.cpp index c168b868..57a1a396 100644 --- a/source/core/BufferAllocator.cpp +++ b/source/core/BufferAllocator.cpp @@ -20,10 +20,10 @@ public: virtual ~ DefaultAllocator() { // Do nothing } - virtual std::pair onAlloc(int size, int align) { + virtual std::pair onAlloc(size_t size, size_t align) { return std::make_pair(MNNMemoryAllocAlign(size, MNN_MEMORY_ALIGN_DEFAULT), 0); } - virtual void onRelease(std::pair ptr) { + virtual void onRelease(std::pair ptr) { MNN_ASSERT(ptr.second == 0); MNNMemoryFreeAlign(ptr.first); } @@ -36,10 +36,10 @@ public: virtual ~ RecurseAllocator() { // Do nothing } - virtual std::pair onAlloc(int size, int align) override { + virtual std::pair onAlloc(size_t size, size_t align) override { return mParent->alloc(size, false, align); } - virtual void onRelease(std::pair ptr) override { + virtual void onRelease(std::pair ptr) override { mParent->free(ptr); } private: @@ -62,7 +62,7 @@ BufferAllocator::Node::~Node() { outside->onRelease(pointer); } } -std::pair BufferAllocator::alloc(int size, bool seperate, int align) { +std::pair BufferAllocator::alloc(size_t size, bool seperate, size_t align) { #ifdef DUMP_USAGE auto memoryUsed = size / 1024.0f / 1024.0f; MNN_PRINT("Alloc: %f\n", memoryUsed); @@ -70,7 +70,7 @@ std::pair 
BufferAllocator::alloc(int size, bool seperate, int align) if (0 == align) { align = mAlign; } - std::pair pointer; + std::pair pointer; // reuse if possible if (!seperate) { if (nullptr != mCurrentFreeList) { @@ -138,7 +138,7 @@ void BufferAllocator::returnMemory(FREELIST* listP, SharedPtr node, bool p } } -bool BufferAllocator::free(std::pair pointer) { +bool BufferAllocator::free(std::pair pointer) { // get node auto x = mUsedList.find(pointer); if (x == mUsedList.end()) { @@ -202,11 +202,11 @@ void BufferAllocator::endGroup() { mCurrentFreeList = nullptr; } -std::pair BufferAllocator::getFromFreeList(FREELIST* list, int size, bool permiteSplit, int align) { +std::pair BufferAllocator::getFromFreeList(FREELIST* list, size_t size, bool permiteSplit, size_t align) { #ifdef MNN_DEBUG_MEMORY return std::make_pair(nullptr, 0); #endif - int realSize = size; + size_t realSize = size; bool needExtraSize = mAlign % align != 0; if (needExtraSize) { realSize = size + align - 1; @@ -220,7 +220,7 @@ std::pair BufferAllocator::getFromFreeList(FREELIST* list, int size, auto pointer = x->second->pointer; // Align offset if (needExtraSize) { - int originOffset = pointer.second; + size_t originOffset = pointer.second; pointer.second = UP_DIV(originOffset, align) * align; realSize = size + pointer.second - originOffset; } diff --git a/source/core/BufferAllocator.hpp b/source/core/BufferAllocator.hpp index 1cb71817..447a370c 100644 --- a/source/core/BufferAllocator.hpp +++ b/source/core/BufferAllocator.hpp @@ -25,8 +25,8 @@ public: public: Allocator() = default; virtual ~ Allocator() = default; - virtual std::pair onAlloc(int size, int align) = 0; - virtual void onRelease(std::pair ptr) = 0; + virtual std::pair onAlloc(size_t size, size_t align) = 0; + virtual void onRelease(std::pair ptr) = 0; static std::shared_ptr createDefault(); static std::shared_ptr createRecurse(BufferAllocator* parent); }; @@ -34,7 +34,7 @@ public: * @brief init buffer allocator with pointer alignment. * @param align given pointer alignment. */ - BufferAllocator(std::shared_ptr parent, int align = MNN_MEMORY_ALIGN_DEFAULT) : mAllocator(parent), mAlign(align) { + BufferAllocator(std::shared_ptr parent, size_t align = MNN_MEMORY_ALIGN_DEFAULT) : mAllocator(parent), mAlign(align) { // nothing to do } /** @@ -53,7 +53,7 @@ public: * @sa free * @sa release */ - std::pair alloc(int size, bool seperate = false, int align = 0); + std::pair alloc(size_t size, bool seperate = false, size_t align = 0); /** * @brief mark CHUNK pointer as reusable. @@ -61,7 +61,7 @@ public: * @return true if pointer is a CHUNK pointer, false otherwise. * @sa release */ - bool free(std::pair pointer); + bool free(std::pair pointer); /** * @brief free all allocated memories. 
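// Editorial sketch (not part of the original patch): the int -> size_t change
// throughout this allocator interface is what allows a single allocation or
// offset to exceed INT_MAX bytes on 64-bit targets. A minimal illustration of
// the overflow the wider type avoids:
#include <climits>
#include <cstddef>
// A hypothetical 3 GiB request: fine as size_t on a 64-bit build, but it does
// not fit in the old `int size` parameter (INT_MAX is about 2.1 GB).
static const size_t kThreeGiB = 3ull * 1024 * 1024 * 1024;
static const bool kFitsInInt = kThreeGiB <= (size_t)INT_MAX; // false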
@@ -96,26 +96,26 @@ private: class Node : public RefCount { public: ~Node(); - std::pair pointer; + std::pair pointer; SharedPtr parent = nullptr; - int32_t size; - int16_t useCount = 0; + size_t size; + size_t useCount = 0; Allocator* outside = nullptr; }; typedef std::multimap> FREELIST; static void returnMemory(FREELIST* list, SharedPtr node, bool permitMerge = true); - std::pair getFromFreeList(FREELIST* list, int size, bool permiteSplit, int align); + std::pair getFromFreeList(FREELIST* list, size_t size, bool permiteSplit, size_t align); - std::map, SharedPtr> mUsedList; + std::map, SharedPtr> mUsedList; FREELIST mFreeList; size_t mTotalSize = 0; FREELIST* mCurrentFreeList = nullptr; std::vector> mGroups; std::shared_ptr mAllocator; - int mAlign; + size_t mAlign; }; } // namespace MNN #endif diff --git a/source/core/Interpreter.cpp b/source/core/Interpreter.cpp index 04a94086..ca12094a 100644 --- a/source/core/Interpreter.cpp +++ b/source/core/Interpreter.cpp @@ -18,6 +18,7 @@ #include "core/Pipeline.hpp" #include "core/RuntimeFactory.hpp" #include "core/Session.hpp" +#include #ifdef MNN_INTERNAL_ENABLED #include "internal/auth/ModelAuth.hpp" @@ -37,6 +38,8 @@ struct Content { std::string cacheFile; std::mutex lock; size_t lastCacheSize = 0; + std::string bizCode; + std::string uuid; }; static void writeCacheFile(const Content *net, std::pair buffer) { @@ -215,6 +218,9 @@ ErrorCode Interpreter::updateCacheFile(Session *session, int flag) { Interpreter::Interpreter(Content* net) { MNN_ASSERT(nullptr != net); mNet = net; + // Store bizcode and uuid because we need them even after `releaseModel` is called. + mNet->bizCode = std::string(mNet->net->bizCode() ? mNet->net->bizCode()->c_str() : ""); + mNet->uuid = std::string(mNet->net->mnn_uuid() ? mNet->net->mnn_uuid()->c_str() : ""); } Interpreter::~Interpreter() { @@ -296,12 +302,10 @@ Session* Interpreter::createMultiPathSession(const std::vector& mNet->sessions.emplace_back(std::move(newSession)); #ifdef MNN_INTERNAL_ENABLED - std::string bizCode = std::string(mNet->net->bizCode() ? mNet->net->bizCode()->c_str() : ""); - std::string uuid = std::string(mNet->net->mnn_uuid() ? mNet->net->mnn_uuid()->c_str() : ""); std::map metrics; - metrics.emplace("Model_UUID", uuid); - metrics.emplace("Model_BizCode", bizCode); - metrics.emplace("Event", "CreateSession"); + metrics.emplace("Model_UUID", mNet->uuid); + metrics.emplace("Model_BizCode", mNet->bizCode); + metrics.emplace("Event", "CREATE_SESSION"); metrics.emplace("Backend", std::to_string(configs[0].type)); metrics.emplace("Precision", configs[0].backendConfig ? std::to_string(configs[0].backendConfig->precision) : ""); metrics.emplace("API", "Interpreter::createMultiPathSession"); @@ -342,7 +346,32 @@ bool Interpreter::releaseSession(Session* session) { } ErrorCode Interpreter::runSession(Session* session) const { - return session->run(); + Timer timer; + ErrorCode errorcode = session->run(); + +#ifdef MNN_INTERNAL_ENABLED + int backendType[MNN_FORWARD_ALL] ; + session->getInfo(MNN::Interpreter::BACKENDS, backendType); + + // Only log the performance of CPU backend inference. + if (backendType[0] == MNN_FORWARD_CPU) { + float costTime = (float)timer.durationInUs() / (float)1000; + std::map metrics; + metrics.emplace("Model_UUID", mNet->uuid); + metrics.emplace("Model_BizCode", mNet->bizCode); + metrics.emplace("Event", "RUN_SESSION"); + metrics.emplace("Backend", std::to_string(MNN_FORWARD_CPU)); // "Precision" is not logged here. Don't need it. 
+ metrics.emplace("InferTimeMs", std::to_string(costTime)); + metrics.emplace("ErrorCode", std::to_string(errorcode)); + metrics.emplace("API", "Interpreter::runSession"); + auto basicMetrics = getBasicLoggingData(); + metrics.insert(basicMetrics.begin(), basicMetrics.end()); + logAsync(metrics); + return errorcode; + } +#endif // MNN_INTERNAL_ENABLED + + return errorcode; } Tensor* Interpreter::getSessionInput(const Session* session, const char* name) { @@ -405,7 +434,33 @@ ErrorCode Interpreter::runSessionWithCallBack(const Session* session, const Tens ErrorCode Interpreter::runSessionWithCallBackInfo(const Session* session, const TensorCallBackWithInfo& before, const TensorCallBackWithInfo& callBack, bool sync) const { - return session->runWithCallBack(before, callBack, sync); + + Timer timer; + ErrorCode errorcode = session->runWithCallBack(before, callBack, sync); + +#ifdef MNN_INTERNAL_ENABLED + int backendType[MNN_FORWARD_ALL]; + session->getInfo(MNN::Interpreter::BACKENDS, backendType); + + // Only log the performance of CPU backend inference. + if (backendType[0] == MNN_FORWARD_CPU) { + float costTime = (float)timer.durationInUs() / (float)1000; + std::map metrics; + metrics.emplace("Model_UUID", mNet->uuid); + metrics.emplace("Model_BizCode", mNet->bizCode); + metrics.emplace("Event", "RUN_SESSION"); + metrics.emplace("Backend", std::to_string(MNN_FORWARD_CPU)); // "Precision" is not logged here. Don't need it. + metrics.emplace("InferTimeMs", std::to_string(costTime)); + metrics.emplace("ErrorCode", std::to_string(errorcode)); + metrics.emplace("API", "Interpreter::runSessionWithCallBackInfo"); + auto basicMetrics = getBasicLoggingData(); + metrics.insert(basicMetrics.begin(), basicMetrics.end()); + logAsync(metrics); + return errorcode; + } +#endif // MNN_INTERNAL_ENABLED + + return errorcode; } const Backend* Interpreter::getBackend(const Session* session, const Tensor* tensor) const { @@ -461,8 +516,11 @@ void Interpreter::resizeTensor(Tensor* tensor, const std::vector& dims) { } const char* Interpreter::bizCode() const { - const flatbuffers::String* code = mNet->net->bizCode(); - return code ? 
code->c_str() : ""; + return mNet->bizCode.c_str(); +} + +const char* Interpreter::uuid() const { + return mNet->uuid.c_str(); } std::pair Interpreter::getModelBuffer() const { diff --git a/source/cv/ImageProcess.cpp b/source/cv/ImageProcess.cpp index eab060b9..c0796632 100644 --- a/source/cv/ImageProcess.cpp +++ b/source/cv/ImageProcess.cpp @@ -65,16 +65,6 @@ ImageProcess::ImageProcess(const Config& config) { ImageProcess* ImageProcess::create(const Config& config, const Tensor* dstTensor) { // TODO Get dstTensor' backend - #ifdef _MSC_VER - auto cpuFlags = libyuv::InitCpuFlags(); - bool support = true; - support = support && (cpuFlags & libyuv::kCpuHasSSSE3); // _mm_shuffle_epi8 - support = support && (cpuFlags & libyuv::kCpuHasSSE41); // _mm_cvtepu8_epi32 - if (!support) { - MNN_ERROR("CPU must support SSSE3 and SSE4.1 for using ImageProcess\n"); - return nullptr; - } - #endif return new ImageProcess(config); } @@ -192,12 +182,23 @@ ErrorCode ImageProcess::convert(const uint8_t* source, int iw, int ih, int strid if (0 == oc) { oc = _getBpp(mInside->config.destFormat); } - auto ins = { createImageTensor(halide_type_of(), iw, ih, ic, (void*)source) }; - auto outs = { createImageTensor(type, ow, oh, oc, dest) }; + std::unique_ptr input(createImageTensor(halide_type_of(), iw, ih, ic, (void*)source)), + output(createImageTensor(type, ow, oh, oc, dest)); + auto ins = { input.get() }; + auto outs = { output.get() }; mInside->execution->setPadVal(this->mPaddingValue); mInside->execution->onResize(ins, outs); mInside->execution->onExecute(ins, outs); return NO_ERROR; } + +void ImageProcess::draw(uint8_t* img, int w, int h, int c, const int* regions, int num, const uint8_t* color) { + std::unique_ptr imgTensor(createImageTensor(halide_type_of(), w, h, c, (void*)img)), + regionTensor(Tensor::create(std::vector{num, 3}, halide_type_of(), (void*)regions)), + colorTensor(Tensor::create(std::vector{c}, halide_type_of(), (void*)color)); + auto ins = { imgTensor.get(), regionTensor.get(), colorTensor.get() }; + mInside->execution->onResize(ins, {}); + mInside->execution->onExecute(ins, {}); +} } // namespace CV } // namespace MNN diff --git a/source/geometry/GeometryGather.cpp b/source/geometry/GeometryGather.cpp index a7d31d12..9c49ef75 100644 --- a/source/geometry/GeometryGather.cpp +++ b/source/geometry/GeometryGather.cpp @@ -146,10 +146,10 @@ public: auto size = (int*)rgcmd->size()->data(); size[0] = outside; size[2] = inside; - auto view0Stride = (int*)rgcmd->view()->GetAs(0)->stride(); + auto view0Stride = (int*)rgcmd->view()->GetAs(0)->stride()->data(); view0Stride[0] = inside * N; view0Stride[1] = inside; - auto view1Stride = (int*)rgcmd->view()->GetAs(1)->stride(); + auto view1Stride = (int*)rgcmd->view()->GetAs(1)->stride()->data(); view1Stride[0] = inside * params->length(axis); view1Stride[1] = inside; return true; diff --git a/source/geometry/GeometryOPRegister.cpp b/source/geometry/GeometryOPRegister.cpp index fcd7163b..3a6c414d 100644 --- a/source/geometry/GeometryOPRegister.cpp +++ b/source/geometry/GeometryOPRegister.cpp @@ -13,7 +13,6 @@ extern void ___GeometryBroadcastTo___create__(); extern void ___GeometryConvert___create__(); extern void ___GeometryCosineSimilarity___create__(); extern void ___GeometryImageOp___create__(); -extern void ___GeometryGather___create__(); extern void ___GeometryCrop___create__(); extern void ___GeometryStridedSlice___create__(); extern void ___GeometrySelect___create__(); @@ -53,7 +52,6 @@ ___GeometryBroadcastTo___create__(); 
___GeometryConvert___create__(); ___GeometryCosineSimilarity___create__(); ___GeometryImageOp___create__(); -___GeometryGather___create__(); ___GeometryCrop___create__(); ___GeometryStridedSlice___create__(); ___GeometrySelect___create__(); diff --git a/source/geometry/GeometrySelect.cpp b/source/geometry/GeometrySelect.cpp index 64e2f3eb..d9a0c120 100644 --- a/source/geometry/GeometrySelect.cpp +++ b/source/geometry/GeometrySelect.cpp @@ -26,7 +26,7 @@ public: if (outputSize != inputL0) { std::shared_ptr newTensor(new Tensor); TensorUtils::copyShape(output, newTensor.get(), true); - newTensor->buffer().type = output->buffer().type; + newTensor->buffer().type = input0->buffer().type; ConvertUtils::broadcastto(input0, newTensor.get()); input0 = newTensor.get(); res.extras.emplace_back(newTensor); diff --git a/source/geometry/GeometryShape.cpp b/source/geometry/GeometryShape.cpp index e37e20db..2f31becd 100644 --- a/source/geometry/GeometryShape.cpp +++ b/source/geometry/GeometryShape.cpp @@ -221,6 +221,47 @@ public: } }; +class GeometryRaster : public GeometryComputer { +public: + virtual bool onCompute(const Op* op, const std::vector& inputs, const std::vector& outputs, + Context& context, CommandBuffer& res) const override { + auto extra = op->main_as_Extra(); + if (!extra) { + return true; + } + auto output = outputs[0]; + auto outputDes = TensorUtils::getDescribe(output); + outputDes->regions.resize(inputs.size()); + outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; + for (int i = 0; i < extra->attr()->size(); i++) { + auto attr = extra->attr()->Get(i); + if (attr->key()->str() == "region") { + int len = attr->list()->i()->size(); + MNN_ASSERT(inputs.size() * 11 == len); + + for (int j = 0; j < inputs.size(); j++) { + auto& region = outputDes->regions[j]; +#define _GET(x) attr->list()->i()->Get(j * 11 + x) + region.src.offset = _GET(0); + region.src.stride[0] = _GET(1); + region.src.stride[1] = _GET(2); + region.src.stride[2] = _GET(3); + region.dst.offset = _GET(4); + region.dst.stride[0] = _GET(5); + region.dst.stride[1] = _GET(6); + region.dst.stride[2] = _GET(7); + region.size[0] = _GET(8); + region.size[1] = _GET(9); + region.size[2] = _GET(10); + region.origin = inputs[j]; +#undef _GET + } + } + } + return true; + } +}; + static void _create() { std::shared_ptr comp(new GeometryShape); GeometryComputer::registerGeometryComputer(comp, {OpType_Shape}); @@ -230,6 +271,8 @@ static void _create() { GeometryComputer::registerGeometryComputer(comp2, {OpType_PriorBox}); std::shared_ptr comp3(new GeometrySize); GeometryComputer::registerGeometryComputer(comp3, {OpType_Size}); + std::shared_ptr comp4(new GeometryRaster); + GeometryComputer::registerGeometryComputer(comp4, {OpType_Raster}); } REGISTER_GEOMETRY(GeometryShape, _create); diff --git a/source/geometry/GeometryStridedSlice.cpp b/source/geometry/GeometryStridedSlice.cpp index 15185163..532a1ec3 100644 --- a/source/geometry/GeometryStridedSlice.cpp +++ b/source/geometry/GeometryStridedSlice.cpp @@ -9,6 +9,7 @@ #include "geometry/GeometryComputer.hpp" #include "core/OpCommonUtils.hpp" #include "core/Macro.h" +#include "ConvertUtils.hpp" namespace MNN { class GeometryStridedSlice : public GeometryComputer { public: @@ -247,6 +248,31 @@ public: reg.dst.stride[1] = reg.size[2]; reg.dst.stride[2] = 1; } + if (inputs.size() == 5) { + auto write = inputs[4]; + std::vector shape(outputShape, outputShape + shapeNum); + if (write->shape() != shape) { + std::shared_ptr newTensor(new Tensor); + newTensor->buffer().type = 
write->buffer().type; + newTensor->buffer().dimensions = shapeNum; + for (int i = 0; i < shapeNum; i++) { + newTensor->setLength(i, outputShape[i]); + } + ConvertUtils::broadcastto(write, newTensor.get()); + write = newTensor.get(); + res.extras.emplace_back(newTensor); + } + for (auto& reg : outputDes->regions) { + auto tmp = reg.dst; + reg.dst = reg.src; + reg.src = tmp; + reg.origin = write; + } + Tensor::InsideDescribe::Region region; + region.size[2] = input->elementSize(); + region.origin = input; + outputDes->regions.insert(outputDes->regions.begin(), region); + } return true; } }; diff --git a/source/shape/ShapeRegister.cpp b/source/shape/ShapeRegister.cpp index ea3d8590..6473a9db 100644 --- a/source/shape/ShapeRegister.cpp +++ b/source/shape/ShapeRegister.cpp @@ -1,6 +1,7 @@ // This file is generated by Shell for ops register namespace MNN { extern void ___ShapeSizeComputer__OpType_Shape__(); +extern void ___ShapeRasterComputer__OpType_Raster__(); extern void ___PriorBoxComputer__OpType_PriorBox__(); extern void ___ShapeBroadcastTo__OpType_BroadcastTo__(); extern void ___InterpComputer__OpType_Interp__(); @@ -106,6 +107,7 @@ extern void ___DeconvolutionSizeComputer__OpType_DeconvolutionDepthwise__(); void registerShapeOps() { ___ShapeSizeComputer__OpType_Shape__(); +___ShapeRasterComputer__OpType_Raster__(); ___PriorBoxComputer__OpType_PriorBox__(); ___ShapeBroadcastTo__OpType_BroadcastTo__(); ___InterpComputer__OpType_Interp__(); diff --git a/source/shape/ShapeReshape.cpp b/source/shape/ShapeReshape.cpp index d4389d86..a5bb9631 100644 --- a/source/shape/ShapeReshape.cpp +++ b/source/shape/ShapeReshape.cpp @@ -100,9 +100,7 @@ public: int totalSizeInput = 1; for (int i = 0; i < input->buffer().dimensions; ++i) { auto l = input->length(i); - if (l != 0) { - totalSizeInput *= l; - } + totalSizeInput *= l; } int determinAxis = -1; diff --git a/source/shape/ShapeResize.cpp b/source/shape/ShapeResize.cpp index 6ac9f16c..9bfae8a7 100644 --- a/source/shape/ShapeResize.cpp +++ b/source/shape/ShapeResize.cpp @@ -39,8 +39,14 @@ class ResizeComputer : public SizeComputer { class ImageProcessComputer : public SizeComputer { virtual bool onComputeSize(const MNN::Op *op, const std::vector &inputs, const std::vector &outputs) const override { - MNN_ASSERT(1 == inputs.size()); + MNN_ASSERT(1 == inputs.size() || inputs.size() == 3); MNN_ASSERT(1 == outputs.size()); + if (inputs.size() == 3) { + auto &output = outputs[0]->buffer(); + output.dimensions = 1; + output.dim[0].extent = 1; + return true; + } // copy dims auto &input = inputs[0]->buffer(); diff --git a/source/shape/ShapeScatterNd.cpp b/source/shape/ShapeScatterNd.cpp index 68bc107c..97d58bad 100644 --- a/source/shape/ShapeScatterNd.cpp +++ b/source/shape/ShapeScatterNd.cpp @@ -15,7 +15,7 @@ namespace MNN { class ShapeScatterNd : public SizeComputer { bool onComputeSize(const MNN::Op *op, const std::vector &inputs, const std::vector &outputs) const override { - MNN_ASSERT(3 == inputs.size()); + MNN_ASSERT(3 <= inputs.size()); auto indices = inputs[0]; auto updates = inputs[1]; auto shape = inputs[2]; diff --git a/source/shape/ShapeShape.cpp b/source/shape/ShapeShape.cpp index 3ef775df..eadc18f6 100644 --- a/source/shape/ShapeShape.cpp +++ b/source/shape/ShapeShape.cpp @@ -35,4 +35,32 @@ class ShapeSizeComputer : public SizeComputer { }; REGISTER_SHAPE(ShapeSizeComputer, OpType_Shape); + +class ShapeRasterComputer : public SizeComputer { + virtual bool onComputeSize(const MNN::Op* op, const std::vector& inputs, + const std::vector& outputs) 
const override { + MNN_ASSERT(1 <= inputs.size()); + MNN_ASSERT(1 == outputs.size()); + outputs[0]->buffer().type = inputs[0]->buffer().type; + auto extra = op->main_as_Extra(); + if (!extra) { + // copy dims + TensorUtils::copyShape(inputs[0], outputs[0], true); + } else { + for (int i = 0; i < extra->attr()->size(); i++) { + auto attr = extra->attr()->Get(i); + if (attr->key()->str() == "shape") { + int len = attr->list()->i()->size(); + outputs[0]->buffer().dimensions = len; + for (int j = 0; j < len; j++) { + outputs[0]->setLength(j, attr->list()->i()->Get(j)); + } + } + } + } + return true; + } +}; + +REGISTER_SHAPE(ShapeRasterComputer, OpType_Raster); } // namespace MNN diff --git a/source/shape/ShapeStridedSlice.cpp b/source/shape/ShapeStridedSlice.cpp index 891420f1..1cdfaf6a 100644 --- a/source/shape/ShapeStridedSlice.cpp +++ b/source/shape/ShapeStridedSlice.cpp @@ -16,7 +16,14 @@ class StridedSliceComputer : public SizeComputer { public: virtual bool onComputeSize(const MNN::Op *op, const std::vector &inputs, const std::vector &outputs) const override { - MNN_ASSERT(4 == inputs.size()); + // write to input + if (inputs.size() == 5) { + TensorUtils::copyShape(inputs[0], outputs[0], true); + outputs[0]->buffer().type = inputs[0]->buffer().type; + TensorUtils::getDescribe(outputs[0])->dimensionFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat; + return true; + } + MNN_ASSERT(4 <= inputs.size()); MNN_ASSERT(1 == outputs.size()); Tensor *input = inputs[0]; diff --git a/source/shape/ShapeWhere.cpp b/source/shape/ShapeWhere.cpp index 26c00d2b..d7aa1f72 100644 --- a/source/shape/ShapeWhere.cpp +++ b/source/shape/ShapeWhere.cpp @@ -39,8 +39,10 @@ class WhereSizeComputer : public SizeComputer { // support old version return true; } - // For compability + // For zeroshape input if (nullptr == inputs[0]->host()) { + ob.dimensions = 1; + ob.dim[0].extent = 0; return true; } int count = 0; @@ -56,6 +58,9 @@ class WhereSizeComputer : public SizeComputer { if (count > 0) { ob.dim[0].extent = count; + } else { + ob.dimensions = 1; + ob.dim[0].extent = 0; } return true; } diff --git a/test.bat b/test.bat new file mode 100644 index 00000000..85b21099 --- /dev/null +++ b/test.bat @@ -0,0 +1,7 @@ +if %1 EQU x86 ( + @call "%vs_env_setup%/vcvarsamd64_x86.bat" + powershell "%~dp0test.ps1" -gpu -x86 +) else ( + @call "%vs_env_setup%/vcvars64.bat" + powershell "%~dp0test.ps1" -gpu +) \ No newline at end of file diff --git a/test.ps1 b/test.ps1 new file mode 100644 index 00000000..3799f5ed --- /dev/null +++ b/test.ps1 @@ -0,0 +1,233 @@ +# Powershell Script must be save as UTF-8 with BOM, otherwise system-wide code page will be used, causing garbled code + +# MNN-CPU-GPU +# |-- include +# |-- lib +# | |-- x64 +# | | |-- (Debug/Release x Dynamic/Static x MD/MT) +# | | +# | |-- x86 +# | |-- (Debug/Release x Dynamic/Static x MD/MT) +# | +# |-- tools (Release + Dynamic + MD) +# | |-- x64 +# | |-- x86 +# | +# |-- py_whl +# |-- py_bridge +# |-- include +# |-- wrapper +# |-- test (Release + Dynamic + MD) +# |-- x64 +# |-- x86 +# |-- lib +# |-- x64 +# | |-- (Debug/Release x Dynamic/Static x MD/MT) +# | +# |-- x86 +# |-- (Debug/Release x Dynamic/Static x MD/MT) + +Param( + [Switch]$gpu, + [Switch]$x86 +) + +$basedir = $(Split-Path -Parent $MyInvocation.MyCommand.Path) +$outdir = "$basedir/$(If ($gpu) {"MNN-CPU-GPU"} Else {"MNN-CPU"})" +$arch = "$(If ($x86) {"x86"} Else {"x64"})" +Write-Output $arch + +$test_avx512 = ((!$x86) -and $env:avx512_server -and $env:avx512_password) +if ($test_avx512) { + 
$remote_home = $(Invoke-Expression 'plink -batch -ssh $env:avx512_server -pw $env:avx512_password powershell "echo `$HOME"') + $remote_dir = "${remote_home}\cise-space\$(Split-Path -Path $(pushd .. ; pwd ; popd) -Leaf)" +} +function sync_remote() { + Invoke-Expression 'plink -batch -ssh $env:avx512_server -pw $env:avx512_password powershell "Remove-Item -Recurse $remote_dir -ErrorAction Ignore ; mkdir $remote_dir"' + Invoke-Expression 'pscp -pw $env:avx512_password -r $outdir/tools ${env:avx512_server}:${remote_dir}' + Invoke-Expression 'pscp -pw $env:avx512_password tools/script/modelTest.py ${env:avx512_server}:${remote_dir}' +} + +function run_remote([String]$cmd) { + $tmpfile = New-TemporaryFile + Set-Content -Path $tmpfile -Value "powershell `"cd ${remote_dir} ; $cmd`"" + $output = $(Invoke-Expression 'plink -batch -ssh $env:avx512_server -pw $env:avx512_password -m $tmpfile') + Remove-Item $tmpfile + return $output +} + +function log($case, $title, $blocked, $failed, $passed, $skipped) { + Write-Output "TEST_NAME_${case}: $title`nTEST_CASE_AMOUNT_${case}: {`"blocked`":$blocked,`"failed`":$failed,`"passed`":$passed,`"skipped`":$skipped}`n" +} + +function failed() { + Write-Output "TEST_NAME_EXCEPTION: Exception" + Write-Output 'TEST_CASE_AMOUNT_EXCEPTION: {"blocked":0,"failed":1,"passed":0,"skipped":0}' + exit 1 +} + +function build_lib_test() { + Invoke-Expression "./package_scripts/win/build_lib.ps1 -path $outdir $(If ($gpu) {"-backends 'opencl,vulkan'"}) $(If ($x86) {'-x86'})" + $WrongNum = $($LastExitCode -ne 0) + log "WINDOWS_LIB" "Windows core library build test" 0 $WrongNum $(1 - $WrongNum) 0 + if ($WrongNum -ne 0) { + Write-Output "### Windows core library build test failed, aborting" + failed + } +} + +function build_tool_test() { + Invoke-Expression "./package_scripts/win/build_tools.ps1 -path $outdir/tools/$arch $(If ($gpu) {"-backends 'opencl,vulkan'"}) -build_all -dynamic_link" + $WrongNum = $($LastExitCode -ne 0) + log "WINDOWS_LIB" "Windows tools build test" 0 $WrongNum $(1 - $WrongNum) 0 + if ($WrongNum -ne 0) { + Write-Output "### Windows tools build test failed, aborting" + failed + } +} + +function build_whl_test() { + $pyenvs = "py27,py37,py38,py39" + if ($x86) { + $pyenvs = "py27-win32,py37-win32,py38-win32,py39-win32" + } + Invoke-Expression "./package_scripts/win/build_whl.ps1 -version ci_test -path $outdir/py_whl -pyenvs '$pyenvs' $(If ($x86) {'-x86'})" + $WrongNum = $($LastExitCode -ne 0) + log "WINDOWS_LIB" "Windows pymnn wheel build test" 0 $WrongNum $(1 - $WrongNum) 0 + if ($WrongNum -ne 0) { + Write-Output "### Windows pymnn wheel build test failed, aborting" + failed + } +} + +function build_bridge_test() { + Invoke-Expression "./package_scripts/win/build_bridge.ps1 -version ci_test -pyc_env py27 -mnn_path $outdir -python_path $HOME/PyBridgeDeps/python -numpy_path $HOME/PyBridgeDeps/numpy -path $outdir/py_bridge -train_api $(If ($x86) {'-x86'})" + $WrongNum = $($LastExitCode -ne 0) + log "WINDOWS_LIB" "Windows pymnn bridge build test" 0 $WrongNum $(1 - $WrongNum) 0 + if ($WrongNum -ne 0) { + Write-Output "### Windows pymnn bridge build test failed, aborting" + failed + } +} + +function unit_test() { + Invoke-Expression "$outdir/tools/$arch/run_test.out.exe" + if ($LastExitCode -ne 0) { + Write-Output "### CPU backend unit test failed, aborting" + failed + } + Invoke-Expression "$outdir/tools/$arch/run_test.out.exe op 0 0 4" + if ($LastExitCode -ne 0) { + Write-Output "### CPU backend multi-thread test failed, aborting" + failed + } + if ($test_avx512) { + $RemoteExitCode = run_remote "cd tools/x64 ; ./run_test.out.exe > log.txt ; echo `$LastExitCode" + Write-Output $(run_remote "Get-Content -Path tools/x64/log.txt") + if
($RemoteExitCode -ne 0) { + Write-Output "### CPU backend (AVX512) unit test failed, aborting" + failed + } + $RemoteExitCode = run_remote "cd tools/x64 ; ./run_test.out.exe op 0 0 4 > log.txt ; echo `$LastExitCode" + Write-Output $(run_remote "Get-Content -Path tools/x64/log.txt") + if ($RemoteExitCode -ne 0) { + Write-Output "### CPU backend (AVX512) multi-thread test failed, aborting" + failed + } + } + #Invoke-Expression "$outdir/tools/$arch/run_test.out.exe op 3" + #if ($LastExitCode -ne 0) { + # echo "### OpenCL backend unit test failed, aborting" + # failed + #} +} + +function model_test() { + Push-Location $outdir/tools/$arch + python $basedir/tools/script/modelTest.py $HOME/AliNNModel 0 0.002 + if ($LastExitCode -ne 0) { + Write-Output "### CPU backend model test failed, aborting" + Pop-Location + failed + } + python $basedir/tools/script/modelTest.py $HOME/AliNNModel 0 0.002 0 1 + if ($LastExitCode -ne 0) { + Write-Output "### CPU backend static model test failed, aborting" + Pop-Location + failed + } + if ($test_avx512) { + $RemoteExitCode = run_remote "cd tools/x64 ; python ../../modelTest.py `$HOME/AliNNModel 0 0.002 > log.txt ; echo `$LastExitCode" + Write-Output $(run_remote "Get-Content -Path tools/x64/log.txt") + if ($RemoteExitCode -ne 0) { + Write-Output "### CPU backend (AVX512) model test failed, aborting" + Pop-Location + failed + } + $RemoteExitCode = run_remote "cd tools/x64 ; python ../../modelTest.py `$HOME/AliNNModel 0 0.002 0 1 > log.txt ; echo `$LastExitCode" + Write-Output $(run_remote "Get-Content -Path tools/x64/log.txt") + if ($RemoteExitCode -ne 0) { + Write-Output "### CPU backend (AVX512) static model test failed, aborting" + Pop-Location + failed + } + } + #python $basedir/tools/script/modelTest.py $HOME/AliNNModel 3 0.01 + #if ($LastExitCode -ne 0) { + # echo "### OpenCL backend model test failed, aborting" + # Pop-Location + # failed + #} + Pop-Location +} + +function pymnn_whl_test() { + $pyarch = $(If ($x86) {"win32"} Else {"amd64"}) + Push-Location pymnn/test + $local = "$(Get-Location)/aone-site-packages" + $pythonpath_backup = ${env:PYTHONPATH} + Foreach ($pyenv in @("27", "37", "38", "39")) { + Invoke-Expression "conda activate py$pyenv$(If($x86) {'-win32'})" + Remove-Item -Recurse $local -ErrorAction Ignore + pip install --target $local $outdir/py_whl/$(Get-ChildItem -Path $outdir/py_whl -Include "*$pyenv*$pyarch*" -Name) + do { + # unit_test.py needs torch, which isn't supported on 32-bit Windows or py27 + # https://pytorch.org/docs/stable/notes/windows.html#package-not-found-in-win-32-channel + if ($x86 -or ($pyenv -eq "27")) { + break; + } + ${env:PYTHONPATH} = $local + python unit_test.py + ${env:PYTHONPATH} = $pythonpath_backup + if ($LastExitCode -ne 0) { + Write-Output "### PYMNN unit test failed, aborting" + conda deactivate + Pop-Location + failed + } + } while(0); + ${env:PYTHONPATH} = "$local" + python model_test.py $HOME/AliNNModel + ${env:PYTHONPATH} = $pythonpath_backup + if ($LastExitCode -ne 0) { + Write-Output "### PYMNN model test failed, aborting" + conda deactivate + Pop-Location + failed + } + conda deactivate + } + Pop-Location +} + +build_lib_test build_tool_test build_whl_test build_bridge_test + +if ($test_avx512) { + sync_remote } unit_test model_test pymnn_whl_test diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 40f839d2..6e05feaf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -12,6 +12,9 @@ endif() add_executable(run_test.out ${Files}) target_link_libraries(run_test.out ${MNN_DEPS}) +if (WIN32) + target_compile_options(run_test.out PRIVATE /bigobj) +endif() if (MNN_SUPPORT_BF16) target_compile_options(run_test.out PRIVATE -DMNN_SUPPORT_BF16) endif() diff --git a/test/MNNTestSuite.cpp b/test/MNNTestSuite.cpp
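
The MNNTestSuite diff that follows is what gives the $LastExitCode checks above something to read: run and runAll now return the number of failed cases instead of void, and main forwards that count as the process exit status. A minimal sketch of the pattern, with stub names that are illustrative rather than MNN API:

    #include <cstdio>
    #include <string>
    #include <vector>

    // Stand-in for a test registry run; the real one lives in MNNTestSuite.cpp.
    static int runAllStub(std::vector<std::string>& wrongs) {
        wrongs.push_back("op/raster"); // pretend one case failed
        return (int)wrongs.size();
    }

    int main() {
        std::vector<std::string> wrongs;
        int failed = runAllStub(wrongs);
        for (auto& w : wrongs) printf("Error: %s\n", w.c_str());
        return failed; // a nonzero exit status is all a CI wrapper can see
    }
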
index 3544e969..fa039b52 100644 --- a/test/MNNTestSuite.cpp +++ b/test/MNNTestSuite.cpp @@ -34,9 +34,9 @@ static void printTestResult(int wrong, int right, const char* flag) { printf("{\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n", wrong, right); } -void MNNTestSuite::run(const char* key, int precision, const char* flag) { +int MNNTestSuite::run(const char* key, int precision, const char* flag) { if (key == NULL || strlen(key) == 0) - return; + return 0; auto suite = MNNTestSuite::get(); std::string prefix = key; @@ -60,9 +60,10 @@ void MNNTestSuite::run(const char* key, int precision, const char* flag) { printf("Error: %s\n", wrong.c_str()); } printTestResult(wrongs.size(), runUnit - wrongs.size(), flag); + return wrongs.size(); } -void MNNTestSuite::runAll(int precision, const char* flag) { +int MNNTestSuite::runAll(int precision, const char* flag) { auto suite = MNNTestSuite::get(); std::vector wrongs; for (int i = 0; i < suite->mTests.size(); ++i) { @@ -88,4 +89,5 @@ void MNNTestSuite::runAll(int precision, const char* flag) { printf("Error: %s\n", wrong.c_str()); } printTestResult(wrongs.size(), suite->mTests.size() - wrongs.size(), flag); + return wrongs.size(); } diff --git a/test/MNNTestSuite.h b/test/MNNTestSuite.h index 568b6dde..c67cdbfa 100644 --- a/test/MNNTestSuite.h +++ b/test/MNNTestSuite.h @@ -21,6 +21,7 @@ #include #undef min #undef max +#undef NO_ERROR #else #include #include @@ -92,13 +93,13 @@ public: * @param precision. fp32 / bf16 precision should use FP32Converter[1 - 2]. * fp16 precision should use FP32Converter[3]. */ - static void runAll(int precision, const char* flag = ""); + static int runAll(int precision, const char* flag = ""); /** * @brief run test case with runtime precision, see FP32Converter in TestUtil.h. * @param precision. fp32 / bf16 precision should use FP32Converter[1 - 2]. * fp16 precision should use FP32Converter[3]. 
*/ - static void run(const char* name, int precision, const char* flag = ""); + static int run(const char* name, int precision, const char* flag = ""); private: /** get shared instance */ diff --git a/test/core/BackendTest.cpp b/test/core/BackendTest.cpp index 603c1c31..207e451f 100644 --- a/test/core/BackendTest.cpp +++ b/test/core/BackendTest.cpp @@ -148,7 +148,7 @@ bool nhwc_2_nhwc_uint8(std::shared_ptr bn) { auto backendCopyData = checkHostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); return false; } @@ -183,7 +183,7 @@ bool NC4HW4_2_NC4HW4_IntType(std::shared_ptr bn) { auto backendCopyData = checkHostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for NCHW Mid bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); return false; } @@ -195,7 +195,7 @@ bool NC4HW4_2_NC4HW4_IntType(std::shared_ptr bn) { bn->onCopyBuffer(hostTensor.get(), deviceTensor2.get()); bn->onCopyBuffer(deviceTensor2.get(), checkHostTensor.get()); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for NHWC Mid bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); return false; } @@ -216,7 +216,7 @@ bool NCHW_NC4HW4_NCHW(std::shared_ptr bn, int batch, int width, int hei + c * height * width + y * width + x - ] = b * 100.f + c * 10.f + y * 0.1f + x * 0.001f; + ] = b / (float)batch * 100.f + c / (float)channel * 10.f + y / (float)height * 0.1f + x / (float)width * 0.001f; } } } @@ -231,8 +231,8 @@ bool NCHW_NC4HW4_NCHW(std::shared_ptr bn, int batch, int width, int hei auto backendCopyData = dstTensor->host(); auto hostData = srcTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { - MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); + if (abs(backendCopyData[i] - hostData[i]) >= 0.1f) { + MNN_PRINT("Error for bn:%d, %f -> %f, %f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); return false; } } @@ -242,8 +242,8 @@ bool NCHW_NC4HW4_NCHW(std::shared_ptr bn, int batch, int width, int hei bool NC4HW4_2_NC4HW4_float(std::shared_ptr bn) { // MNN_PRINT("\n ========= check NC4HW4_2_NC4HW4_float result ! 
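
Several BackendTest hunks around here replace exact equality with an F32_BF16_MAX_LOSS bound. The motivation is that the BF16 backend keeps only about 8 significand bits, so a float that round-trips through bf16 can legitimately differ from the original. A truncation model of that loss (an illustration of the magnitude being tolerated, not MNN's converter; the constant itself comes from TestUtil.h):

    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Keep only the top 16 bits of an IEEE-754 float, i.e. bf16 by truncation.
    static float throughBF16(float x) {
        uint32_t bits;
        std::memcpy(&bits, &x, sizeof bits);
        bits &= 0xFFFF0000u;
        float y;
        std::memcpy(&y, &bits, sizeof y);
        return y;
    }

    int main() {
        float v = 0.123456f;
        float r = throughBF16(v);
        printf("%f -> %f (abs err %g)\n", v, r, std::fabs(v - r)); // err on the order of 1e-4
        return 0;
    }
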
========= \n"); - std::vector nhwc_shape = {1, 224, 224, 8}; - std::vector nchw_shape = {1, 224, 8, 224}; + std::vector nhwc_shape = {1, 32, 12, 13}; + std::vector nchw_shape = {1, 12, 13, 32}; std::shared_ptr hostTensor( Tensor::create(nhwc_shape, nullptr, Tensor::CAFFE_C4)); auto elementSize = hostTensor->elementSize(); @@ -288,7 +288,7 @@ bool NC4HW4_2_NC4HW4_float(std::shared_ptr bn) { bn->onCopyBuffer(hostTensor.get(), deviceTensor2.get()); bn->onCopyBuffer(deviceTensor2.get(), checkHostTensor.get()); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for NHWC Mid bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); return false; } @@ -319,7 +319,7 @@ void NC4HW4_2_NC4HW4_uint8(std::shared_ptr bn) { auto backendCopyData = checkHostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %d -> %d\n", i, hostData[i], (int32_t)backendCopyData[i]); break; } @@ -433,7 +433,7 @@ void nchw_2_NC4HW4_float(std::shared_ptr bn) { // MNN_PRINT("NC4HW4 -> nhwc !\n"); for (int i = 0; i < elementSize; ++i) { - if (abs(backendCopyData[i] - hostData[i]) >= 0.001) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); } } @@ -464,7 +464,7 @@ void nchw_2_NC4HW4_2_nchw_float(std::shared_ptr bn) { auto backendCopyData = checkHostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] != hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %f -> %f\n", i, hostData[i], backendCopyData[i]); break; } @@ -510,7 +510,7 @@ bool nhwc_2_NC4HW4_2_nhwc_inttype(std::shared_ptr bn) { auto backendCopyData = NC4HW4_HostTensor->host(); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); return false; } @@ -524,9 +524,9 @@ bool nhwc_2_NC4HW4_2_nhwc_inttype(std::shared_ptr bn) { NHWC2NCHW(temp, backendCopyData, batch, height, width, channel); bn->onCopyBuffer(deviceTensor.get(), hostTensor.get()); - // MNN_PRINT("NC4HW4 -> nhwc !\n"); + // MNN_PRINT("NC4HW4 -> nhwc !\n"); for (int i = 0; i < elementSize; ++i) { - if (backendCopyData[i] != hostData[i]) { + if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { MNN_PRINT("Error for bn:%d, %d -> %d\n", i, hostData[i], backendCopyData[i]); } } @@ -534,14 +534,53 @@ bool nhwc_2_NC4HW4_2_nhwc_inttype(std::shared_ptr bn) { free(temp); return true; } +bool nchwTonhwc(std::shared_ptr bn) { + // Test NHWC -> NC4HW4 -> NHWC + MNN_PRINT("\n ========= check nchwTonhwc result ! ========= \n"); + int batch = 2; + int channel = 12; + int width = 21; + int height = 5; + std::shared_ptr hostTensor( + Tensor::create(std::vector{batch, channel, height, width}, nullptr, Tensor::CAFFE)); + auto elementSize = hostTensor->elementSize(); + auto hostData = hostTensor->host(); + for (int i = 0; i < elementSize; ++i) { + int flagRandom = (rand() % 2 == 0); + float valueRandom = rand() % 255 / 255.f; + hostData[i] = ((flagRandom == 1) ? 
1.0 : -1.0) * valueRandom; + } + std::vector tempStorage(hostTensor->elementSize()); + float* temp = tempStorage.data(); + memset(temp, 0, hostTensor->size()); + NCHW2NHWC(hostData, temp, batch, height, width, channel); + std::shared_ptr deviceTensor_pre(Tensor::createDevice(std::vector{batch, height, width, channel})); + bn->onAcquireBuffer(deviceTensor_pre.get(), Backend::STATIC); + std::shared_ptr deviceTensor(Tensor::createDevice(std::vector{batch, height, width, channel})); + bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC); + bn->onCopyBuffer(hostTensor.get(), deviceTensor_pre.get()); + bn->onCopyBuffer(deviceTensor_pre.get(), deviceTensor.get()); + std::shared_ptr hostTensorNHWC( + Tensor::create(std::vector{batch, height, width, channel}, nullptr, Tensor::TENSORFLOW)); + bn->onCopyBuffer(deviceTensor.get(), hostTensorNHWC.get()); + auto backendCopyData = hostTensorNHWC->host(); + for (int i = 0; i < elementSize; ++i) { + if (abs(backendCopyData[i] - temp[i]) >= F32_BF16_MAX_LOSS) { //Error of converting from float32 to bf16 is more than 0.001 + MNN_PRINT("Error for bn:%d, %f -> %f. F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); + return false; + } + } + return true; +} + bool nhwc_2_NC4HW4_2_nhwc_float(std::shared_ptr bn) { // Test NHWC -> NC4HW4 -> NHWC MNN_PRINT("\n ========= check nhwc_2_NC4HW4_2_nhwc_float result ! ========= \n"); int batch = 1; int channel = 12; - int width = 20; - int height = 20; + int width = 3; + int height = 2; std::shared_ptr hostTensor( Tensor::create(std::vector{batch, channel, height, width}, nullptr, Tensor::CAFFE)); auto elementSize = hostTensor->elementSize(); @@ -556,15 +595,12 @@ bool nhwc_2_NC4HW4_2_nhwc_float(std::shared_ptr bn) { memset(temp, 0.0f, hostTensor->size()); NCHW2NHWC(hostData, temp, batch, height, width, channel); - std::shared_ptr deviceTensor_pre(Tensor::createDevice(std::vector{batch, height, width, channel})); - bn->onAcquireBuffer(deviceTensor_pre.get(), Backend::STATIC); std::shared_ptr deviceTensor(Tensor::createDevice(std::vector{batch, height, width, channel})); bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC); - bn->onCopyBuffer(hostTensor.get(), deviceTensor_pre.get()); - bn->onCopyBuffer(deviceTensor_pre.get(), deviceTensor.get()); + bn->onCopyBuffer(hostTensor.get(), deviceTensor.get()); - // // nhwc -> NC4HW4 - // MNN_PRINT("nhwc -> NC4HW4 !\n"); + // // nhwc -> NC4HW4 + // MNN_PRINT("nhwc -> NC4HW4 !\n"); MNNTensorConvertNHWCToNC4HW4(hostData, temp, height * width, channel); std::shared_ptr NC4HW4_HostTensor( @@ -573,12 +609,20 @@ bool nhwc_2_NC4HW4_2_nhwc_float(std::shared_ptr bn) { bn->onCopyBuffer(deviceTensor.get(), NC4HW4_HostTensor.get()); auto backendCopyData = NC4HW4_HostTensor->host(); + bool res = true; for (int i = 0; i < elementSize; ++i) { if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { //Error of converting from float32 to bf16 is more than 0.001 MNN_PRINT("Error for bn:%d, %f -> %f. F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); - return false; + res = false; + break; } } + if (!res) { + for (int i = 0; i < elementSize; ++i) { + MNN_PRINT("%d, %f -> %f.
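
The new nchwTonhwc case above pushes an NCHW host tensor through two device copies and reads it back as NHWC, comparing against the NCHW2NHWC helper from TestUtils. For reference, the permutation that helper is expected to perform, as a self-contained sketch (index arithmetic only; not MNN code):

    #include <cstdio>
    #include <vector>

    static void nchwToNhwc(const float* src, float* dst, int n, int c, int h, int w) {
        for (int b = 0; b < n; ++b)
            for (int y = 0; y < h; ++y)
                for (int x = 0; x < w; ++x)
                    for (int ch = 0; ch < c; ++ch)
                        dst[((b * h + y) * w + x) * c + ch] = src[((b * c + ch) * h + y) * w + x];
    }

    int main() {
        // 1x2x2x2 example: channel 0 holds {1,2,3,4}, channel 1 holds {5,6,7,8}
        std::vector<float> src = {1, 2, 3, 4, 5, 6, 7, 8}, dst(8);
        nchwToNhwc(src.data(), dst.data(), 1, 2, 2, 2);
        for (float v : dst) printf("%g ", v); // 1 5 2 6 3 7 4 8
        printf("\n");
        return 0;
    }
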
F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); + } + return false; + } // NC4HW4 -> nhwc @@ -588,10 +632,11 @@ bool nhwc_2_NC4HW4_2_nhwc_float(std::shared_ptr bn) { NHWC2NCHW(temp, backendCopyData, batch, height, width, channel); bn->onCopyBuffer(deviceTensor.get(), hostTensor.get()); - // MNN_PRINT("NC4HW4 -> nhwc !\n"); + MNN_PRINT("NC4HW4 -> nhwc !\n"); for (int i = 0; i < elementSize; ++i) { if (abs(backendCopyData[i] - hostData[i]) >= F32_BF16_MAX_LOSS) { - MNN_PRINT("Error for bn:%d, %f -> %f. F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); + MNN_PRINT("NC4HW4 -> nhwc Error for bn:%d, %f -> %f. F32_BF16_MAX_LOSS:%f\n", i, hostData[i], backendCopyData[i], F32_BF16_MAX_LOSS); + return false; } } @@ -618,12 +663,20 @@ public: MNN_PRINT("Test %d Backend for %d \n", type, user.precision); std::shared_ptr bn(runtime->onCreate(&user)); auto res = NC4HW4_2_NC4HW4_float(bn); + FUNC_PRINT(res); + res = res && nchwTonhwc(bn); + FUNC_PRINT(res); res = res && nhwc_2_NC4HW4_2_nhwc_float(bn); + FUNC_PRINT(res); res = res && NCHW_NC4HW4_NCHW(bn, 3, 16, 17, 19); + FUNC_PRINT(res); res = res && NCHW_NC4HW4_NCHW(bn, 12, 16, 38, 16); + FUNC_PRINT(res); res = res && NCHW_NC4HW4_NCHW(bn, 5, 128, 8, 6); + FUNC_PRINT(res); if (!res) { MNN_ERROR("Error for %d bn\n", i); + return false; } } } diff --git a/test/core/BufferAllocatorTest.cpp b/test/core/BufferAllocatorTest.cpp index 40c8ef60..eb7d4112 100644 --- a/test/core/BufferAllocatorTest.cpp +++ b/test/core/BufferAllocatorTest.cpp @@ -11,7 +11,7 @@ #include "core/MNNMemoryUtils.h" using namespace MNN; - +#ifndef _MSC_VER class BufferAllocatorTest : public MNNTestCase { public: virtual ~BufferAllocatorTest() = default; @@ -56,3 +56,4 @@ public: } }; MNNTestSuiteRegister(BufferAllocatorTest, "core/buffer_allocator"); +#endif \ No newline at end of file diff --git a/test/expr/MatMulTest.cpp b/test/expr/MatMulTest.cpp index f38d27e4..e768051b 100644 --- a/test/expr/MatMulTest.cpp +++ b/test/expr/MatMulTest.cpp @@ -40,7 +40,7 @@ static bool checkMatMul(const float* C, const float* A, const float* B, int e, i expected += AY[k] * BX[k * e]; } auto diff = fabsf(expected - computed); - if (diff > 0.1f) { + if (diff > 0.003f * fabsf(expected)) { MNN_PRINT("%f -> %f\n", expected, computed); res = false; } diff --git a/test/expr/ZeroShapeTest.cpp b/test/expr/ZeroShapeTest.cpp index cef801e4..8f6dd9f9 100644 --- a/test/expr/ZeroShapeTest.cpp +++ b/test/expr/ZeroShapeTest.cpp @@ -19,9 +19,9 @@ public: virtual bool run(int precision) { auto input = _Input({1, 0, 4, 1}, NHWC); input->setName("input"); - auto output = _Reshape(input, {0, 0, -1}); + auto output = _Reshape(input, {1, 0, -1}); auto info = output->getInfo(); - auto rightDims = std::vector{1, 0, 4}; + auto rightDims = std::vector{1, 0, 0}; if (info->dim[0] != rightDims[0] || info->dim[1] != rightDims[1] || info->dim[2] != rightDims[2]) { return false; } diff --git a/test/main.cpp b/test/main.cpp index 79b7807a..e5d5b4b2 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -48,12 +48,12 @@ int main(int argc, char* argv[]) { if (argc > 1) { auto name = argv[1]; if (strcmp(name, "all") == 0) { - MNNTestSuite::runAll(precisionInTestUtil, flag); + return MNNTestSuite::runAll(precisionInTestUtil, flag); } else { - MNNTestSuite::run(name, precisionInTestUtil, flag); + return MNNTestSuite::run(name, precisionInTestUtil, flag); } } else { - MNNTestSuite::runAll(precisionInTestUtil, flag); + return MNNTestSuite::runAll(precisionInTestUtil, flag); } return 0; 
} diff --git a/test/op/RasterTest.cpp b/test/op/RasterTest.cpp new file mode 100644 index 00000000..c2f40e4b --- /dev/null +++ b/test/op/RasterTest.cpp @@ -0,0 +1,43 @@ +// +// RasterTest.cpp +// MNNTests +// +// Created by MNN on 2021/12/23. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include +#include +#include "MNNTestSuite.h" +#include "TestUtils.h" + +using namespace MNN::Express; +class RasterTest : public MNNTestCase { +public: + virtual ~RasterTest() = default; + virtual bool run(int precision) { + auto input = _Input({2, 2}, NCHW); + input->setName("input_tensor"); + // set input data + const float inputData[] = {1, 2, 3, 4}; + auto inputPtr = input->writeMap(); + memcpy(inputPtr, inputData, 4 * sizeof(float)); + // transpose + auto output = _Raster({input}, {0, 4, 1, 2, 0, 4, 2, 1, 1, 2, 2}, {2, 2}); + const std::vector expectedOutput = {1, 3, 2, 4}; + auto gotOutput = output->readMap(); + if (!checkVector(gotOutput, expectedOutput.data(), 4, 0.01)) { + MNN_ERROR("RasterTest transpose test failed!\n"); + return false; + } + auto output0 = _Raster({input}, {2, 4, 2, 1, 0, 4, 2, 1, 1, 1, 2}, {2}); + const std::vector expectedOutput0 = {3, 4}; + auto gotOutput0 = output0->readMap(); + if (!checkVector(gotOutput0, expectedOutput0.data(), 2, 0.01)) { + MNN_ERROR("RasterTest slice test failed!\n"); + return false; + } + return true; + } +}; +MNNTestSuiteRegister(RasterTest, "op/raster"); diff --git a/test/op/SelectTest.cpp b/test/op/SelectTest.cpp index 170a9fe4..52780150 100644 --- a/test/op/SelectTest.cpp +++ b/test/op/SelectTest.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include "MNNTestSuite.h" #include "MNN_generated.h" @@ -45,7 +46,7 @@ void RandInit(VARP value, T lower, T upper) { void RandInitBool(VARP value) { int* pValue = value->writeMap(); for (int i = 0; i < Size(value); ++i) { - pValue[i] = (uniform_dist(rng) > 0.f); + pValue[i] = (uniform_dist(rng) > 0.5f); } } @@ -68,11 +69,13 @@ bool RunSelectAndCheckResult(VARP select, VARP input0, VARP input1) { condition = select->readMap()[i]; } if (condition) { - if (input0Ptr[i * iter0] != outputPtr[i]) { + if (fabsf(input0Ptr[i * iter0] - outputPtr[i]) >= 0.1f) { + MNN_PRINT("%d, %d - %f - %f - %f\n", i, condition, input0Ptr[i * iter0], input1Ptr[i * iter1], outputPtr[i]); return false; } } else { - if (input1Ptr[i * iter1] != outputPtr[i]) { + if (fabsf(input1Ptr[i * iter1] - outputPtr[i]) >= 0.1f) { + MNN_PRINT("%d, %d - %f - %f - %f\n", i, condition, input0Ptr[i * iter0], input1Ptr[i * iter1], outputPtr[i]); return false; } } @@ -84,11 +87,11 @@ bool SelectTester1D(int N) { auto input0 = _Input({N}, NCHW); auto input1 = _Input({N}, NCHW); { - auto select = _Input({N}, NCHW); + auto select = _Input({N}, NCHW, halide_type_of()); CHECK_OR_RETURN(RunSelectAndCheckResult(select, input0, input1)); } { - auto select = _Input({1}, NCHW); + auto select = _Input({1}, NCHW, halide_type_of()); CHECK_OR_RETURN(RunSelectAndCheckResult(select, input0, input1)); } return true; @@ -98,15 +101,15 @@ bool SelectTester4D(int N, int C, int H, int W) { auto input0 = _Input({N, C, H, W}, NCHW); auto input1 = _Input({N, C, H, W}, NCHW); { - auto select = _Input({N, C, H, W}, NCHW); + auto select = _Input({N, C, H, W}, NCHW, halide_type_of()); CHECK_OR_RETURN(RunSelectAndCheckResult(select, input0, input1)); } { - auto select = _Input({1}, NCHW); + auto select = _Input({1}, NCHW, halide_type_of()); CHECK_OR_RETURN(RunSelectAndCheckResult(select, input0, input1)); } { - auto select = _Input({N, C, H, W}, NCHW); + auto
select = _Input({N, C, H, W}, NCHW, halide_type_of()); auto input0 = _Input({1}, NCHW); CHECK_OR_RETURN(RunSelectAndCheckResult(select, input0, input1)); } diff --git a/test/op/SortTest.cpp b/test/op/SortTest.cpp new file mode 100644 index 00000000..be6c202d --- /dev/null +++ b/test/op/SortTest.cpp @@ -0,0 +1,92 @@ +// +// SortTest.cpp +// MNNTests +// +// Created by MNN on 2021/12/22. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include +#include +#include "MNNTestSuite.h" +#include "TestUtils.h" +using namespace MNN::Express; +class SortTest : public MNNTestCase { +public: + virtual ~SortTest() = default; + virtual bool run(int precision) { + auto input_nhwc = _Input({4, 4}, NHWC); + input_nhwc->setName("input_tensor_nhwc"); + // set input data + const float inputData[] = {-1.0, 2.0, -3.0, 4.0, + 5.0, -6.0, 7.0, -8.0, + -9.0, -10.0, 11.0, 12.0, + 13.0, 14.0, -15.0, -16.0}; + auto inputPtr = input_nhwc->writeMap(); + memcpy(inputPtr, inputData, 16 * sizeof(float)); + const std::vector expectedOutput_0 = {-9.0, -10.0, -15.0, -16.0, + -1.0, -6.0, -3.0, -8.0, + 5.0, 2.0, 7.0, 4.0, + 13.0, 14.0, 11.0, 12.0}; + auto output_0 = _Sort(input_nhwc, 0); + auto gotOutput_0 = output_0->readMap(); + if (!checkVector(gotOutput_0, expectedOutput_0.data(), 16, 0)) { + MNN_ERROR("SortTest test axis_0 failed!\n"); + return false; + } + const std::vector expectedOutput_1 = {-3.0, -1.0, 2.0, 4.0, + -8.0, -6.0, 5.0, 7.0, + -10.0, -9.0, 11.0, 12.0, + -16.0, -15.0, 13.0, 14.0}; + auto output_1 = _Sort(input_nhwc, 1); + auto gotOutput_1 = output_1->readMap(); + if (!checkVector(gotOutput_1, expectedOutput_1.data(), 16, 0)) { + MNN_ERROR("SortTest test axis_1 failed!\n"); + return false; + } + const std::vector expectedOutput_2 = { 2, 2, 3, 3, + 0, 1, 0, 1, + 1, 0, 1, 0, + 3, 3, 2, 2 }; + auto output_2 = _Sort(_Clone(input_nhwc, true), 0, true); + auto gotOutput_2 = output_2->readMap(); + if (!checkVector(gotOutput_2, expectedOutput_2.data(), 16, 0)) { + MNN_ERROR("ArgSortTest test axis_0 failed!\n"); + return false; + } + const std::vector expectedOutput_3 = { 2, 0, 1, 3, + 3, 1, 0, 2, + 1, 0, 2, 3, + 3, 2, 0, 1 }; + auto output_3 = _Sort(_Clone(input_nhwc, true), 1, true); + auto gotOutput_3 = output_3->readMap(); + if (!checkVector(gotOutput_3, expectedOutput_3.data(), 16, 0)) { + MNN_ERROR("ArgSortTest test axis_1 failed!\n"); + return false; + } + const std::vector expectedOutput_4 = { 3, 3, 2, 2, + 1, 0, 1, 0, + 0, 1, 0, 1, + 2, 2, 3, 3 }; + auto output_4 = _Sort(_Clone(input_nhwc, true), 0, true, true); + auto gotOutput_4 = output_4->readMap(); + if (!checkVector(gotOutput_4, expectedOutput_4.data(), 16, 0)) { + MNN_ERROR("ArgSortTest test axis_0, descend failed!\n"); + return false; + } + auto input_nchw = _Input({5}, NC4HW4); + inputPtr = input_nchw->writeMap(); + const float inputDataX[] = { 0.4, 0.2, 0.5, 0.1, 0.3 }; + memcpy(inputPtr, inputDataX, 5 * sizeof(float)); + auto output_5 = _Sort(input_nchw, 0, true); + auto gotOutput_5 = output_5->readMap(); + const std::vector expectedOutput_5 = { 3, 1, 4, 0, 2 }; + if (!checkVector(gotOutput_5, expectedOutput_5.data(), 5, 0)) { + MNN_ERROR("ArgSortTest test axis_0 failed!\n"); + return false; + } + return true; + } +}; + +MNNTestSuiteRegister(SortTest, "op/sort"); diff --git a/test/op/StridedSliceTest.cpp b/test/op/StridedSliceTest.cpp index 4ba42ddf..80b4506c 100644 --- a/test/op/StridedSliceTest.cpp +++ b/test/op/StridedSliceTest.cpp @@ -123,6 +123,26 @@ public: MNN_ERROR("stridedslice dim = 3, stride=-1 test failed!\n"); return false; }
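
The guarded hunk below (compiled only under MNN_STRIDESLICE_WRITE) exercises the five-input strided-slice variant whose shape rule was added in ShapeStridedSlice.cpp earlier in this patch: the output keeps the input's full shape, and the window selected by begin/end/stride is overwritten with the write tensor. A plain-C++ sketch of that semantics on a 3x2x3 block, with arbitrary fill values rather than the test's actual input (which is defined earlier in the file):

    #include <cstdio>

    int main() {
        const int C = 3, H = 2, W = 3;
        float t[C][H][W];
        for (int c = 0; c < C; ++c)
            for (int y = 0; y < H; ++y)
                for (int x = 0; x < W; ++x)
                    t[c][y][x] = float(c + 1); // arbitrary fill
        // write 9 into the window c:[0,2) y:[0,2) x:[0,3), i.e. begin={0,0,0}, end={2,2,3}, stride=1
        for (int c = 0; c < 2; ++c)
            for (int y = 0; y < 2; ++y)
                for (int x = 0; x < 3; ++x)
                    t[c][y][x] = 9.0f;
        for (int c = 0; c < C; ++c)
            for (int y = 0; y < H; ++y)
                for (int x = 0; x < W; ++x)
                    printf("%g ", t[c][y][x]); // twelve 9s, then six 3s
        printf("\n");
        return 0;
    }
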
+#ifdef MNN_STRIDESLICE_WRITE + // 9. write + const int begin_data9[] = {0, 0, 0, 0}; + memcpy(begin->writeMap(), begin_data9, 4 * sizeof(int)); + const int end_data9[] = {1, 2, 2, 3}; + memcpy(end->writeMap(), end_data9, 4 * sizeof(int)); + const int stride_data9[] = {1, 1, 1, 1}; + memcpy(strided->writeMap(), stride_data9, 4 * sizeof(int)); + auto write = _Input({3}, NCHW); + const float write_data[] = {9, 9, 9}; + memcpy(write->writeMap(), write_data, 3 * sizeof(float)); + auto output_9= _StridedSliceWrite(input, begin, end, strided, write, 0, 0, 0, 0, 0); + const std::vector expectedShape_9 = {1, 3, 2, 3}; + const std::vector expectedOutput_9 = {9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 5, 5, 6, 6, 6}; + if (!checkVector(output_9->getInfo()->dim.data(), expectedShape_9.data(), expectedShape_9.size(), 0) || + !checkVector(output_9->readMap(), expectedOutput_9.data(), expectedOutput_9.size(), 0.01)) { + MNN_ERROR("stridedslicewrite test failed!\n"); + return false; + } +#endif return true; } }; diff --git a/test/op/UnaryTest.cpp b/test/op/UnaryTest.cpp index 71c5f258..7c65d97e 100644 --- a/test/op/UnaryTest.cpp +++ b/test/op/UnaryTest.cpp @@ -61,7 +61,7 @@ class AbsTest : public UnaryTestCommon { public: virtual ~AbsTest() = default; virtual bool run(int precision) { - return test(_Abs, "AbsTest", 0.01, + return test(MNN::Express::_Abs, "AbsTest", 0.01, {-1.0, -2.0, 3.0, 4.0, -1.0, -2.0, 3.0, 4.0}, {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0}, {8}, {8}); } diff --git a/tools/converter/CMakeLists.txt b/tools/converter/CMakeLists.txt index 99091057..72765327 100644 --- a/tools/converter/CMakeLists.txt +++ b/tools/converter/CMakeLists.txt @@ -13,6 +13,7 @@ IF(MNN_BUILD_CONVERTER) set(Protobuf_INCLUDE_DIRS ${PROTOBUF_INCLUDE_DIRS}) endif() ENDIF() + SET(Protobuf_LIBRARIES ${Protobuf_LIBRARIES} PARENT_SCOPE) add_definitions(-DGOOGLE_PROTOBUF_NO_RTTI) include_directories(${CMAKE_CURRENT_LIST_DIR}/include/) include_directories(${CMAKE_CURRENT_LIST_DIR}/source/tflite/schema/) @@ -41,8 +42,17 @@ IF(MNN_BUILD_CONVERTER) ${CMAKE_CURRENT_LIST_DIR}/source/MNNConverter.cpp ) IF(MNN_BUILD_SHARED_LIBS) - add_library(MNNConvertDeps SHARED ${COMMON_SRC} ${MNN_CONVERTER_BACKENDS_OBJECTS} ${CMAKE_CURRENT_LIST_DIR}/../../3rd_party/flatbuffers/src/util.cpp $) - add_dependencies(MNNConvertDeps MNN) + IF(MNN_SEP_BUILD) + add_library(MNNConvertDeps SHARED ${COMMON_SRC} ${MNN_CONVERTER_BACKENDS_OBJECTS} ${CMAKE_CURRENT_LIST_DIR}/../../3rd_party/flatbuffers/src/util.cpp $) + add_dependencies(MNNConvertDeps MNN) + ELSE() + add_library(MNNConvertDeps OBJECT ${COMMON_SRC} ${CMAKE_CURRENT_LIST_DIR}/../../3rd_party/flatbuffers/src/util.cpp) + target_compile_definitions(MNNConvertDeps PRIVATE BUILDING_MNN_DLL PROTOBUF_USE_DLLS INTERFACE USING_MNN_DLL) + FOREACH(TARGET ${MNN_CONVERTER_BACKENDS_TARGETS}) + target_compile_definitions(${TARGET} PRIVATE BUILDING_MNN_DLL PROTOBUF_USE_DLLS INTERFACE USING_MNN_DLL) + ENDFOREACH() + target_sources(MNN PRIVATE $ ${MNN_CONVERTER_BACKENDS_OBJECTS}) + ENDIF() ELSE() add_library(MNNConvertDeps STATIC ${COMMON_SRC} ${MNN_CONVERTER_BACKENDS_OBJECTS} ${CMAKE_CURRENT_LIST_DIR}/../../3rd_party/flatbuffers/src/util.cpp) ENDIF() @@ -68,6 +78,12 @@ IF(MNN_BUILD_CONVERTER) ELSE() target_link_libraries(MNNConvert MNNConvertDeps) endif() + ELSEIF(NOT MNN_SEP_BUILD) + add_executable(TestConvertResult ${CMAKE_CURRENT_LIST_DIR}/source/TestConvertResult.cpp) + target_link_libraries(TestConvertResult MNN) + add_executable(TestPassManager ${CMAKE_CURRENT_LIST_DIR}/source/TestPassManager.cpp) + 
target_link_libraries(TestPassManager MNN) + target_link_libraries(MNNConvert MNN) ELSE() target_link_libraries(MNNConvertDeps PUBLIC ${MNN_DEPS} ${Protobuf_LIBRARIES}) if (MNN_BUILD_TORCH) diff --git a/tools/converter/source/onnx/IfOnnx.cpp b/tools/converter/source/onnx/IfOnnx.cpp index 7abac834..4e785538 100644 --- a/tools/converter/source/onnx/IfOnnx.cpp +++ b/tools/converter/source/onnx/IfOnnx.cpp @@ -60,9 +60,17 @@ void IfOnnx::run(MNN::OpT* dstOp, const onnx::NodeProto* onnxNode, MNN_ERROR("Op(If) and its subgraphs (then_branch, else_branch) must have same output number\n"); return; } + for (int i = 0; i < onnxNode->output_size(); ++i) { + std::unique_ptr pair(new MNN::StringVecT); + pair->data.assign({thenOutputs[i], elseOutputs[i]}); + param->aliases_outputs.emplace_back(std::move(pair)); + } auto mergeInputs = thenInputs; - std::copy_if(elseInputs.begin(), elseInputs.end(), mergeInputs.end(), - [&](std::string& n) { return std::find(thenInputs.begin(), thenInputs.end(), n) == thenInputs.end(); }); + for (const auto& name : elseInputs) { + if (std::find(thenInputs.begin(), thenInputs.end(), name) == thenInputs.end()) { + mergeInputs.push_back(name); + } + } { // cond input std::unique_ptr pair(new MNN::StringVecT); param->aliases_inputs.emplace_back(std::move(pair)); diff --git a/tools/converter/source/onnx/LoopOnnx.cpp b/tools/converter/source/onnx/LoopOnnx.cpp index aa12485c..53adeb81 100644 --- a/tools/converter/source/onnx/LoopOnnx.cpp +++ b/tools/converter/source/onnx/LoopOnnx.cpp @@ -20,6 +20,10 @@ MNN::OpParameter LoopOnnx::type() { void LoopOnnx::run(MNN::OpT* dstOp, const onnx::NodeProto* onnxNode, OnnxScope* scope) { + if (onnxNode->input(0) == "" || onnxNode->input(1) == "") { + MNN_ERROR("Failed: Loop doesn't support optional M and cond inputs\n"); + return; + } auto param = new MNN::WhileParamT; dstOp->name += "/Loop"; param->body_graph = dstOp->name + "/body"; diff --git a/tools/converter/source/onnx/onnxConverter.cpp b/tools/converter/source/onnx/onnxConverter.cpp index a0407130..c21827d9 100644 --- a/tools/converter/source/onnx/onnxConverter.cpp +++ b/tools/converter/source/onnx/onnxConverter.cpp @@ -96,10 +96,12 @@ int onnx2MNNNet(const std::string inputModel, const std::string bizCode, int inputIdx = scope->lookupTensor(onnxNode.input(k)); if (inputIdx < 0) { LOG(INFO) << "Check it out ==> " << MNNOp->name << " has empty input, the index is " << k; - continue; } MNNOp->inputIndexes.push_back(inputIdx); } + for (int k = onnxNode.input_size() - 1; k >= 0 && MNNOp->inputIndexes[k] < 0; --k) { + MNNOp->inputIndexes.pop_back(); + } for (int k = 0; k < onnxNode.output_size(); k++) { MNNOp->outputIndexes.push_back(scope->declareTensor(onnxNode.output(k))); } diff --git a/tools/converter/source/optimizer/Program.cpp b/tools/converter/source/optimizer/Program.cpp index 39c2939d..61bfa4ed 100644 --- a/tools/converter/source/optimizer/Program.cpp +++ b/tools/converter/source/optimizer/Program.cpp @@ -34,6 +34,10 @@ void Program::createUnit(std::map& varMap, std::vector& inputInd } invalidSet.insert(op); for (auto input : op->inputIndexes) { + if (input < 0) { // optional input + inputVars.emplace_back(nullptr); + continue; + } if (varMap.find(input) == varMap.end()) { for (int j = 0; j < oplists.size(); ++j) { for (auto outputIndex : oplists[j]->outputIndexes) { diff --git a/tools/converter/source/optimizer/merge/ConvBiasAdd.cpp b/tools/converter/source/optimizer/merge/ConvBiasAdd.cpp index 09d3d4c0..074938ce 100644 --- a/tools/converter/source/optimizer/merge/ConvBiasAdd.cpp
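
The onnxConverter.cpp hunk above now records every input slot, using -1 for missing optional inputs, and then pops trailing -1 entries so only interior placeholders remain (Program.cpp maps those to nullptr VARPs). The trimming loop in isolation, applied to a hypothetical index list:

    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<int> inputIndexes = {3, -1, 7, -1, -1}; // hypothetical op inputs
        // pop trailing optional slots, same loop shape as the converter's
        for (int k = (int)inputIndexes.size() - 1; k >= 0 && inputIndexes[k] < 0; --k) {
            inputIndexes.pop_back();
        }
        for (int v : inputIndexes) printf("%d ", v); // 3 -1 7
        printf("\n");
        return 0;
    }
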
+++ b/tools/converter/source/optimizer/merge/ConvBiasAdd.cpp @@ -30,7 +30,7 @@ static auto gRegister = []() { if (inputExpr->get()->type() == OpType_Reshape) { inputExpr = inputExpr->inputs()[0]->expr().first; } - if (inputExpr->get()->main_type() != OpParameter_Convolution2D || inputExpr->outputs().size() != 1) { + if (!inputExpr->get() || inputExpr->get()->main_type() != OpParameter_Convolution2D || inputExpr->outputs().size() != 1) { return false; } if (inputExpr->inputs().size() > 1) { diff --git a/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp b/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp index 4afca6f8..799fdf11 100644 --- a/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp +++ b/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp @@ -31,6 +31,15 @@ static VARP _ReshapeF(VARP x, VARP shape, MNN::MNN_DATA_FORMAT format) { reshape->main.AsReshape()->dimType = format; return (Variable::create(Expr::create(reshape.get(), {x, shape}))); } +static VARP _ConvertF(VARP input, MNN::MNN_DATA_FORMAT format) { + std::unique_ptr convert(new OpT); + convert->type = OpType_ConvertTensor; + convert->main.type = OpParameter_TensorConvertInfo; + convert->main.value = new TensorConvertInfoT; + convert->main.AsTensorConvertInfo()->source = MNN_DATA_FORMAT_NC4HW4; + convert->main.AsTensorConvertInfo()->dest = format; + return (Variable::create(Expr::create(convert.get(), {input}))); +} static bool checkInputInfo(const std::string& exprName, const Variable::Info* info, const modelConfig* config) { if (nullptr == info) { if (config->optimizeLevel < 1) { @@ -200,7 +209,6 @@ ConvertMatMulToConv2D::ConvertMatMulToConv2D() { dense->common->outputCount = num_output; std::unique_ptr dense_op(new OpT); - dense_op->name = expr->name(); dense_op->type = OpType_Convolution; dense_op->main.type = OpParameter_Convolution2D; dense_op->main.value = dense.release(); @@ -227,7 +235,10 @@ ConvertMatMulToConv2D::ConvertMatMulToConv2D() { } EXPRP dense_expr = Expr::create(dense_op.get(), {input}, 1); VARP output = Variable::create(dense_expr); + //MNN_PRINT("%d\n", output->getInfo()->order); + output = _ConvertF(output, format); VARP reshapeVar = _ReshapeF(output, _Concat({inputRemain, inputE, outputH}, 0), format); + reshapeVar->setName(expr->outputName(0)); Expr::replace(expr, reshapeVar->expr().first); return true /*modified*/; diff --git a/tools/converter/source/optimizer/merge/MergeHelpers.cpp b/tools/converter/source/optimizer/merge/MergeHelpers.cpp index 0a8a3b59..5f63666c 100644 --- a/tools/converter/source/optimizer/merge/MergeHelpers.cpp +++ b/tools/converter/source/optimizer/merge/MergeHelpers.cpp @@ -116,6 +116,9 @@ std::vector OutputVars(EXPRP expr) { continue; } for (VARP output : child->inputs()) { + if (output.get() == nullptr) { + continue; + } int output_index = 0; EXPRP parent; std::tie(parent, output_index) = output->expr(); diff --git a/tools/converter/source/optimizer/merge/TensorConverterMerge.cpp b/tools/converter/source/optimizer/merge/TensorConverterMerge.cpp index b7c8abb8..912d5fe8 100644 --- a/tools/converter/source/optimizer/merge/TensorConverterMerge.cpp +++ b/tools/converter/source/optimizer/merge/TensorConverterMerge.cpp @@ -174,7 +174,7 @@ static auto gRegister = []() { } auto inputs = expr->inputs(); for (auto input : inputs) { - if (input->expr().first->get() == nullptr) { + if (input.get() == nullptr || input->expr().first->get() == nullptr) { continue; } auto subOp = input->expr().first->get(); diff --git 
a/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp b/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp index d7258af0..b77597b0 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxClip.cpp @@ -20,7 +20,6 @@ public: auto extraParam = op->main_as_Extra(); float maxValue = std::numeric_limits().max(); float minValue = -std::numeric_limits().max(); - bool setReady = false; if (nullptr != extraParam->attr()) { const int attrSize = extraParam->attr()->size(); for (int i = 0; i < attrSize; ++i) { @@ -28,38 +27,37 @@ public: const auto& key = attr->key()->str(); if (key == "max") { maxValue = attr->f(); - setReady = true; } else if (key == "min") { minValue = attr->f(); - setReady = true; } } } - bool known_min_max = true; - if (inputs.size() == 2 && (!setReady)) { + bool unknown_min_max = false; + if (inputs.size() == 2 || (inputs.size() == 3 && inputs[1].get() != nullptr)) { auto minPtr = inputs[1]->readMap(); if (nullptr != minPtr) { minValue = minPtr[0]; } else { - known_min_max = false; + unknown_min_max = true; } } - if (inputs.size() >= 3 && (!setReady)) { - auto minPtr = inputs[1]->readMap(); - if (nullptr != minPtr) { - minValue = minPtr[0]; - } else { - known_min_max = false; - } + if (inputs.size() == 3 && !unknown_min_max) { auto maxPtr = inputs[2]->readMap(); if (nullptr != maxPtr) { maxValue = maxPtr[0]; } else { - known_min_max = false; + unknown_min_max = true; } } - if (!known_min_max) { - auto res = _Minimum(_Maximum(inputs[0], inputs[1]), inputs[2]); + if (unknown_min_max) { + auto minVar = _Scalar(minValue), maxVar = _Scalar(maxValue); + if (inputs.size() >= 2 && inputs[1].get() != nullptr) { + minVar = inputs[1]; + } + if (inputs.size() >= 3) { + maxVar = inputs[2]; + } + auto res = _Minimum(_Maximum(inputs[0], minVar), maxVar); auto newExpr = res->expr().first; newExpr->setName(expr->name()); return newExpr; diff --git a/tools/converter/source/optimizer/onnxextra/OnnxLSTMMerge.cpp b/tools/converter/source/optimizer/onnxextra/OnnxLSTMMerge.cpp index 9c46012b..00b6b656 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxLSTMMerge.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxLSTMMerge.cpp @@ -16,7 +16,18 @@ class OnnxLSTMTransform : public OnnxExtraManager::Transform { public: virtual EXPRP onExecute(EXPRP expr) const override { auto inputs = expr->inputs(); - MNN_ASSERT(inputs.size() >= 4); + if (inputs.size() == 8) { + MNN_ERROR("MNN LSTM doesn't support the 8th input (peepholes)\n"); + return nullptr; + } + if (inputs.size() >= 5 && inputs[4].get() != nullptr) { + MNN_ERROR("MNN LSTM doesn't support sequence_lens; every batch must use the full seq_length\n"); + return nullptr; + } + if (inputs.size() < 4 || inputs[3].get() == nullptr) { + MNN_ERROR("MNN LSTM doesn't support an optional 4th input (B must be provided)\n"); + return nullptr; + } std::unique_ptr lstm(new OpT); lstm->name = expr->name(); if (expr->get()->main_as_Extra()->type()->str() == "RNN") { @@ -41,6 +52,9 @@ public: // onnx docs guarantee bias shape is [num_direction, 8 * hidden_size], we split it to 2x [num_dicection, 4 * hidden_size] (W/R), then add together auto biasWR = _Split(inputs[3], {2}, 1); inputs[3] = _Add(biasWR[0], biasWR[1]); + if (inputs.size() >= 5) { + inputs.erase(inputs.begin() + 4); // ignore sequence_lens + } // Y, Y_h, Y_c auto originLSTM = Expr::create(lstm.get(), inputs, (lstm->type == OpType_RNN ?
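
When min or max arrive as non-constant graph inputs, the reworked OnnxClip above can no longer fold the bounds at convert time, so it lowers Clip to _Minimum(_Maximum(x, min), max) built from whatever bound variables are available. The scalar equivalent of that lowered expression:

    #include <algorithm>
    #include <cstdio>

    static float clip(float x, float lo, float hi) {
        return std::min(std::max(x, lo), hi); // max-then-min, as in the lowered graph
    }

    int main() {
        printf("%g %g %g\n", clip(-2.f, -1.f, 1.f), clip(0.5f, -1.f, 1.f), clip(3.f, -1.f, 1.f)); // -1 0.5 1
        return 0;
    }
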
2 : 3)); originLSTM->setName(expr->name()); diff --git a/tools/converter/source/optimizer/onnxextra/OnnxNonMaxSuppression.cpp b/tools/converter/source/optimizer/onnxextra/OnnxNonMaxSuppression.cpp index 344198f8..91ce28d1 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxNonMaxSuppression.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxNonMaxSuppression.cpp @@ -27,10 +27,21 @@ public: // onnx scores is 3D [num_batches, num_classes, boxes_num] with num_batches = 1, // while tf scores is 1D [boxes_num]. auto inputs = expr->inputs(); - // 3th input is max_output_boxes_per_class(default is 0), making output shape is (0, 3) which MNN isn't support - MNN_ASSERT(inputs.size() >= 3); + // optional input 3/4/5th + if (inputs.size() < 3 || inputs[2].get() == nullptr) { + MNN_ERROR("NonMaxSuppression's max_output_boxes_per_class must be provided (can't be optional)\n"); + return nullptr; + } + auto zero = _Scalar(0); + for (int i = 3; i < inputs.size(); ++i) { + if (inputs[i].get() == nullptr) { + inputs[i] = zero; + } + } + auto input0Info = inputs[0]->getInfo(); auto input1Info = inputs[1]->getInfo(); + if (nullptr == input0Info || nullptr == input1Info) { MNN_ERROR("Shape of NonMaxSupression's input is unknown. Please confirm version of MNN engine is new enough and use V3 Module API to run it correctly\n"); std::unique_ptr nms(new OpT); diff --git a/tools/converter/source/optimizer/onnxextra/OnnxSequenceGRUMerge.cpp b/tools/converter/source/optimizer/onnxextra/OnnxSequenceGRUMerge.cpp index 3e7fc7c7..8402012e 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxSequenceGRUMerge.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxSequenceGRUMerge.cpp @@ -23,7 +23,10 @@ class OnnxSequenceGRUTransform : public OnnxExtraManager::Transform { public: virtual EXPRP onExecute(EXPRP expr) const override { auto inputs = expr->inputs(); - MNN_ASSERT(inputs.size() >= 4); // X W R B + if (inputs.size() < 4 || inputs[3].get() == nullptr) { // X W R B + MNN_ERROR("MNN GRU doesn't support an optional 4th input (B)\n"); + return nullptr; + } auto rnnGRUParam = new MNN::RNNParamT; std::unique_ptr gru(new OpT); gru->name = expr->name(); @@ -108,8 +111,12 @@ public: } // auto sequence_lens = inputs[4]; sequence_lens is ommitted at onnxConverter.cpp - if (inputs.size() > 4) { // initial_h exist, shape is [num_directions, batch_size, hidden_size] - gruInput.push_back(inputs[4]); + if (inputs.size() > 4 && inputs[4].get() != nullptr) { + MNN_ERROR("MNN GRU doesn't support sequence_lens input; every batch must use the full seq_length\n"); + return nullptr; + } + if (inputs.size() > 5) { // initial_h exist, shape is [num_directions, batch_size, hidden_size] + gruInput.push_back(inputs[5]); } auto gruExpr = Expr::create(gru.get(), gruInput, expr->outputSize()); diff --git a/tools/converter/source/optimizer/passes/Pass.hpp b/tools/converter/source/optimizer/passes/Pass.hpp index a36a1433..8349fbdd 100644 --- a/tools/converter/source/optimizer/passes/Pass.hpp +++ b/tools/converter/source/optimizer/passes/Pass.hpp @@ -61,6 +61,7 @@ public: PassManager() = delete; PassManager(PassContext *context) : context_(context) {} PassManager(const PassManager& other); + PassManager& operator=(const PassManager&) = delete; virtual ~PassManager() = default; diff --git a/tools/converter/source/optimizer/passes/PassRegistry.cpp b/tools/converter/source/optimizer/passes/PassRegistry.cpp index 6c3fbe0d..62f6b863 100644 @@ -8,7
+8,6 @@ #include #include -#include #include "MNN/MNNDefine.h" #include "converter/source/optimizer/passes/PassRegistry.hpp" @@ -29,10 +28,7 @@ static std::vector>* AllRegisteredPassManagers() { return &g_registered_pass_managers; } -static std::mutex g_mutex; - /*static*/ PassManager* PassManagerRegistry::GetPassManager(int index) { - std::lock_guard lock(g_mutex); auto* g_registered_pass_managers = AllRegisteredPassManagers(); MNN_CHECK(index < g_registered_pass_managers->size(), "The pass manager index is out of bounds."); @@ -40,7 +36,6 @@ static std::mutex g_mutex; } /*static*/ std::vector PassManagerRegistry::GetAllPassManagers() { - std::lock_guard lock(g_mutex); std::vector pass_managers; for (auto& pm : *(AllRegisteredPassManagers())) { pass_managers.push_back(pm.get()); @@ -49,19 +44,16 @@ static std::mutex g_mutex; } /*static*/ void PassManagerRegistry::AddPassManager(const PassManager& pm) { - std::lock_guard lock(g_mutex); auto* g_registered_pass_managers = AllRegisteredPassManagers(); g_registered_pass_managers->emplace_back(new PassManager(pm)); } /*static*/ void PassRegistry::AddPass(std::unique_ptr&& pass) { - std::lock_guard lock(g_mutex); auto* g_registered_passes = AllRegisteredPasses(); g_registered_passes->emplace(pass->name(), std::move(pass)); } /*static*/ Pass* PassRegistry::GetPass(const std::string& pass_name) { - std::lock_guard lock(g_mutex); auto* g_registered_passes = AllRegisteredPasses(); const auto& it = g_registered_passes->find(pass_name); if (it != g_registered_passes->end()) { diff --git a/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp b/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp index f5fc458a..684abec9 100644 --- a/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp +++ b/tools/converter/source/optimizer/postconvert/AddTensorFormatConverter.cpp @@ -426,6 +426,9 @@ public: auto currentName = op->name; for (int i = 0; i < op->inputIndexes.size(); ++i) { auto inputIndex = op->inputIndexes[i]; + if (inputIndex < 0) { + continue; // optional input, ignore it + } auto type = tensorFormats[inputIndex]; auto requireType = _getRequireFormat(formatType, i, tensorFormats[op->outputIndexes[0]], originTensorType); if (type == requireType) { diff --git a/tools/converter/source/optimizer/postconvert/ReIndexTensor.cpp b/tools/converter/source/optimizer/postconvert/ReIndexTensor.cpp index 9ed887a7..72f51710 100644 --- a/tools/converter/source/optimizer/postconvert/ReIndexTensor.cpp +++ b/tools/converter/source/optimizer/postconvert/ReIndexTensor.cpp @@ -21,6 +21,9 @@ public: std::vector tensorValid(mNet->tensorName.size(), false); for (auto& op : mNet->oplists) { for (auto index : op->inputIndexes) { + if (index < 0) { + continue; // optional input, ignore it + } tensorValid[index] = true; } for (auto index : op->outputIndexes) { @@ -38,6 +41,9 @@ public: // Re index for (auto& op : mNet->oplists) { for (int i = 0; i < op->inputIndexes.size(); ++i) { + if (op->inputIndexes[i] < 0) { + continue; + } auto iter = usefulTensorIndexMap.find(op->inputIndexes[i]); DCHECK(iter != usefulTensorIndexMap.end()) << "ERROR"; op->inputIndexes[i] = iter->second; diff --git a/tools/cpp/MNNV2Basic.cpp b/tools/cpp/MNNV2Basic.cpp index b5f0c531..3c149c50 100644 --- a/tools/cpp/MNNV2Basic.cpp +++ b/tools/cpp/MNNV2Basic.cpp @@ -397,6 +397,8 @@ static int test_main(int argc, const char* argv[]) { auto outputFile = pwd + "output.txt"; if (outputTensor->size() > 0) { dumpTensor2File(&expectTensor, 
outputFile.c_str(), orderFileOs); + } else { + MNN_ERROR("output size is 0, can't save\n"); + } } auto allOutputs = net->getSessionOutputAll(session); diff --git a/tools/cpp/backendTest.cpp b/tools/cpp/backendTest.cpp index bf022944..0a3b8ce0 100644 --- a/tools/cpp/backendTest.cpp +++ b/tools/cpp/backendTest.cpp @@ -70,7 +70,8 @@ static void compareForwadType(Interpreter* net, MNNForwardType expectType, MNNFo if (tensor->buffer().device == 0 && tensor->buffer().host == nullptr) { return true; } - std::shared_ptr copyTensor(MNN::Tensor::createHostTensorFromDevice(tensor, true)); + std::shared_ptr copyTensor(new MNN::Tensor(tensor, tensor->getDimensionType())); + tensor->copyToHostTensor(copyTensor.get()); correctResult.emplace_back(copyTensor); } return true; @@ -90,7 +91,8 @@ static void compareForwadType(Interpreter* net, MNNForwardType expectType, MNNFo if (tensor->buffer().device == 0 && tensor->buffer().host == nullptr) { return true; } - std::shared_ptr copyTensor(MNN::Tensor::createHostTensorFromDevice(tensor, true)); + std::shared_ptr copyTensor(new MNN::Tensor(tensor, tensor->getDimensionType())); + tensor->copyToHostTensor(copyTensor.get()); auto expectTensor = correctResult[index++]; auto correct = TensorUtils::compareTensors(copyTensor.get(), expectTensor.get(), tolerance, true); if (!correct) { diff --git a/tools/cpp/testModelWithDescrisbe.cpp b/tools/cpp/testModelWithDescrisbe.cpp index 5d25ed7d..e5b0149a 100644 --- a/tools/cpp/testModelWithDescrisbe.cpp +++ b/tools/cpp/testModelWithDescrisbe.cpp @@ -116,7 +116,13 @@ bool compareVar(VARP var, std::string name) { auto diffAbsMax = _ReduceMax(diff); auto absMaxV = absMax->readMap()[0]; auto diffAbsMaxV = diffAbsMax->readMap()[0]; - if (absMaxV * 0.01f < diffAbsMaxV || std::isnan(absMaxV)) { + // The implementation of isnan in VS2017 doesn't accept integer types, so cast all types to double +#ifdef _MSC_VER +#define ALI_ISNAN(x) std::isnan(static_cast<double>(x)) +#else +#define ALI_ISNAN(x) std::isnan(x) +#endif + if (absMaxV * 0.01f < diffAbsMaxV || ALI_ISNAN(absMaxV)) { std::cout << "TESTERROR " << name << " value error : absMaxV:" << absMaxV << " - DiffMax:" << diffAbsMaxV << std::endl; return false; } diff --git a/tools/cv/CMakeLists.txt b/tools/cv/CMakeLists.txt index a067b928..2275cdc1 100644 --- a/tools/cv/CMakeLists.txt +++ b/tools/cv/CMakeLists.txt @@ -16,11 +16,15 @@ IF(MNN_BUILD_OPENCV) file(GLOB_RECURSE IMGCODECS_SRC ${CMAKE_CURRENT_LIST_DIR}/source/imgcodecs/*.cpp ${CMAKE_CURRENT_LIST_DIR}/include/cv/imgcodecs/*.hpp) endif() - IF(MNN_BUILD_SHARED_LIBS) - add_library(MNNOpenCV SHARED ${IMGPROC_SRC} ${IMGCODECS_SRC}) - target_link_libraries(MNNOpenCV MNN MNN_Express) + IF(MNN_SEP_BUILD) + IF(MNN_BUILD_SHARED_LIBS) + add_library(MNNOpenCV SHARED ${IMGPROC_SRC} ${IMGCODECS_SRC}) + target_link_libraries(MNNOpenCV MNN MNN_Express) + ELSE() + add_library(MNNOpenCV STATIC ${IMGPROC_SRC} ${IMGCODECS_SRC}) + ENDIF() ELSE() - add_library(MNNOpenCV STATIC ${IMGPROC_SRC} ${IMGCODECS_SRC}) + add_library(MNNOpenCV OBJECT ${IMGPROC_SRC} ${IMGCODECS_SRC}) ENDIF() IF(CMAKE_SYSTEM_NAME MATCHES "^Android" AND NOT MNN_BUILD_FOR_ANDROID_COMMAND) IF(NOT NATIVE_INCLUDE_OUTPUT) diff --git a/tools/cv/include/cv/imgproc/draw.hpp b/tools/cv/include/cv/imgproc/draw.hpp index 9e28cf22..d5333482 100644 --- a/tools/cv/include/cv/imgproc/draw.hpp +++ b/tools/cv/include/cv/imgproc/draw.hpp @@ -26,7 +26,8 @@ enum LineTypes { MNN_PUBLIC void arrowedLine(VARP& img, Point pt1, Point pt2, const Scalar& color, int thickness=1, int line_type=8, int shift=0, double
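
compareVar above accepts a result when the largest absolute difference stays within 1% of the largest absolute reference value, and rejects NaN references outright; the double cast mirrors the ALI_ISNAN workaround for VS2017 described in the comment. A tiny standalone version of that acceptance rule (the function name is illustrative):

    #include <cmath>
    #include <cstdio>

    static bool valuesMatch(float absMax, float diffAbsMax) {
        if (std::isnan(static_cast<double>(absMax))) return false; // NaN reference always fails
        return absMax * 0.01f >= diffAbsMax;                       // diff must stay within 1%
    }

    int main() {
        printf("%d %d\n", valuesMatch(10.f, 0.05f), valuesMatch(10.f, 0.5f)); // 1 0
        return 0;
    }
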
tipLength=0.1); -MNN_PUBLIC void circle(); +MNN_PUBLIC void circle(VARP& img, Point center, int radius, const Scalar& color, + int thickness=1, int line_type=8, int shift=0); MNN_PUBLIC void line(VARP& img, Point pt1, Point pt2, const Scalar& color, int thickness = 1, int lineType = LINE_8, int shift = 0); @@ -34,6 +35,11 @@ MNN_PUBLIC void line(VARP& img, Point pt1, Point pt2, const Scalar& color, MNN_PUBLIC void rectangle(VARP& img, Point pt1, Point pt2, const Scalar& color, int thickness = 1, int lineType = LINE_8, int shift = 0); +MNN_PUBLIC void drawContours(VARP& img, std::vector> _contours, int contourIdx, const Scalar& color, + int thickness = 1, int lineType = LINE_8); + +MNN_PUBLIC void fillPoly(VARP& img, std::vector> pts, const Scalar& color, + int line_type = LINE_8, int shift = 0, Point offset = {0, 0}); } // CV } // MNN #endif // DRAW_HPP diff --git a/tools/cv/include/cv/imgproc/geometric.hpp b/tools/cv/include/cv/imgproc/geometric.hpp index 52d6135c..26840124 100644 --- a/tools/cv/include/cv/imgproc/geometric.hpp +++ b/tools/cv/include/cv/imgproc/geometric.hpp @@ -56,11 +56,12 @@ MNN_PUBLIC Matrix getRotationMatrix2D(Point center, double angle, double scale); MNN_PUBLIC Matrix invertAffineTransform(Matrix M); MNN_PUBLIC VARP resize(VARP src, Size dsize, double fx = 0, double fy = 0, - int interpolation = INTER_LINEAR); + int interpolation = INTER_LINEAR, int code = -1, + std::vector mean = {}, std::vector norm = {}); MNN_PUBLIC VARP warpAffine(VARP src, Matrix M, Size dsize, - int flags = INTER_LINEAR, int borderMode = BORDER_CONSTANT, - int borderValue = 0); + int flags = INTER_LINEAR, int borderMode = BORDER_CONSTANT, int borderValue = 0, + int code = -1, std::vector mean = {}, std::vector norm = {}); MNN_PUBLIC VARP warpPerspective(VARP src, Matrix M, Size dsize, int flags = INTER_LINEAR, int borderMode = BORDER_CONSTANT, diff --git a/tools/cv/include/cv/imgproc/structural.hpp b/tools/cv/include/cv/imgproc/structural.hpp index 482717e1..faf4ff7a 100644 --- a/tools/cv/include/cv/imgproc/structural.hpp +++ b/tools/cv/include/cv/imgproc/structural.hpp @@ -44,13 +44,13 @@ public: }; typedef std::vector POINTS; -MNN_PUBLIC std::vector findContours(VARP image, int mode, int method, Point offset = {0, 0}); -MNN_PUBLIC double contourArea(POINTS _contour, bool oriented = false); -MNN_PUBLIC std::vector convexHull(POINTS _points, bool clockwise = false, bool returnPoints = true); -MNN_PUBLIC RotatedRect minAreaRect(POINTS _points); -MNN_PUBLIC Rect2i boundingRect(POINTS points); +MNN_PUBLIC std::vector findContours(VARP image, int mode, int method, Point offset = {0, 0}); +MNN_PUBLIC double contourArea(VARP _contour, bool oriented = false); +MNN_PUBLIC std::vector convexHull(VARP _points, bool clockwise = false, bool returnPoints = true); +MNN_PUBLIC RotatedRect minAreaRect(VARP _points); +MNN_PUBLIC Rect2i boundingRect(VARP points); MNN_PUBLIC int connectedComponentsWithStats(VARP image, VARP& labels, VARP& statsv, VARP& centroids, int connectivity = 8); -MNN_PUBLIC POINTS boxPoints(RotatedRect box); +MNN_PUBLIC VARP boxPoints(RotatedRect box); } // CV } // MNN #endif // STRUCTURAL_HPP diff --git a/tools/cv/include/cv/types.hpp b/tools/cv/include/cv/types.hpp index ee585f25..9cc4aee7 100644 --- a/tools/cv/include/cv/types.hpp +++ b/tools/cv/include/cv/types.hpp @@ -18,7 +18,7 @@ using namespace Express; #define MNN_PI 3.1415926535897932384626433832795 -typedef char schar; +typedef signed char schar; typedef unsigned char uchar; // Size Start @@ -194,6 +194,7 @@ public: Point_& 
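
The types.hpp hunk below replaces Scalar_'s named r, g, b, a members with a val[4] array. The draw routines copy the first `channel` bytes of a Scalar straight into the image buffer (see the memcpy in the old bresenham in draw.cpp further down), which relies on the channel values being one contiguous array rather than separate members. A sketch of that layout assumption with a hypothetical uint8_t scalar:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    struct ScalarU8 { uint8_t val[4]; }; // contiguous channels, like the new Scalar_

    int main() {
        ScalarU8 color = {{255, 128, 0, 255}};
        uint8_t pixel[3] = {0, 0, 0};
        std::memcpy(pixel, color.val, 3); // copy RGB, skip alpha, as draw code does per pixel
        printf("%d %d %d\n", pixel[0], pixel[1], pixel[2]); // 255 128 0
        return 0;
    }
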
operator = (const Point_& pt); Point_& operator = (Point_&& pt); + template operator Point_<_Tp2>() const; _Tp x; //!< x coordinate of the point _Tp y; //!< y coordinate of the point @@ -237,6 +238,32 @@ Point_<_Tp>& Point_<_Tp>::operator = (Point_&& pt) x = std::move(pt.x); y = std::move(pt.y); return *this; } + +template template inline +Point_<_Tp>::operator Point_<_Tp2>() const +{ + return Point_<_Tp2>(static_cast<_Tp2>(x), static_cast<_Tp2>(y)); +} + +template static inline +Point_<_Tp>& operator += (Point_<_Tp>& a, const Point_<_Tp>& b) +{ + a.x += b.x; + a.y += b.y; + return a; +} + +template static inline +Point_<_Tp> operator - (const Point_<_Tp>& a, const Point_<_Tp>& b) +{ + return Point_<_Tp>( static_cast<_Tp>(a.x - b.x), static_cast<_Tp>(a.y - b.y) ); +} + +template static inline +bool operator != (const Point_<_Tp>& a, const Point_<_Tp>& b) +{ + return a.x != b.x || a.y != b.y; +} // Point End // Rect Start template class Rect_ @@ -361,11 +388,21 @@ template class Scalar_ { public: //! default constructor Scalar_(); - Scalar_(_Tp _r, _Tp _g, _Tp _b) : r(_r), g(_g), b(_b), a(255) {}; - Scalar_(_Tp _r, _Tp _g, _Tp _b, _Tp _a) : r(_r), g(_g), b(_b), a(_a) {}; - _Tp r, g, b, a; + Scalar_(_Tp _r, _Tp _g, _Tp _b) { + val[0] = _r; + val[1] = _g; + val[2] = _b; + val[3] = 255; + }; + Scalar_(_Tp _r, _Tp _g, _Tp _b, _Tp _a) { + val[0] = _r; + val[1] = _g; + val[2] = _b; + val[3] = _a; + }; + _Tp val[4]; }; -typedef Scalar_ Scalar; +typedef Scalar_ Scalar; // Scalar End static void getVARPSize(VARP var, int* height, int* width, int* channel) { @@ -406,6 +443,9 @@ static int getVARPChannel(VARP var) { getVARPSize(var, &h, &w, &c); return c; } +static int getVARPByte(VARP var) { + return var->getInfo()->type.bytes(); +} } // CV } // MNN #endif // TYPES_HPP diff --git a/tools/cv/source/imgcodecs/imgcodecs.cpp b/tools/cv/source/imgcodecs/imgcodecs.cpp index 03b23f71..80f50316 100644 --- a/tools/cv/source/imgcodecs/imgcodecs.cpp +++ b/tools/cv/source/imgcodecs/imgcodecs.cpp @@ -17,6 +17,8 @@ #define STBI_ONLY_JPEG #define STBI_ONLY_PNG #define STBI_ONLY_BMP +#define STB_IMAGE_STATIC + #include "stb_image.h" #define STB_IMAGE_WRITE_IMPLEMENTATION #include "stb_image_write.h" @@ -117,9 +119,12 @@ VARP imread(const std::string& filename, int flags) { } bool imwrite(const std::string& filename, VARP img, const std::vector& params) { - VARP rgb = cvtColor(img, COLOR_BGR2RGB); int height, width, channel; - getVARPSize(rgb, &height, &width, &channel); + getVARPSize(img, &height, &width, &channel); + if (channel == 3) { + img = cvtColor(img, COLOR_BGR2RGB); + } + auto ext = getExt(filename); if (ext == "jpg" || ext == "jpeg") { int quality = 95; @@ -129,13 +134,13 @@ bool imwrite(const std::string& filename, VARP img, const std::vector& para break; } } - return stbi_write_jpg(filename.c_str(), width, height, channel, rgb->readMap(), quality); + return stbi_write_jpg(filename.c_str(), width, height, channel, img->readMap(), quality); } if (ext == ".png") { - return stbi_write_png(filename.c_str(), width, height, channel, rgb->readMap(), 0); + return stbi_write_png(filename.c_str(), width, height, channel, img->readMap(), 0); } if (ext == ".bmp") { - return stbi_write_bmp(filename.c_str(), width, height, channel, rgb->readMap()); + return stbi_write_bmp(filename.c_str(), width, height, channel, img->readMap()); } return false; } diff --git a/tools/cv/source/imgproc/color.cpp b/tools/cv/source/imgproc/color.cpp index 2419cf77..3c64dd61 100644 --- a/tools/cv/source/imgproc/color.cpp +++ 
b/tools/cv/source/imgproc/color.cpp @@ -13,7 +13,7 @@ namespace MNN { namespace CV { -static std::pair getSrcDstFormat(int code) { +std::pair getSrcDstFormat(int code) { switch (code) { #define CONVERT_SUFFIX(src, dst, suffix) \ case COLOR_##src##2##dst##_##suffix: \ @@ -75,7 +75,8 @@ static std::pair getSrcDstFormat(int code) { } return {CV::RGB, CV::RGB}; } -static int format2Channel(CV::ImageFormat format) { + +int format2Channel(CV::ImageFormat format) { switch (format) { case CV::RGB: case CV::BGR: diff --git a/tools/cv/source/imgproc/draw.cpp b/tools/cv/source/imgproc/draw.cpp index 9097b39b..447492e5 100644 --- a/tools/cv/source/imgproc/draw.cpp +++ b/tools/cv/source/imgproc/draw.cpp @@ -9,53 +9,831 @@ #include #include "cv/imgproc/draw.hpp" #include +#include #include namespace MNN { namespace CV { // help functions -// TODO: replace this function with an Op. -void bresenham(uint8_t* ptr, int h, int w, int c, int x1, int y1, int x2, int y2, Scalar color) { - int x = x1; - int y = y1; - int dx = abs(x2 - x1); - int dy = abs(y2 - y1); - int s1 = x2 > x1 ? 1 : -1; - int s2 = y2 > y1 ? 1 : -1; - bool interchange = false; - if (dy > dx) { - std::swap(dx, dy); - interchange = true; - } - int p = 2 * dy - dx; - for(int i = 0; i <= dx; i++) { - // printf("[%d, %d]\n", x, y); - memcpy(ptr + (y * w + x) * c, &color, c); - if (p >= 0) { - if (interchange) { - x += s1; - } else { - y += s2; +#define MIN(a,b) ((a) > (b) ? (b) : (a)) +#define MAX(a,b) ((a) < (b) ? (b) : (a)) + +struct Region { +public: + Region(int _y, int _xl, int _xr) : y(_y), xl(_xl), xr(_xr) {} + Region(int _y, int _xl) : y(_y), xl(_xl), xr(_xl) {} + int y; + int xl; + int xr; +}; + +bool clipLine(Size2l img_size, Point2l& pt1, Point2l& pt2) { + int c1, c2; + int64_t right = img_size.width-1, bottom = img_size.height-1; + if (img_size.width <= 0 || img_size.height <= 0) return false; + + int64_t &x1 = pt1.x, &y1 = pt1.y, &x2 = pt2.x, &y2 = pt2.y; + c1 = (x1 < 0) + (x1 > right) * 2 + (y1 < 0) * 4 + (y1 > bottom) * 8; + c2 = (x2 < 0) + (x2 > right) * 2 + (y2 < 0) * 4 + (y2 > bottom) * 8; + + if ((c1 & c2) == 0 && (c1 | c2) != 0) { + int64_t a; + if (c1 & 12) { + a = c1 < 8 ? 0 : bottom; + x1 += (int64_t)((double)(a - y1) * (x2 - x1) / (y2 - y1)); + y1 = a; + c1 = (x1 < 0) + (x1 > right) * 2; + } + if (c2 & 12) { + a = c2 < 8 ? 0 : bottom; + x2 += (int64_t)((double)(a - y2) * (x2 - x1) / (y2 - y1)); + y2 = a; + c2 = (x2 < 0) + (x2 > right) * 2; + } + if ((c1 & c2) == 0 && (c1 | c2) != 0) { + if (c1) { + a = c1 == 1 ? 0 : right; + y1 += (int64_t)((double)(a - x1) * (y2 - y1) / (x2 - x1)); + x1 = a; + c1 = 0; + } + if (c2) { + a = c2 == 1 ? 
0 : right; + y2 += (int64_t)((double)(a - x2) * (y2 - y1) / (x2 - x1)); + x2 = a; + c2 = 0; } - p -= 2 * dx; } - if (interchange) { - y += s2; - } else { - x += s1; + MNN_ASSERT((c1 & c2) != 0 || (x1 | y1 | x2 | y2) >= 0); + } + return (c1 | c2) == 0; +} +bool clipLine(Size img_size, Point2i& pt1, Point2i& pt2) { + Point2l p1(pt1.x, pt1.y); + Point2l p2(pt2.x, pt2.y); + bool inside = clipLine(Size2l(img_size.width, img_size.height), p1, p2); + pt1.x = (int)p1.x; + pt1.y = (int)p1.y; + pt2.x = (int)p2.x; + pt2.y = (int)p2.y; + return inside; +} + +enum { XY_SHIFT = 16, XY_ONE = 1 << XY_SHIFT, DRAWING_STORAGE_BLOCK = (1<<12) - 256 }; +static void Line(std::vector& regions, Size size, Point2i pt1_, Point2i pt2_, int connectivity = 8) { + if (connectivity == 0) { + connectivity = 8; + } else if (connectivity == 1) { + connectivity = 4; + } + int count = -1, err, minusDelta, plusDelta, minusStep, plusStep, minusShift, plusShift; + Point2i p = Point2i(0, 0); + Rect2i rect(0, 0, size.width, size.height); + Point2i pt1 = pt1_ - rect.tl(); + Point2i pt2 = pt2_ - rect.tl(); + + if ((unsigned)pt1.x >= (unsigned)(rect.width) || (unsigned)pt2.x >= (unsigned)(rect.width) || + (unsigned)pt1.y >= (unsigned)(rect.height) || (unsigned)pt2.y >= (unsigned)(rect.height)) { + if (!clipLine(Size(rect.width, rect.height), pt1, pt2)) { + err = plusDelta = minusDelta = plusStep = minusStep = plusShift = minusShift = count = 0; } - p += 2 * dy; + } + + pt1 += rect.tl(); + pt2 += rect.tl(); + + int delta_x = 1, delta_y = 1; + int dx = pt2.x - pt1.x; + int dy = pt2.y - pt1.y; + + if (dx < 0) { + dx = -dx; + dy = -dy; + pt1 = pt2; + } + + if (dy < 0) { + dy = -dy; + delta_y = -1; + } + + bool vert = dy > dx; + if (vert) { + std::swap(dx, dy); + std::swap(delta_x, delta_y); + } + + MNN_ASSERT(dx >= 0 && dy >= 0); + + if (connectivity == 8) { + err = dx - (dy + dy); + plusDelta = dx + dx; + minusDelta = -(dy + dy); + minusShift = delta_x; + plusShift = 0; + minusStep = 0; + plusStep = delta_y; + count = dx + 1; + } else /* connectivity == 4 */ { + err = 0; + plusDelta = (dx + dx) + (dy + dy); + minusDelta = -(dy + dy); + minusShift = delta_x; + plusShift = -delta_x; + minusStep = 0; + plusStep = delta_y; + count = dx + dy + 1; + } + + if (vert) { + std::swap(plusStep, plusShift); + std::swap(minusStep, minusShift); + } + p = pt1; + regions.emplace_back(Region{p.y, p.x}); + for(int i = 1; i < count; i++) { + int mask = err < 0 ? -1 : 0; + err += minusDelta + (plusDelta & mask); + p.y += minusStep + (plusStep & mask); + p.x += minusShift + (plusShift & mask); + regions.emplace_back(Region{p.y, p.x}); } } -std::vector getPoints(Point pt1, Point pt2, int thickness) { - int x1 = pt1.fX, y1 = pt1.fY, x2 = pt2.fX, y2 = pt2.fY; - std::vector pts { x1, y1, x2, y2 }; - for (int i = 0; i < thickness; i++) { - // x - i; +static void Line2(std::vector& regions, Size size, Point2l pt1, Point2l pt2) { + int64_t dx, dy; + int ecount; + int64_t ax, ay; + int64_t i, j; + int x, y; + int64_t x_step, y_step; + Size2l sizeScaled(((int64_t)size.width) << XY_SHIFT, ((int64_t)size.height) << XY_SHIFT); + if(!clipLine(sizeScaled, pt1, pt2)) { + return; } - return pts; + dx = pt2.x - pt1.x; + dy = pt2.y - pt1.y; + j = dx < 0 ? -1 : 0; + ax = (dx ^ j) - j; + i = dy < 0 ? 
-1 : 0; + ay = (dy ^ i) - i; + + if (ax > ay) { + dy = (dy ^ j) - j; + pt1.x ^= pt2.x & j; + pt2.x ^= pt1.x & j; + pt1.x ^= pt2.x & j; + pt1.y ^= pt2.y & j; + pt2.y ^= pt1.y & j; + pt1.y ^= pt2.y & j; + + x_step = XY_ONE; + y_step = (dy << XY_SHIFT) / (ax | 1); + ecount = (int)((pt2.x - pt1.x) >> XY_SHIFT); + } else { + dx = (dx ^ i) - i; + pt1.x ^= pt2.x & i; + pt2.x ^= pt1.x & i; + pt1.x ^= pt2.x & i; + pt1.y ^= pt2.y & i; + pt2.y ^= pt1.y & i; + pt1.y ^= pt2.y & i; + + x_step = (dx << XY_SHIFT) / (ay | 1); + y_step = XY_ONE; + ecount = (int)((pt2.y - pt1.y) >> XY_SHIFT); + } + pt1.x += (XY_ONE >> 1); + pt1.y += (XY_ONE >> 1); + regions.emplace_back(Region{(int)((pt2.y + (XY_ONE >> 1)) >> XY_SHIFT), (int)((pt2.x + (XY_ONE >> 1)) >> XY_SHIFT)}); + if (ax > ay) { + pt1.x >>= XY_SHIFT; + while(ecount >= 0) { + regions.emplace_back(Region{(int)(pt1.y >> XY_SHIFT), (int)(pt1.x)}); + pt1.x++; + pt1.y += y_step; + ecount--; + } + } else { + pt1.y >>= XY_SHIFT; + while(ecount >= 0) { + regions.emplace_back(Region{(int)(pt1.y), (int)(pt1.x >> XY_SHIFT)}); + pt1.x += x_step; + pt1.y++; + ecount--; + } + } +} + +static void FillConvexPoly(std::vector& regions, Size size, const Point2l* v, int npts, int line_type, int shift) { + struct { + int idx, di; + int64_t x, dx; + int ye; + } edge[2]; + + int delta = 1 << shift >> 1; + int i, y, imin = 0; + int edges = npts; + int64_t xmin, xmax, ymin, ymax; + Point2l p0; + int delta1, delta2; + + delta1 = delta2 = XY_ONE >> 1; + + p0 = v[npts - 1]; + p0.x <<= XY_SHIFT - shift; + p0.y <<= XY_SHIFT - shift; + + MNN_ASSERT(0 <= shift && shift <= XY_SHIFT); + xmin = xmax = v[0].x; + ymin = ymax = v[0].y; + + for (i = 0; i < npts; i++) { + Point2l p = v[i]; + if (p.y < ymin) { + ymin = p.y; + imin = i; + } + + ymax = std::max(ymax, p.y); + xmax = std::max(xmax, p.x); + xmin = MIN(xmin, p.x); + + p.x <<= XY_SHIFT - shift; + p.y <<= XY_SHIFT - shift; + + if(!shift) { + Point2i pt0, pt1; + pt0.x = (int)(p0.x >> XY_SHIFT); + pt0.y = (int)(p0.y >> XY_SHIFT); + pt1.x = (int)(p.x >> XY_SHIFT); + pt1.y = (int)(p.y >> XY_SHIFT); + Line(regions, size, pt0, pt1, line_type); + } else { + Line2(regions, size, p0, p); + } + p0 = p; + } + + xmin = (xmin + delta) >> shift; + xmax = (xmax + delta) >> shift; + ymin = (ymin + delta) >> shift; + ymax = (ymax + delta) >> shift; + + if(npts < 3 || (int)xmax < 0 || (int)ymax < 0 || (int)xmin >= size.width || (int)ymin >= size.height) { + return; + } + ymax = MIN(ymax, size.height - 1); + edge[0].idx = edge[1].idx = imin; + edge[0].ye = edge[1].ye = y = (int)ymin; + edge[0].di = 1; + edge[1].di = npts - 1; + edge[0].x = edge[1].x = -XY_ONE; + edge[0].dx = edge[1].dx = 0; + int region_y = y; + do { + if (y < (int)ymax || y == (int)ymin) { + for (i = 0; i < 2; i++) { + if (y >= edge[i].ye) { + int idx0 = edge[i].idx, di = edge[i].di; + int idx = idx0 + di; + if (idx >= npts) idx -= npts; + int ty = 0; + + for (; edges-- > 0; ) { + ty = (int)((v[idx].y + delta) >> shift); + if (ty > y) { + int64_t xs = v[idx0].x; + int64_t xe = v[idx].x; + if (shift != XY_SHIFT) + { + xs <<= XY_SHIFT - shift; + xe <<= XY_SHIFT - shift; + } + + edge[i].ye = ty; + edge[i].dx = ((xe - xs)*2 + (ty - y)) / (2 * (ty - y)); + edge[i].x = xs; + edge[i].idx = idx; + break; + } + idx0 = idx; + idx += di; + if (idx >= npts) idx -= npts; + } + } + } + } + + if (edges < 0) + break; + + if (y >= 0) { + int left = 0, right = 1; + if (edge[0].x > edge[1].x) + { + left = 1, right = 0; + } + + int xx1 = (int)((edge[left].x + delta1) >> XY_SHIFT); + int xx2 = 
(int)((edge[right].x + delta2) >> XY_SHIFT); + + if(xx2 >= 0 && xx1 < size.width) + { + if(xx1 < 0) { + xx1 = 0; + } + if(xx2 >= size.width) { + xx2 = size.width - 1; + } + if (xx2 - xx1 > 0) regions.emplace_back(Region{region_y, xx1, xx2}); + } + } + + edge[0].x += edge[0].dx; + edge[1].x += edge[1].dx; + region_y++; + } while(++y <= (int)ymax); +} + +static void sincos(int angle, float& cosval, float& sinval) { + angle += (angle < 0 ? 360 : 0); + float radian = angle * MNN_PI / 180; + sinval = sin(radian); + cosval = cos(radian); +} + +void ellipse2Poly(Point2d center, Size2d axes, int angle, int arc_start, int arc_end, int delta, std::vector& pts) { + MNN_ASSERT(0 < delta && delta <= 180); + + float alpha, beta; + int i; + + while(angle < 0) angle += 360; + while(angle > 360) angle -= 360; + + if (arc_start > arc_end) { + i = arc_start; + arc_start = arc_end; + arc_end = i; + } + while (arc_start < 0) { + arc_start += 360; + arc_end += 360; + } + while (arc_end > 360) { + arc_end -= 360; + arc_start -= 360; + } + if (arc_end - arc_start > 360) { + arc_start = 0; + arc_end = 360; + } + sincos(angle, alpha, beta); + pts.resize(0); + + for (i = arc_start; i < arc_end + delta; i += delta) { + double x, y; + angle = i; + if (angle > arc_end) angle = arc_end; + float sinv, cosv; + sincos(angle, sinv, cosv); + x = axes.width * cosv; + y = axes.height * sinv; + Point2d pt; + pt.x = center.x + x * alpha - y * beta; + pt.y = center.y + x * beta + y * alpha; + pts.push_back(pt); + } + + // If there are no points, it's a zero-size polygon + if( pts.size() == 1) { + pts.assign(2,center); + } +} +static void ThickLine(std::vector& regions, Size size, Point2l p0, Point2l p1, int thickness, int line_type, int flags, int shift); +static void PolyLine(std::vector& regions, Size size, const Point2l* v, int count, bool is_closed, int thickness, int line_type, int shift) { + if (!v || count <= 0) { + return; + } + + int i = is_closed ? 
count - 1 : 0; + int flags = 2 + !is_closed; + Point2l p0; + MNN_ASSERT(0 <= shift && shift <= XY_SHIFT && thickness >= 0); + + p0 = v[i]; + for (i = !is_closed; i < count; i++) { + Point2l p = v[i]; + ThickLine(regions, size, p0, p, thickness, line_type, flags, shift ); + p0 = p; + flags = 2; + } +} + +struct PolyEdge { + PolyEdge() : y0(0), y1(0), x(0), dx(0), next(0) {} + + int y0, y1; + int64_t x, dx; + PolyEdge *next; +}; + +static void CollectPolyEdges(std::vector& regions, Size size, const Point2l* v, int count, std::vector& edges, int line_type, int shift, Point2i offset = Point2i()) { + int delta = offset.y + ((1 << shift) >> 1); + Point2l pt0 = v[count-1], pt1; + pt0.x = (pt0.x + offset.x) << (XY_SHIFT - shift); + pt0.y = (pt0.y + delta) >> shift; + + edges.reserve(edges.size() + count); + + for (int i = 0; i < count; i++, pt0 = pt1) { + Point2l t0, t1; + PolyEdge edge; + + pt1 = v[i]; + pt1.x = (pt1.x + offset.x) << (XY_SHIFT - shift); + pt1.y = (pt1.y + delta) >> shift; + + t0.y = pt0.y; t1.y = pt1.y; + t0.x = (pt0.x + (XY_ONE >> 1)) >> XY_SHIFT; + t1.x = (pt1.x + (XY_ONE >> 1)) >> XY_SHIFT; + Line(regions, size, t0, t1, line_type); + + if (pt0.y == pt1.y) continue; + + if (pt0.y < pt1.y) { + edge.y0 = (int)(pt0.y); + edge.y1 = (int)(pt1.y); + edge.x = pt0.x; + } else { + edge.y0 = (int)(pt1.y); + edge.y1 = (int)(pt0.y); + edge.x = pt1.x; + } + edge.dx = (pt1.x - pt0.x) / (pt1.y - pt0.y); + edges.push_back(edge); + } +} + +static void FillEdgeCollection(std::vector& regions, Size size, std::vector& edges) { + PolyEdge tmp; + int i, y, total = (int)edges.size(); + PolyEdge* e; + int y_max = std::numeric_limits::min(), y_min = std::numeric_limits::max(); + int64_t x_max = 0xFFFFFFFFFFFFFFFF, x_min = 0x7FFFFFFFFFFFFFFF; + + if (total < 2) return; + + for (i = 0; i < total; i++) { + PolyEdge& e1 = edges[i]; + MNN_ASSERT(e1.y0 < e1.y1); + // Determine x-coordinate of the end of the edge. + // (This is not necessary x-coordinate of any vertex in the array.) 
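+ // i.e. project from the edge's upper endpoint (y0) down to y1 along the per-scanline slope dx, in 16.16 fixed point, so x_min/x_max below bound the whole edge horizontally.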
+ int64_t x1 = e1.x + (e1.y1 - e1.y0) * e1.dx; + y_min = std::min( y_min, e1.y0 ); + y_max = std::max( y_max, e1.y1 ); + x_min = std::min( x_min, e1.x ); + x_max = std::max( x_max, e1.x ); + x_min = std::min( x_min, x1 ); + x_max = std::max( x_max, x1 ); + } + + if (y_max < 0 || y_min >= size.height || x_max < 0 || x_min >= ((int64_t)size.width<::max(); + edges.push_back(tmp); // after this point we do not add + // any elements to edges, thus we can use pointers + i = 0; + tmp.next = 0; + e = &edges[i]; + y_max = MIN(y_max, size.height); + + for (y = e->y0; y < y_max; y++) { + PolyEdge *last, *prelast, *keep_prelast; + int sort_flag = 0; + int draw = 0; + int clipline = y < 0; + + prelast = &tmp; + last = tmp.next; + while (last || e->y0 == y) { + if (last && last->y1 == y) { + // exclude edge if y reaches its lower point + prelast->next = last->next; + last = last->next; + continue; + } + keep_prelast = prelast; + if (last && (e->y0 > y || last->x < e->x)) { + // go to the next edge in active list + prelast = last; + last = last->next; + } else if(i < total) { + // insert new edge into active list if y reaches its upper point + prelast->next = e; + e->next = last; + prelast = e; + e = &edges[++i]; + } else { + break; + } + + if (draw) { + if(!clipline) { + // convert x's from fixed-point to image coordinates + // uchar *timg = const_cast(img->readMap()) + (y * pix_size * w); + int x1, x2; + + if (keep_prelast->x > prelast->x) { + x1 = (int)((prelast->x + XY_ONE - 1) >> XY_SHIFT); + x2 = (int)(keep_prelast->x >> XY_SHIFT); + } else { + x1 = (int)((keep_prelast->x + XY_ONE - 1) >> XY_SHIFT); + x2 = (int)(prelast->x >> XY_SHIFT); + } + + // clip and draw the line + if( x1 < size.width && x2 >= 0 ) + { + if (x1 < 0) x1 = 0; + if (x2 >= size.width) x2 = size.width - 1; + regions.emplace_back(Region{y, x1, x2}); + } + } + keep_prelast->x += keep_prelast->dx; + prelast->x += prelast->dx; + } + draw ^= 1; + } + + // sort edges (using bubble sort) + keep_prelast = 0; + do { + prelast = &tmp; + last = tmp.next; + + while (last != keep_prelast && last->next != 0) { + PolyEdge *te = last->next; + // swap edges + if (last->x > te->x) { + prelast->next = te; + last->next = te->next; + te->next = last; + prelast = te; + sort_flag = 1; + } else { + prelast = last; + last = te; + } + } + keep_prelast = prelast; + } while(sort_flag && keep_prelast != tmp.next && keep_prelast != &tmp); + } +} + +static void EllipseEx(std::vector& regions, Size size, Point2l center, Size2l axes, int angle, int arc_start, int arc_end, int thickness, int line_type) { + axes.width = std::abs(axes.width), axes.height = std::abs(axes.height); + int delta = (int)((std::max(axes.width,axes.height)+(XY_ONE>>1))>>XY_SHIFT); + delta = delta < 3 ? 90 : delta < 10 ? 30 : delta < 15 ? 
18 : 5; + + std::vector _v; + ellipse2Poly(Point2d((double)center.x, (double)center.y), Size2d((double)axes.width, (double)axes.height), angle, arc_start, arc_end, delta, _v); + + std::vector v; + Point2l prevPt(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF); + v.resize(0); + for (unsigned int i = 0; i < _v.size(); ++i) + { + Point2l pt; + pt.x = (int64_t)std::round(_v[i].x / XY_ONE) << XY_SHIFT; + pt.y = (int64_t)std::round(_v[i].y / XY_ONE) << XY_SHIFT; + pt.x += std::round(_v[i].x - pt.x); + pt.y += std::round(_v[i].y - pt.y); + if (pt != prevPt) { + v.push_back(pt); + prevPt = pt; + } + } + + // If there are no points, it's a zero-size polygon + if (v.size() == 1) { + v.assign(2, center); + } + + if (thickness >= 0) { + PolyLine(regions, size, &v[0], (int)v.size(), false, thickness, line_type, XY_SHIFT); + } else if( arc_end - arc_start >= 360 ) { + FillConvexPoly(regions, size, &v[0], (int)v.size(), line_type, XY_SHIFT); + } else { + v.push_back(center); + std::vector edges; + CollectPolyEdges(regions, size, &v[0], (int)v.size(), edges, line_type, XY_SHIFT); + FillEdgeCollection(regions, size, edges); + } +} + +static void Circle(std::vector& regions, Size size, Point2i center, int radius, int fill) { + int err = 0, dx = radius, dy = 0, plus = 1, minus = (radius << 1) - 1; + int inside = center.x >= radius && center.x < size.width - radius && + center.y >= radius && center.y < size.height - radius; + + while (dx >= dy) { + int mask; + int y11 = center.y - dy, y12 = center.y + dy, y21 = center.y - dx, y22 = center.y + dx; + int x11 = center.x - dx, x12 = center.x + dx, x21 = center.x - dy, x22 = center.x + dy; + + if (inside) { + if(!fill) { + regions.emplace_back(Region{y11, x11}); + regions.emplace_back(Region{y12, x11}); + regions.emplace_back(Region{y11, x12}); + regions.emplace_back(Region{y12, x12}); + regions.emplace_back(Region{y21, x21}); + regions.emplace_back(Region{y22, x21}); + regions.emplace_back(Region{y21, x22}); + regions.emplace_back(Region{y22, x22}); + } else { + regions.emplace_back(Region{y11, x11, x12}); + regions.emplace_back(Region{y12, x11, x12}); + regions.emplace_back(Region{y21, x21, x22}); + regions.emplace_back(Region{y22, x21, x22}); + } + } else if (x11 < size.width && x12 >= 0 && y21 < size.height && y22 >= 0) { + if (fill) { + x11 = std::max(x11, 0); + x12 = MIN(x12, size.width - 1); + } + if ((unsigned)y11 < (unsigned)size.height) { + if (!fill) { + if(x11 >= 0) regions.emplace_back(Region{y11, x11}); + if(x12 < size.width) regions.emplace_back(Region{y11, x12}); + } else { + regions.emplace_back(Region{y11, x11, x12}); + } + } + if ((unsigned)y12 < (unsigned)size.height) { + if(!fill) { + if(x11 >= 0) regions.emplace_back(Region{y12, x11}); + if(x12 < size.width) regions.emplace_back(Region{y12, x12}); + } else { + regions.emplace_back(Region{y12, x11, x12}); + } + } + + if (x21 < size.width && x22 >= 0) { + if (fill) { + x21 = std::max(x21, 0); + x22 = MIN(x22, size.width - 1); + } + if ((unsigned)y21 < (unsigned)size.height) { + if(!fill) { + if(x21 >= 0) regions.emplace_back(Region{y21, x21}); + if(x22 < size.width) regions.emplace_back(Region{y21, x22}); + } else { + regions.emplace_back(Region{y21, x21, x22}); + } + } + if ((unsigned)y22 < (unsigned)size.height) { + if(!fill) { + if(x21 >= 0) regions.emplace_back(Region{y22, x21}); + if(x22 < size.width) regions.emplace_back(Region{y22, x22}); + } else { + regions.emplace_back(Region{y22, x21, x22}); + } + } + } + } + dy++; + err += plus; + plus += 2; + mask = (err <= 0) - 1; + err -= minus & mask; + 
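+ // Branchless midpoint-circle step: mask is all ones only while err > 0, so the corrections to err, dx and minus apply without a conditional.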
dx += mask; + minus -= mask & 2; + } +} + +static void ThickLine(std::vector& regions, Size size, Point2l p0, Point2l p1, int thickness, int line_type, int flags, int shift) { + constexpr double INV_XY_ONE = 1./XY_ONE; + p0.x <<= XY_SHIFT - shift; + p0.y <<= XY_SHIFT - shift; + p1.x <<= XY_SHIFT - shift; + p1.y <<= XY_SHIFT - shift; + + if(thickness <= 1) { + if (line_type == 1 || line_type == 4 || shift == 0) { + p0.x = (p0.x + (XY_ONE>>1)) >> XY_SHIFT; + p0.y = (p0.y + (XY_ONE>>1)) >> XY_SHIFT; + p1.x = (p1.x + (XY_ONE>>1)) >> XY_SHIFT; + p1.y = (p1.y + (XY_ONE>>1)) >> XY_SHIFT; + Line(regions, size, p0, p1, line_type); + } else { + Line2(regions, size, p0, p1); + } + } else { + Point2l pt[4], dp = Point2i(0,0); + double dx = (p0.x - p1.x)*INV_XY_ONE, dy = (p1.y - p0.y)*INV_XY_ONE; + double r = dx * dx + dy * dy; + int i, oddThickness = thickness & 1; + thickness <<= XY_SHIFT - 1; + + if( fabs(r) > 2.2e-16 ) { + r = (thickness + oddThickness * XY_ONE * 0.5) / std::sqrt(r); + dp.x = std::round( dy * r ); + dp.y = std::round( dx * r ); + + pt[0].x = p0.x + dp.x; + pt[0].y = p0.y + dp.y; + pt[1].x = p0.x - dp.x; + pt[1].y = p0.y - dp.y; + pt[2].x = p1.x - dp.x; + pt[2].y = p1.y - dp.y; + pt[3].x = p1.x + dp.x; + pt[3].y = p1.y + dp.y; + FillConvexPoly(regions, size, pt, 4, line_type, XY_SHIFT); + } + + for(i = 0; i < 2; i++) { + if(flags & (i+1)) { + Point2i center; + center.x = (int)((p0.x + (XY_ONE>>1)) >> XY_SHIFT); + center.y = (int)((p0.y + (XY_ONE>>1)) >> XY_SHIFT); + Circle(regions, size, center, (thickness + (XY_ONE>>1)) >> XY_SHIFT, 1); + } + p0 = p1; + } + } +} + +template static inline +void scalarToRawData_(const Scalar& s, T * const buf, const int cn) { + for(int i = 0; i < cn; i++) { + buf[i] = static_cast(s.val[i]); + } +} + +void scalarToRawData(const Scalar& s, void* buf, VARP img) { + auto type = img->getInfo()->type; + int cn = getVARPChannel(img); + if (type == halide_type_of()) { + scalarToRawData_(s, (uchar*)buf, cn); + } else if (type == halide_type_of()) { + scalarToRawData_(s, (float*)buf, cn); + } else if (type == halide_type_of()) { + scalarToRawData_(s, (double*)buf, cn); + } else if (type == halide_type_of()) { + scalarToRawData_(s, (int*)buf, cn); + } +} + +std::vector mergeRegions(std::vector regions) { + std::vector res; + // 1. get line's region + std::map>> lines; + for (auto region : regions) { + if (lines.find(region.y) != lines.end()) { + lines[region.y].push_back({region.xl, region.xr}); + } else { + lines[region.y] = std::vector>(); + lines[region.y].push_back({region.xl, region.xr}); + } + } + // 2. 
merge line's region + for (auto line : lines) { + auto liner = line.second; + // sort line regions + std::sort(liner.begin(), liner.end(), [](const std::pair& a, const std::pair& b){return a.first < b.first;}); + // merge + res.emplace_back(Region{line.first, liner[0].first, liner[0].second}); + for (int i = 1; i < liner.size(); i++) { + if (res.back().xr >= liner[i].second) { + res.back().xr = MAX(res.back().xr, liner[i].second); + } else { + res.emplace_back(Region{line.first, liner[i].first, liner[i].second}); + } + } + } + return res; +} + +void doDraw(VARP& img, const std::vector& regions, const Scalar& color) { + double buf[4]; + scalarToRawData(color, buf, img); + auto mergeRegs = mergeRegions(regions); + ImageProcess::Config config; + config.draw = true; + std::unique_ptr process(ImageProcess::create(config)); + int h, w, c; getVARPSize(img, &h, &w, &c); + process->draw(img->writeMap(), w, h, c, reinterpret_cast(mergeRegs.data()), mergeRegs.size(), (uint8_t*)buf); + } void arrowedLine(VARP& img, Point pt1, Point pt2, const Scalar& color, @@ -76,17 +854,32 @@ void arrowedLine(VARP& img, Point pt1, Point pt2, const Scalar& color, line(img, p, pt2, color, thickness, line_type, shift); } +void circle(VARP& img, Point center, int radius, const Scalar& color, int thickness, int line_type, int shift) { + Point2i center_(static_cast(center.fX), static_cast(center.fY)); + int h, w, c; getVARPSize(img, &h, &w, &c); + Size size(w, h); + std::vector regions; + if( thickness > 1 || line_type != LINE_8 || shift > 0 ) { + Point2l _center(center_); + int64_t _radius(radius); + _center.x <<= XY_SHIFT - shift; + _center.y <<= XY_SHIFT - shift; + _radius <<= XY_SHIFT - shift; + EllipseEx(regions, size, _center, Size2l(_radius, _radius), 0, 0, 360, thickness, line_type); + } else { + Circle(regions, size, center_, radius, thickness < 0); + } + doDraw(img, regions, color); +} + void line(VARP& img, Point pt1, Point pt2, const Scalar& color, int thickness, int lineType, int shift) { - int h = 0, w = 0, c = 0; - getVARPSize(img, &h, &w, &c); - auto ptr = img->writeMap(); - int x1 = static_cast(pt1.fX), y1 = static_cast(pt1.fY); - int x2 = static_cast(pt2.fX), y2 = static_cast(pt2.fY); - for (int i = 0; i < thickness; i++) { - // bresenham(ptr, h, w, c, x1[i], y1[i], x2[i], y2[i], color); - } - bresenham(ptr, h, w, c, x1, y1, x2, y2, color); + int h, w, c; getVARPSize(img, &h, &w, &c); + Point2i p1(static_cast(pt1.fX), static_cast(pt1.fY)); + Point2i p2(static_cast(pt2.fX), static_cast(pt2.fY)); + std::vector regions; + ThickLine(regions, Size{w, h}, p1, p2, thickness, lineType, 3, shift); + doDraw(img, regions, color); } void rectangle(VARP& img, Point pt1, Point pt2, const Scalar& color, @@ -101,5 +894,77 @@ void rectangle(VARP& img, Point pt1, Point pt2, const Scalar& color, line(img, {pt1.fX, pt2.fY}, pt2, color, thickness, lineType); } +void drawContours(VARP& img, std::vector> _contours, int contourIdx, const Scalar& color, int thickness, int lineType) { + size_t ncontours = _contours.size(); + if (!ncontours) return; + int h, w, c; getVARPSize(img, &h, &w, &c); + Size size(w, h); + std::vector regions; + size_t i = 0, first = 0, last = ncontours; + if (contourIdx >= 0) { + first = contourIdx; + last = first + 1; + } + std::vector edges; + for (i = first; i < last; i++) { + const auto& contour = _contours[i]; + if (contour.empty()) continue; + std::vector pts; + for (int j = 0; j < contour.size(); j++) { + int nextj = j + 1 == contour.size() ? 
0 : j + 1; + Point2l pt1(contour[j].fX, contour[j].fY), pt2(contour[nextj].fX, contour[nextj].fY); + if(thickness >= 0) { + ThickLine(regions, size, pt1, pt2, thickness, lineType, 2, 0); + } else { + if (!j) pts.push_back(pt1); + pts.push_back(pt2); + } + } + if (thickness < 0) { + CollectPolyEdges(regions, size, &pts[0], (int)pts.size(), edges, lineType, 0); + } + } + if (thickness < 0) { + FillEdgeCollection(regions, size, edges); + } + doDraw(img, regions, color); +} + +void fillPoly(VARP& img, std::vector> _pts, const Scalar& color, int line_type, int shift, Point _offset) { + int ncontours = _pts.size(); + if (!ncontours) return; + int h, w, c; + getVARPSize(img, &h, &w, &c); + Size size(w, h); + std::vector regions; + std::vector> pts(ncontours); + std::vector _ptsptr(ncontours); + std::vector _npts(ncontours); + Point2i** ptsptr = _ptsptr.data(); + int *npts = _npts.data(), total = 0; + for(int i = 0; i < ncontours; i++ ) { + int num = _pts[i].size(); + pts[i].resize(num); + for (int j = 0; j < num; j++) { + pts[i][j].x = _pts[i][j].fX; + pts[i][j].y = _pts[i][j].fY; + } + ptsptr[i] = pts[i].data(); + npts[i] = num; + total += num; + } + if(line_type == LINE_AA && img->getInfo()->type == halide_type_of()) line_type = 8; + MNN_ASSERT(ptsptr && npts && ncontours >= 0 && 0 <= shift && shift <= XY_SHIFT); + std::vector edges; + Point2i offset(_offset.fX, _offset.fY); + edges.reserve( total + 1 ); + for (int i = 0; i < ncontours; i++) { + std::vector _pts(ptsptr[i], ptsptr[i] + npts[i]); + CollectPolyEdges(regions, size, _pts.data(), npts[i], edges, line_type, shift, offset); + } + FillEdgeCollection(regions, size, edges); + doDraw(img, regions, color); +} + } // CV } // MNN diff --git a/tools/cv/source/imgproc/filter.cpp b/tools/cv/source/imgproc/filter.cpp index 0695c32a..328b7c5a 100644 --- a/tools/cv/source/imgproc/filter.cpp +++ b/tools/cv/source/imgproc/filter.cpp @@ -136,7 +136,7 @@ VARP dilate(VARP src, VARP kernel, int iterations, int borderType) { int kheight, kwidth, kchannel; getVARPSize(kernel, &kheight, &kwidth, &kchannel); auto padSrc = PadForConv(src, kheight, kwidth, borderType); - return _Squeeze(_MaxPool(padSrc, {3, 3}), {0}); + return _Squeeze(_MaxPool(padSrc, {kheight, kwidth}), {0}); } VARP filter2D(VARP src, int ddepth, VARP kernel, double delta, int borderType) { diff --git a/tools/cv/source/imgproc/geometric.cpp b/tools/cv/source/imgproc/geometric.cpp index 1ee1f340..4c63b505 100644 --- a/tools/cv/source/imgproc/geometric.cpp +++ b/tools/cv/source/imgproc/geometric.cpp @@ -56,8 +56,12 @@ Matrix getRotationMatrix2D(Point center, double angle, double scale) { return M; } -VARP resize(VARP src, Size dsize, double fx, double fy, int interpolation) { +extern std::pair getSrcDstFormat(int code); +extern int format2Channel(CV::ImageFormat format); + +VARP resize(VARP src, Size dsize, double fx, double fy, int interpolation, int code, std::vector mean, std::vector norm) { int ih, iw, ic; + auto type = src->getInfo()->type; getVARPSize(src, &ih, &iw, &ic); int oh = dsize.height, ow = dsize.width; if (!oh && !ow) { @@ -66,30 +70,55 @@ VARP resize(VARP src, Size dsize, double fx, double fy, int interpolation) { } fx = static_cast(iw) / ow; fy = static_cast(ih) / oh; - auto dest = Tensor::create({1, oh, ow, ic}, halide_type_of()); ImageProcess::Config config; + // cvtColor + int oc = ic; + if (code >= 0) { + auto format = getSrcDstFormat(code); + config.sourceFormat = format.first; + config.destFormat = format.second; + oc = format2Channel(format.second); + } else { + 
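+ // No explicit conversion code given: infer an identity source/dest format from the channel count (1 -> GRAY, 4 -> RGBA, otherwise RGB).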
ImageFormat format = RGB; + if (ic == 1) { + format = GRAY; + } else if (ic == 4) { + format = RGBA; + } + config.sourceFormat = format; + config.destFormat = format; + } + // toFloat + auto dstType = type; + if (!mean.empty() || !norm.empty()) { + for (int i = 0; i < mean.size() && i < 4; i++) { + config.mean[i] = mean[i]; + } + for (int i = 0; i < norm.size() && i < 4; i++) { + config.normal[i] = norm[i]; + } + dstType = halide_type_of(); + } config.filterType = static_cast(interpolation); - config.sourceFormat = RGB; - config.destFormat = RGB; std::unique_ptr process(ImageProcess::create(config)); + auto dest = Tensor::create({1, oh, ow, oc}, dstType); Matrix tr; tr.postScale(fx, fy); tr.postTranslate(0.5 * (fx - 1), 0.5 * (fy - 1)); process->setMatrix(tr); - process->convert(src->readMap(), iw, ih, 0, dest->host(), ow, oh, ic, 0, halide_type_of()); + process->convert(src->readMap(), iw, ih, 0, dest->host(), ow, oh, oc, 0, dstType); auto res = Express::Variable::create(Express::Expr::create(dest, true), 0); return _Squeeze(res, {0}); } -VARP warpAffine(VARP src, Matrix M, Size dsize, int flags, int borderMode, int borderValue) { +VARP warpAffine(VARP src, Matrix M, Size dsize, int flags, int borderMode, int borderValue, int code, std::vector mean, std::vector norm) { int ih, iw, ic; + auto type = src->getInfo()->type; getVARPSize(src, &ih, &iw, &ic); int oh = dsize.height, ow = dsize.width; - auto dest = Tensor::create({1, oh, ow, ic}, halide_type_of()); + // auto dest = Tensor::create({1, oh, ow, ic}, type); ImageProcess::Config config; config.filterType = flags < 3 ? static_cast(flags) : BILINEAR; - config.sourceFormat = RGB; - config.destFormat = RGB; switch (borderMode) { case BORDER_CONSTANT: config.wrap = ZERO; @@ -104,6 +133,35 @@ VARP warpAffine(VARP src, Matrix M, Size dsize, int flags, int borderMode, int b MNN_ERROR("Don't support borderMode!"); break; } + // cvtColor + int oc = ic; + if (code >= 0) { + auto format = getSrcDstFormat(code); + config.sourceFormat = format.first; + config.destFormat = format.second; + oc = format2Channel(format.second); + } else { + ImageFormat format = RGB; + if (ic == 1) { + format = GRAY; + } else if (ic == 4) { + format = RGBA; + } + config.sourceFormat = format; + config.destFormat = format; + } + // toFloat + auto dstType = type; + if (!mean.empty() || !norm.empty()) { + for (int i = 0; i < mean.size() && i < 4; i++) { + config.mean[i] = mean[i]; + } + for (int i = 0; i < norm.size() && i < 4; i++) { + config.normal[i] = norm[i]; + } + dstType = halide_type_of(); + } + auto dest = Tensor::create({1, oh, ow, oc}, dstType); std::unique_ptr process(ImageProcess::create(config)); if (flags != WARP_INVERSE_MAP) { bool invert = M.invert(&M); @@ -111,7 +169,7 @@ VARP warpAffine(VARP src, Matrix M, Size dsize, int flags, int borderMode, int b } process->setMatrix(M); process->setPadding(borderValue); - process->convert(src->readMap(), iw, ih, 0, dest->host(), ow, oh, ic, 0, halide_type_of()); + process->convert(src->readMap(), iw, ih, 0, dest->host(), ow, oh, oc, 0, dstType); auto res = Express::Variable::create(Express::Expr::create(dest, true), 0); return _Squeeze(res, {0}); } diff --git a/tools/cv/source/imgproc/structural.cpp b/tools/cv/source/imgproc/structural.cpp index de8b6162..b6d4e2f3 100644 --- a/tools/cv/source/imgproc/structural.cpp +++ b/tools/cv/source/imgproc/structural.cpp @@ -111,7 +111,7 @@ static CvContourScanner cvStartFindContours( VARP _img, CvPoint offset, int mode return scanner; } -static void icvFetchContour(schar* ptr, 
int step, CvPoint pt, bool is_hole, int method, POINTS& points) +static void icvFetchContour(schar* ptr, int step, CvPoint pt, bool is_hole, int method, std::vector& points) { const char nbd = 2; int deltas[16]; @@ -119,7 +119,7 @@ static void icvFetchContour(schar* ptr, int step, CvPoint pt, bool is_hole, int /* initialize local state */ CV_INIT_3X3_DELTAS( deltas, step, 1); ::memcpy( deltas + 8, deltas, 8 * sizeof( deltas[0] )); - char *i0 = (ptr), *i1, *i3, *i4 = 0; + schar *i0 = (ptr), *i1, *i3, *i4 = 0; s_end = s = is_hole ? 0 : 4; do @@ -136,9 +136,7 @@ static void icvFetchContour(schar* ptr, int step, CvPoint pt, bool is_hole, int *i0 = (schar) (nbd | -128); if( method >= 0 ) { - Point _p; - _p.set(pt.x, pt.y); - points.push_back(_p); + points.push_back(pt); } } else @@ -172,9 +170,7 @@ static void icvFetchContour(schar* ptr, int step, CvPoint pt, bool is_hole, int { if( s != prev_s || method == 1 ) { - Point _p; - _p.set(pt.x, pt.y); - points.push_back(_p); + points.push_back(pt); prev_s = s; } @@ -192,7 +188,7 @@ static void icvFetchContour(schar* ptr, int step, CvPoint pt, bool is_hole, int } } -static bool cvFindNextContour(CvContourScanner scanner, POINTS& points) +static bool cvFindNextContour(CvContourScanner scanner, std::vector& points) { /* initialize local state */ schar* img0 = scanner->img0; @@ -368,7 +364,11 @@ static int Sklansky_( Point_<_Tp>** array, int start, int end, int* stack, int n enum { CALIPERS_MAXHEIGHT=0, CALIPERS_MINAREARECT=1, CALIPERS_MAXDIST=2 }; static void rotatingCalipers( const Point2f* points, int n, int mode, float* out ) { +#ifdef _MSC_VER + float minarea = FLT_MAX; +#else float minarea = __FLT_MAX__; +#endif float max_dist = 0; char buffer[32] = {}; int i, k; @@ -2058,40 +2058,61 @@ LabelT LabelingGrana(VARP img, VARP& imgLabels, int connectivity, CCStatsOp& sop } /*Copy From OpenCV End*/ -std::vector findContours(VARP image, int mode, int method, Point offset) { +std::vector findContours(VARP image, int mode, int method, Point offset) { if (method > CHAIN_APPROX_SIMPLE) { MNN_ERROR("findContours: just support method = [CHAIN_APPROX_NONE, CHAIN_APPROX_SIMPLE]."); } auto img = _Clone(image, true); CvPoint off((int)offset.fX, (int)offset.fY); auto info = cvStartFindContours(img, off, mode, method); - POINTS points; - std::vector contours; + std::vector points; + std::vector contours; while (cvFindNextContour(info, points)) { - contours.emplace_back(std::move(points)); + auto ptr = reinterpret_cast(points.data()); + contours.push_back(_Const(ptr, {static_cast(points.size()), 1, 2}, NHWC, halide_type_of())); + points.clear(); } // same to opencv std::reverse(contours.begin(), contours.end()); delete info; return contours; } -double contourArea(std::vector _contour, bool oriented) { - int npoints = _contour.size(); +double contourArea(VARP _contour, bool oriented) { + auto info = _contour->getInfo(); + int npoints = info->size / 2; if (!npoints) return 0; + bool is_float = info->type == halide_type_of(); + bool is_int = info->type == halide_type_of(); + MNN_ASSERT(is_float || is_int); double a00 = 0; - auto prev = _contour.back(); - for(int i = 0; i < npoints; i++) { - auto p = _contour[i]; - a00 += (double)prev.fX * p.fY - (double)prev.fY * p.fX; - prev = p; + float prevx, prevy; + if (is_float) { + auto ptr = _contour->readMap(); + prevx = ptr[npoints * 2 - 2], prevy = ptr[npoints * 2 - 1]; + for(int i = 0; i < npoints; i++) { + auto x = ptr[i * 2], y = ptr[i * 2 + 1]; + a00 += (double)prevx * y - (double)prevy * x; + prevx = x, prevy = y; + } + 
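+ // Shoelace accumulation: a00 sums prev.x*y - prev.y*x over the closed contour; halving it below yields the signed area.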
} else { + auto ptr = _contour->readMap(); + prevx = ptr[npoints * 2 - 2], prevy = ptr[npoints * 2 - 1]; + for(int i = 0; i < npoints; i++) { + float x = ptr[i * 2], y = ptr[i * 2 + 1]; + a00 += (double)prevx * y - (double)prevy * x; + prevx = x, prevy = y; + } } + a00 *= 0.5; if(!oriented) a00 = fabs(a00); return a00; } -std::vector convexHull(std::vector points, bool clockwise, bool returnPoints) { - int i, total = points.size(), nout = 0; +std::vector convexHull(VARP points, bool clockwise, bool returnPoints) { + auto info = points->getInfo(); + auto pointPtr = points->readMap(); + int i, total = info->size / 2, nout = 0; int miny_ind = 0, maxy_ind = 0; std::vector _hull; if( total == 0 ) @@ -2105,8 +2126,8 @@ std::vector convexHull(std::vector points, bool clockwise, bool retu int* stack = _stack.data(); int* hullbuf = _hullbuf.data(); for( i = 0; i < total; i++ ) { - _points[i].x = (int)points[i].fX; - _points[i].y = (int)points[i].fY; + _points[i].x = pointPtr[i * 2 + 0]; + _points[i].y = pointPtr[i * 2 + 1]; pointer[i] = reinterpret_cast(&_points[i]); } Point2i* data0 = pointer[0]; @@ -2228,8 +2249,8 @@ std::vector convexHull(std::vector points, bool clockwise, bool retu if( returnPoints ) { _hull.resize(nout * 2); for (int i = 0; i < nout; i++) { - _hull[2 * i] = (int)points[_hullbuf[i]].fX; - _hull[2 * i + 1] = (int)points[_hullbuf[i]].fY; + _hull[2 * i] = pointPtr[_hullbuf[i] * 2]; + _hull[2 * i + 1] = pointPtr[_hullbuf[i] * 2 + 1]; } } else { _hull.resize(nout); @@ -2239,7 +2260,7 @@ std::vector convexHull(std::vector points, bool clockwise, bool retu } return _hull; } -RotatedRect minAreaRect(std::vector _points) { +RotatedRect minAreaRect(VARP _points) { auto hull = convexHull(_points); int n = hull.size() / 2; Point2f out[3]; @@ -2271,30 +2292,34 @@ RotatedRect minAreaRect(std::vector _points) { box.angle = (float)(box.angle*180/MNN_PI); return box; } -Rect2i boundingRect(POINTS points) { - int npoints = points.size(); - int xmin = 0, ymin = 0, xmax = -1, ymax = -1; +Rect2i boundingRect(VARP points) { + auto info = points->getInfo(); + int npoints = info->size / 2; if( npoints == 0 ) return Rect2i(); - Point pt = points[0]; - xmin = xmax = pt.fX; - ymin = ymax = pt.fY; + bool is_float = info->type == halide_type_of(); + bool is_int = info->type == halide_type_of(); + MNN_ASSERT(is_float || is_int); + int xmin = 0, ymin = 0, xmax = -1, ymax = -1; + auto iptr = points->readMap(); + auto fptr = points->readMap(); + xmin = xmax = is_float ? fptr[0] : iptr[0]; + ymin = ymax = is_float ? fptr[1] : iptr[1]; + for(int i = 1; i < npoints; i++) { + int x = is_float ? fptr[2 * i] : iptr[2 * i]; + int y = is_float ? 
fptr[2 * i + 1] : iptr[2 * i + 1]; - for( int i = 1; i < npoints; i++ ) - { - pt = points[i]; + if( xmin > x ) + xmin = x; - if( xmin > pt.fX ) - xmin = pt.fX; + if( xmax < x ) + xmax = x; - if( xmax < pt.fX ) - xmax = pt.fX; + if( ymin > y ) + ymin = y; - if( ymin > pt.fY ) - ymin = pt.fY; - - if( ymax < pt.fY ) - ymax = pt.fY; + if( ymax < y ) + ymax = y; } return Rect2i(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1); } @@ -2304,7 +2329,7 @@ int connectedComponentsWithStats(VARP image, VARP& labels, VARP& statsv, VARP& c return LabelingGrana(image, labels, connectivity, sop); } -POINTS boxPoints(RotatedRect box) { +VARP boxPoints(RotatedRect box) { std::vector pt(4); double _angle = box.angle*MNN_PI/180.; float b = (float)cos(_angle)*0.5f; @@ -2317,7 +2342,7 @@ POINTS boxPoints(RotatedRect box) { pt[2].fY = 2*box.center.y - pt[0].fY; pt[3].fX = 2*box.center.x - pt[1].fX; pt[3].fY = 2*box.center.y - pt[1].fY; - return pt; + return _Const(pt.data(), {4, 2}); } } // CV diff --git a/tools/cv/test/imgcodecs/codecs_test.cpp b/tools/cv/test/imgcodecs/codecs_test.cpp index 921f0139..4a76e776 100644 --- a/tools/cv/test/imgcodecs/codecs_test.cpp +++ b/tools/cv/test/imgcodecs/codecs_test.cpp @@ -11,7 +11,6 @@ #include "cv/imgcodecs.hpp" #include "test_env.hpp" -#define MNN_CODECS_TEST #ifdef MNN_CODECS_TEST static Env testEnv(img_name, false); diff --git a/tools/cv/test/imgproc/color_test.cpp b/tools/cv/test/imgproc/color_test.cpp index a8613011..66d47254 100644 --- a/tools/cv/test/imgproc/color_test.cpp +++ b/tools/cv/test/imgproc/color_test.cpp @@ -11,7 +11,6 @@ #include #include "test_env.hpp" -#define MNN_TEST_COLOR #ifdef MNN_TEST_COLOR static Env testEnv(img_name, false); diff --git a/tools/cv/test/imgproc/draw_test.cpp b/tools/cv/test/imgproc/draw_test.cpp index 471e56f2..f94e7f8d 100644 --- a/tools/cv/test/imgproc/draw_test.cpp +++ b/tools/cv/test/imgproc/draw_test.cpp @@ -11,36 +11,58 @@ #include "cv/imgcodecs.hpp" #include "test_env.hpp" -#define MNN_DRAW_TEST #ifdef MNN_DRAW_TEST static Env testEnv(img_name, false); -/* // arrowedLine TEST(arrowedLine, basic) { - cv::arrowedLine(testEnv.cvSrc, {10, 10}, {300, 200}, {0, 0, 255}); - arrowedLine(testEnv.mnnSrc, {10, 10}, {300, 200}, {0, 0, 255}); - // cv::imwrite("cv_line.jpg", testEnv.cvSrc); - // imwrite("mnn_line.jpg", testEnv.mnnSrc); - EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); -}*/ - -// line -TEST(line, basic) { - cv::line(testEnv.cvSrc, {10, 10}, {200, 300}, {0, 0, 255}, 1); - line(testEnv.mnnSrc, {10, 10}, {200, 300}, {0, 0, 255}, 1); + cv::arrowedLine(testEnv.cvSrc, {10, 10}, {300, 200}, {0, 0, 255}, 1); + arrowedLine(testEnv.mnnSrc, {10, 10}, {300, 200}, {0, 0, 255}, 1); EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); } -/* -TEST(line, thickness) { - cv::line(testEnv.cvSrc, {10, 10}, {20, 20}, {0, 0, 255}, 1); - line(testEnv.mnnSrc, {10, 10}, {20, 20}, {0, 0, 255}, 1); - cv::imwrite("cv_line.jpg", testEnv.cvSrc); - imwrite("mnn_line.jpg", testEnv.mnnSrc); +TEST(arrowedLine, thickness) { + cv::arrowedLine(testEnv.cvSrc, {10, 10}, {30, 20}, {0, 0, 255}, 5); + arrowedLine(testEnv.mnnSrc, {10, 10}, {30, 20}, {0, 0, 255}, 5); EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); -}*/ +} + +// circle +TEST(circle, basic) { + cv::circle(testEnv.cvSrc, {50, 50}, 10, {0, 0, 255}, 1); + circle(testEnv.mnnSrc, {50, 50}, 10, {0, 0, 255}, 1); + EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); +} + +TEST(circle, thickness) { + cv::circle(testEnv.cvSrc, {100, 100}, 10, {0, 0, 255}, 5); + 
circle(testEnv.mnnSrc, {100, 100}, 10, {0, 0, 255}, 5); + EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); +} + +TEST(circle, fill) { + cv::circle(testEnv.cvSrc, {150, 150}, 10, {0, 0, 255}, -1); + circle(testEnv.mnnSrc, {150, 150}, 10, {0, 0, 255}, -1); + EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); +} + +// line +TEST(line, basic) { + // cv::line(testEnv.cvSrc, {10, 10}, {200, 300}, {0, 0, 255}, 1); + // line(testEnv.mnnSrc, {10, 10}, {200, 300}, {0, 0, 255}, 1); + cv::line(testEnv.cvSrc, {10, 10}, {50, 50}, {0, 0, 255}, 1); + line(testEnv.mnnSrc, {10, 10}, {50, 50}, {0, 0, 255}, 1); + EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); +} + +TEST(line, thickness) { + cv::line(testEnv.cvSrc, {10, 10}, {20, 20}, {0, 0, 255}, 5); + line(testEnv.mnnSrc, {10, 10}, {20, 20}, {0, 0, 255}, 5); + // cv::imwrite("cv_line.jpg", testEnv.cvSrc); + // imwrite("mnn_line.jpg", testEnv.mnnSrc); + EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); +} // rectangle TEST(rectangle, basic) { @@ -48,5 +70,44 @@ TEST(rectangle, basic) { rectangle(testEnv.mnnSrc, {10, 10}, {200, 300}, {0, 0, 255}, 1); EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); } +// drawContours +TEST(drawContours, basic) { + cv::Mat gray, binary; + cv::cvtColor(testEnv.cvSrc, gray, cv::COLOR_BGR2GRAY); + cv::threshold(gray, binary, 127, 255, cv::THRESH_BINARY); + std::vector> cv_contours; + std::vector hierarchy; + cv::findContours(binary, cv_contours, hierarchy, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE); + cv::drawContours(testEnv.cvSrc, cv_contours, -1, {0, 0, 255}, -1); + std::vector> mnn_contours(cv_contours.size()); + for (int i = 0; i < cv_contours.size(); i++) { + for (int j = 0; j < cv_contours[i].size(); j++) { + Point p; + p.set(cv_contours[i][j].x, cv_contours[i][j].y); + mnn_contours[i].push_back(p); + } + } + drawContours(testEnv.mnnSrc, mnn_contours, -1, {0, 0, 255}, -1); + EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); +} +TEST(fillPoly, basic) { + cv::Mat gray, binary; + cv::cvtColor(testEnv.cvSrc, gray, cv::COLOR_BGR2GRAY); + cv::threshold(gray, binary, 127, 255, cv::THRESH_BINARY); + std::vector> cv_contours; + std::vector hierarchy; + cv::findContours(binary, cv_contours, hierarchy, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE); + cv::fillPoly(testEnv.cvSrc, cv_contours, {0, 0, 255}); + std::vector> mnn_contours(cv_contours.size()); + for (int i = 0; i < cv_contours.size(); i++) { + for (int j = 0; j < cv_contours[i].size(); j++) { + Point p; + p.set(cv_contours[i][j].x, cv_contours[i][j].y); + mnn_contours[i].push_back(p); + } + } + fillPoly(testEnv.mnnSrc, mnn_contours, {0, 0, 255}); + EXPECT_TRUE(testEnv.equal(testEnv.cvSrc, testEnv.mnnSrc)); +} #endif diff --git a/tools/cv/test/imgproc/filter_test.cpp b/tools/cv/test/imgproc/filter_test.cpp index a0d6b25c..f5f98785 100644 --- a/tools/cv/test/imgproc/filter_test.cpp +++ b/tools/cv/test/imgproc/filter_test.cpp @@ -10,7 +10,6 @@ #include #include "test_env.hpp" -#define MNN_TEST_FILTER #ifdef MNN_TEST_FILTER static Env testEnv(img_name, true); diff --git a/tools/cv/test/imgproc/geometric_test.cpp b/tools/cv/test/imgproc/geometric_test.cpp index 06b21c0a..811c33ac 100644 --- a/tools/cv/test/imgproc/geometric_test.cpp +++ b/tools/cv/test/imgproc/geometric_test.cpp @@ -10,7 +10,6 @@ #include #include "test_env.hpp" -#define MNN_GEOMETRIC_TEST #ifdef MNN_GEOMETRIC_TEST static Env testEnv(img_name, false); diff --git a/tools/cv/test/imgproc/miscellaneous_test.cpp b/tools/cv/test/imgproc/miscellaneous_test.cpp 
index e80e185a..a429d1cf 100644 --- a/tools/cv/test/imgproc/miscellaneous_test.cpp +++ b/tools/cv/test/imgproc/miscellaneous_test.cpp @@ -10,7 +10,6 @@ #include #include "test_env.hpp" -#define MNN_MISCELLANEOUS_TEST #ifdef MNN_MISCELLANEOUS_TEST static Env testEnv("img_name", true); diff --git a/tools/cv/test/imgproc/structral_test.cpp b/tools/cv/test/imgproc/structral_test.cpp index 1366a7cc..e633a279 100644 --- a/tools/cv/test/imgproc/structral_test.cpp +++ b/tools/cv/test/imgproc/structral_test.cpp @@ -10,7 +10,6 @@ #include #include "test_env.hpp" -#define MNN_STRUCTRAL_TEST #ifdef MNN_STRUCTRAL_TEST static Env testEnv(img_name, false); @@ -28,13 +27,14 @@ static std::vector img = { 0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0 }; -static void cmpContours(std::vector> x, std::vector> y) { +static void cmpContours(std::vector x, std::vector> y) { ASSERT_EQ(x.size(), y.size()); for (int i = 0; i < x.size(); i++) { - ASSERT_EQ(x[i].size(), y[i].size()); - for (int j = 0; j < x[i].size(); j++) { - ASSERT_EQ(x[i][j].fX, (float)y[i][j].x); - ASSERT_EQ(x[i][j].fY, (float)y[i][j].y); + ASSERT_EQ(x[i]->getInfo()->size / 2, y[i].size()); + auto ptr = x[i]->readMap(); + for (int j = 0; j < y[i].size(); j++) { + ASSERT_EQ(ptr[j * 2 + 0], y[i][j].x); + ASSERT_EQ(ptr[j * 2 + 1], y[i][j].y); } } } @@ -81,16 +81,8 @@ TEST(findContours, list_simple) { } TEST(contourArea, basic) { - std::vector cv_contour; - cv_contour.push_back(cv::Point2i(0, 0)); - cv_contour.push_back(cv::Point2i(10, 0)); - cv_contour.push_back(cv::Point2i(10, 10)); - cv_contour.push_back(cv::Point2i(5, 4)); - std::vector mnn_contour; - mnn_contour.push_back({0, 0}); - mnn_contour.push_back({10, 0}); - mnn_contour.push_back({10, 10}); - mnn_contour.push_back({5, 4}); + std::vector cv_contour = { {0, 0}, {10, 0}, {10, 10}, {5, 4}}; + VARP mnn_contour = _Const(cv_contour.data(), {4, 2}, NHWC, halide_type_of()); double x = contourArea(mnn_contour); double y = cv::contourArea(cv_contour); ASSERT_EQ(x, y); @@ -99,7 +91,7 @@ TEST(contourArea, basic) { #define TEST_POINTS { {0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3} } TEST(convexHull, indices) { std::vector cv_contour = TEST_POINTS; - std::vector mnn_contour = TEST_POINTS; + VARP mnn_contour = _Const(cv_contour.data(), {8, 2}, NHWC, halide_type_of()); auto x = convexHull(mnn_contour, false, false); std::vector y; cv::convexHull(cv_contour, y, false, false); @@ -107,7 +99,7 @@ TEST(convexHull, indices) { } TEST(convexHull, pointers) { std::vector cv_contour = TEST_POINTS; - std::vector mnn_contour = TEST_POINTS; + VARP mnn_contour = _Const(cv_contour.data(), {8, 2}, NHWC, halide_type_of()); auto x = convexHull(mnn_contour, false, true); cv::Mat y = cv::Mat(1, 4, CV_32S); cv::convexHull(cv_contour, y, false, true); @@ -117,7 +109,7 @@ TEST(convexHull, pointers) { } TEST(minAreaRect, basic) { std::vector cv_contour = TEST_POINTS; - std::vector mnn_contour = TEST_POINTS; + VARP mnn_contour = _Const(cv_contour.data(), {8, 2}, NHWC, halide_type_of()); auto x = minAreaRect(mnn_contour); auto y = cv::minAreaRect(cv_contour); ASSERT_NEAR(x.center.x, y.center.x, 1e-4); @@ -132,7 +124,7 @@ TEST(minAreaRect, basic) { } TEST(boundingRect, basic) { std::vector cv_contour = TEST_POINTS; - std::vector mnn_contour = TEST_POINTS; + VARP mnn_contour = _Const(cv_contour.data(), {8, 2}, NHWC, halide_type_of()); auto x = boundingRect(mnn_contour); auto y = cv::boundingRect(cv_contour); ASSERT_EQ(x.x, y.x); @@ -155,17 +147,20 @@ TEST(connectedComponentsWithStats, basic) { } 
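// The structural tests below feed MNN the same point set as OpenCV, converted to an {N, 2} integer VARP via _Const, since the API now takes and returns VARPs instead of POINTS.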
TEST(boxPoints, basic) { std::vector cv_contour = TEST_POINTS; - std::vector mnn_contour = TEST_POINTS; + VARP mnn_contour = _Const(cv_contour.data(), {8, 2}, NHWC, halide_type_of()); auto x = minAreaRect(mnn_contour); auto y = cv::minAreaRect(cv_contour); - auto mnn_points = boxPoints(x); + auto _mnn_points = boxPoints(x); cv::Mat _cv_points; cv::boxPoints(y, _cv_points); - auto ptr = reinterpret_cast(_cv_points.data); - std::vector cv_points(4); + auto cvptr = reinterpret_cast(_cv_points.data); + auto mnnptr = _mnn_points->readMap(); + std::vector cv_points(4), mnn_points(4); for (int i = 0; i < 4; i++) { - cv_points[i].fX = ptr[2 * i + 0]; - cv_points[i].fY = ptr[2 * i + 1]; + cv_points[i].fX = cvptr[2 * i + 0]; + cv_points[i].fY = cvptr[2 * i + 1]; + mnn_points[i].fX = mnnptr[2 * i + 0]; + mnn_points[i].fY = mnnptr[2 * i + 1]; } auto comp = [](Point p1, Point p2) { return p1.fX < p2.fX; }; std::sort(mnn_points.begin(), mnn_points.end(), comp); diff --git a/tools/cv/test/test_env.hpp b/tools/cv/test/test_env.hpp index 474c5c42..ee4c3257 100644 --- a/tools/cv/test/test_env.hpp +++ b/tools/cv/test/test_env.hpp @@ -9,6 +9,16 @@ #ifndef TEST_ENV_HPP #define TEST_ENV_HPP +// macro flags for module test +#define MNN_CODECS_TEST +#define MNN_TEST_COLOR +#define MNN_DRAW_TEST +#define MNN_TEST_FILTER +#define MNN_GEOMETRIC_TEST +#define MNN_MISCELLANEOUS_TEST +#define MNN_STRUCTRAL_TEST +#define MNN_DRAW_TEST + #include #include #include diff --git a/tools/quantization/calibration.cpp b/tools/quantization/calibration.cpp index c55de77c..8e76a05f 100644 --- a/tools/quantization/calibration.cpp +++ b/tools/quantization/calibration.cpp @@ -959,7 +959,7 @@ void Calibration::_quantizeModelEMA() { model->setIsTraining(false); exe->gc(Executor::PART); VARP forwardInput = nullptr; - if (originInfo != nullptr) { + if (originInfo != nullptr && originDims.size() > 0) { forwardInput = _Input(originDims, originFormat, originType); } else { if (_inputType == Helper::InputType::IMAGE) { diff --git a/tools/script/formatLicence.py b/tools/script/formatLicence.py index 3723589b..8da0bd64 100644 --- a/tools/script/formatLicence.py +++ b/tools/script/formatLicence.py @@ -13,7 +13,7 @@ ignore_files = [ "CPUFixedPoint.hpp", "OptimizedComputer.hpp", "OptimizedComputer.cpp", "AllShader.h", "AllShader.cpp", "VulkanShaderMap.cpp" ] -all_exts = [".c", ".cpp", ".h", ".hpp", ".m", ".mm", ".s", ".metal"] +all_exts = [".c", ".cpp", ".h", ".hpp", ".m", ".mm", ".s", ".metal", ".cuh", '.cu'] header_template = \ "//\n" + \ diff --git a/tools/script/modelTest.py b/tools/script/modelTest.py index 5cfda8ff..165e6a7a 100755 --- a/tools/script/modelTest.py +++ b/tools/script/modelTest.py @@ -18,7 +18,7 @@ if len(sys.argv) > 5: runStatic = True gWrong = [] -convert = './MNNConvert -f MNN --bizCode MNN --saveStaticModel --modelFile ' +convert = ('MNNConvert.exe' if os.name == 'nt' else './MNNConvert') + ' -f MNN --bizCode MNN --saveStaticModel --modelFile ' tmpModel = '__tmpModel__.mnn' dynamic_size = 0 static_size = 0 diff --git a/tools/train/source/nn/NN.cpp b/tools/train/source/nn/NN.cpp index deb9a2f8..560058b4 100644 --- a/tools/train/source/nn/NN.cpp +++ b/tools/train/source/nn/NN.cpp @@ -11,6 +11,7 @@ #include "module/PipelineModule.hpp" #include "module/WhileModule.hpp" #include "module/IfModule.hpp" +#include "module/NMSModule.hpp" #include "Initializer.hpp" #include "MNN_generated.h" #include "RandomGenerator.hpp" @@ -528,6 +529,9 @@ Module* NN::Utils::ExtractNotRunableOp(Express::EXPRP expr, const std::mapget()->type() == 
OpType_If) { return IfModule::create(expr->get(), subgraphs); } + if (expr->get()->type() == OpType_NonMaxSuppressionV2) { + return NMSModule::create(expr->get()); + } return nullptr; }
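The drawing entry points this patch adds (circle, line, rectangle, drawContours, fillPoly) all rasterize into a Region list and hand it to ImageProcess through doDraw. A minimal usage sketch of that public API, assuming the signatures declared in draw.hpp above, that imread's default flags load a 3-channel uint8 image, and a placeholder path input.jpg:

#include <vector>
#include "cv/imgcodecs.hpp"
#include "cv/imgproc/draw.hpp"

using namespace MNN::CV;

int main() {
    auto img = imread("input.jpg");                     // HWC uint8 VARP, BGR order
    line(img, {10, 10}, {200, 200}, {0, 0, 255}, 2);    // red line, thickness 2
    circle(img, {100, 100}, 30, {0, 255, 0}, -1);       // thickness < 0: filled circle
    rectangle(img, {50, 50}, {150, 120}, {255, 0, 0});  // blue 1px rectangle
    Point a, b, c;
    a.set(20, 180); b.set(80, 140); c.set(120, 190);
    fillPoly(img, {{a, b, c}}, {0, 255, 255});          // filled triangle
    return imwrite("output.jpg", img) ? 0 : 1;
}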