[Sync] Sync internal Gitlab

xiaying 2022-02-18 11:30:27 +08:00
parent c4d9566171
commit 0c718e552b
193 changed files with 9361 additions and 2733 deletions

View File

@ -24,9 +24,14 @@ add_definitions("-DMNN_VERSION_MAJOR=${MNN_VERSION_MAJOR}")
add_definitions("-DMNN_VERSION_MINOR=${MNN_VERSION_MINOR}")
add_definitions("-DMNN_VERSION_PATCH=${MNN_VERSION_PATCH}")
# CMP0048 is related to letting CMake manage the package version for us
cmake_policy(SET CMP0048 NEW)
# Clear VERSION variables when no VERSION is given to project()
if(POLICY CMP0048)
cmake_policy(SET CMP0048 NEW)
endif()
# MSVC runtime library flags are selected by an abstraction.
if(POLICY CMP0091)
cmake_policy(SET CMP0091 NEW)
endif()
project(MNN VERSION ${MNN_VERSION_MAJOR}.${MNN_VERSION_MINOR}.${MNN_VERSION_PATCH}.${MNN_VERSION_BUILD} LANGUAGES C CXX ASM)
# compiler options
set(CMAKE_C_STANDARD 99)
@ -35,14 +40,6 @@ set(CMAKE_MODULE_PATH
${CMAKE_MODULE_PATH}
"${CMAKE_CURRENT_LIST_DIR}/cmake"
)
#add_custom_command(OUTPUT "${CMAKE_CURRENT_LIST_DIR}/include/MNN/VCS.h"
# COMMAND ${CMAKE_COMMAND} "-DNAMES=MNN"
# "-DMNN_SOURCE_DIR=${CMAKE_CURRENT_LIST_DIR}"
# "-DHEADER_FILE=${CMAKE_CURRENT_LIST_DIR}/include/MNN/VCS.h"
# -P "${CMAKE_CURRENT_LIST_DIR}/cmake/GenerateVersionFromVCS.cmake"
# COMMENT "Generating Version Control Info"
#)
#add_custom_target (GenVCSHDR DEPENDS "${CMAKE_CURRENT_LIST_DIR}/include/MNN/VCS.h")
# Required for OpenCL/OpenGL/Vulkan CodeGen
include(FindPythonInterp REQUIRED)
# build options
@ -107,8 +104,8 @@ IF(WIN32)
SET(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
SET(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4819")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4819")
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4101")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275 /wd4101")
ENDIF()
ENDIF()
@ -118,13 +115,54 @@ IF( MNN_ENABLE_COVERAGE)
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fprofile-arcs -ftest-coverage")
ENDIF()
# Do this before protobuf to make sure the Windows CRT config of protobuf and MNN is the same
if(MSVC)
# Same as protobuf; otherwise the config would be inconsistent
if(CMAKE_VERSION VERSION_GREATER 3.15 OR CMAKE_VERSION VERSION_EQUAL 3.15)
set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded$<$<CONFIG:Debug>:Debug>)
if(NOT MNN_WIN_RUNTIME_MT)
set(CMAKE_MSVC_RUNTIME_LIBRARY ${CMAKE_MSVC_RUNTIME_LIBRARY}DLL)
endif()
else()
foreach(flag_var
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
if (MNN_WIN_RUNTIME_MT)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif()
else ()
if(${flag_var} MATCHES "/MT")
string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
endif()
endif ()
endforeach()
endif()
set(protobuf_BUILD_SHARED_LIBS ${MNN_BUILD_SHARED_LIBS})
endif()
include(${CMAKE_CURRENT_LIST_DIR}/cmake/macros.cmake)
IF(MNN_BUILD_PROTOBUFFER)
IF(MNN_BUILD_CONVERTER)
IF(MSVC)
set(protobuf_BUILD_SHARED_LIBS ${MNN_BUILD_SHARED_LIBS})
IF((NOT MNN_BUILD_SHARED_LIBS) AND (NOT MNN_WIN_RUNTIME_MT))
message(FATAL_ERROR "When MNN_BUILD_CONVERTER=ON and MNN_BUILD_SHARED_LIBS=OFF, MNN_WIN_RUNTIME_MT must be ON, because protobuf does not support the static /MD configuration")
ENDIF()
ENDIF()
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/3rd_party/protobuf/cmake)
ENDIF()
ENDIF()
# Specify the source file encoding explicitly to fix cross-platform garbled output
# We need to do this after protobuf, which sets a different execution charset
IF(MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /source-charset:utf-8")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /source-charset:utf-8")
ENDIF()
IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT MNN_BUILD_SHARED_LIBS AND NOT (MSVC OR WIN32))
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
SET(MNN_SEP_BUILD OFF CACHE BOOL "<docstring>" FORCE)
@ -206,26 +244,7 @@ message(STATUS "\tThreadPool: ${MNN_USE_THREAD_POOL}")
message(STATUS "\tHidden: ${MNN_HIDDEN}")
message(STATUS "\tBuild Path: ${CMAKE_CURRENT_BINARY_DIR}")
if(MSVC)
if(${CMAKE_VERSION} VERSION_LESS "3.14.0")
message(FATAL_ERROR "MNN requires CMake 3.14+ to build on Windows!")
endif()
foreach(flag_var
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
if (MNN_WIN_RUNTIME_MT)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif()
else ()
if(${flag_var} MATCHES "/MT")
string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
endif()
endif ()
endforeach()
elseif(CMAKE_SYSTEM_NAME MATCHES "^Android" OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
if(CMAKE_SYSTEM_NAME MATCHES "^Android" OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
add_definitions(-fPIC)
endif()
if(CMAKE_SYSTEM_NAME MATCHES "^Android")
@ -561,6 +580,9 @@ if (MNN_INTERNAL)
target_compile_options(MNN_Express PRIVATE -DMNN_INTERNAL_ENABLED)
include(${CMAKE_CURRENT_LIST_DIR}/source/internal/auth/CMakeLists.txt)
include(${CMAKE_CURRENT_LIST_DIR}/source/internal/logging/CMakeLists.txt)
if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
list(APPEND MNN_EXTRA_DEPENDS "-lcurl -lssl -lcrypto")
endif()
endif()
# Train
@ -661,7 +683,18 @@ if(APPLE)
endif()
add_dependencies(MNN MNNCore MNNCV MNNTransform MNNMath MNNCPU)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tools/converter)
IF(WIN32 AND MNN_BUILD_CONVERTER AND MNN_BUILD_SHARED_LIBS)
# Because of dllimport/dllexport, we merge MNN and MNNConvertDeps together; they depend on protobuf
target_link_libraries(MNN PUBLIC ${Protobuf_LIBRARIES})
ENDIF()
# Merge MNN/MNNExpress/MNNOpenCV and other backends into one .lib/.dll on Windows
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tools/cv)
IF(MNN_BUILD_OPENCV AND NOT MNN_SEP_BUILD)
IF(MSVC)
target_compile_definitions(MNNOpenCV PRIVATE "-DBUILDING_MNN_DLL" INTERFACE "-DUSING_MNN_DLL")
ENDIF()
target_sources(MNN PRIVATE $<TARGET_OBJECTS:MNNOpenCV>)
ENDIF()
if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
# Using -pthread, needed by the thread-safe implementation of glibc, is better than only using -lpthread
@ -753,6 +786,10 @@ ELSE()
ARCHIVE DESTINATION lib
FRAMEWORK DESTINATION /Library/Frameworks/
)
if (NOT MNN_AAPL_FMWK)
INSTALL(FILES ${MNN_PUB_HDRS} DESTINATION include/MNN/)
INSTALL(FILES ${MNN_EXPR_PUB_HDRS} DESTINATION include/MNN/expr/)
endif()
FOREACH(HDR ${MNN_EXPR_PUB_HDRS})
SET_SOURCE_FILES_PROPERTIES(${HDR} PROPERTIES MACOSX_PACKAGE_LOCATION Headers/expr/ )
ENDFOREACH()

View File

@ -59,7 +59,17 @@ Interpreter consists of Engine and Backends. The former is responsible for the l
Scan the following QR codes to join the DingTalk discussion groups. The group discussions are predominantly in Chinese, but English speakers are welcome and will be helped.
See https://www.yuque.com/mnn/cn/feedback for the DingTalk group QR codes.
Group #1 (Full):
<img src="doc/DingTalkQR1.png" height="256"/>
Group #2 (Full):
<img src="doc/DingTalkQR2.png" height="256"/>
Group #3:
<img src="doc/DingTalkQR3.png" height="256"/>
## License
Apache 2.0

View File

@ -56,7 +56,19 @@ Converter由Frontends和Graph Optimize构成。前者负责支持不同的训练
Interpreter consists of Engine and Backends. The former is responsible for model loading and computation-graph scheduling; the latter contains the memory allocation and Op implementations for each computing device. In Engine and Backends, MNN applies a variety of optimizations, including the Winograd algorithm for convolution and deconvolution, the Strassen algorithm for matrix multiplication, low-precision computation, Neon optimization, hand-written assembly, multi-threading, memory reuse, and heterogeneous computing.
## Community and Feedback
Scan the QR code to join the DingTalk discussion group: https://www.yuque.com/mnn/cn/feedback
Scan the QR codes to join the DingTalk discussion groups.
Group #1 (Full):
<img src="doc/DingTalkQR1.png" height="256"/>
Group #2 (Full):
<img src="doc/DingTalkQR2.png" height="256"/>
Group #3:
<img src="doc/DingTalkQR3.png" height="256"/>
## License
Apache 2.0

View File

@ -18,6 +18,10 @@ IF(MNN_SEP_BUILD)
add_library(MNN_Express SHARED ${MNN_EXPR_SRCS})
endif()
target_link_libraries(MNN_Express MNN)
install(TARGETS MNN_Express
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib
)
ELSE()
add_library(MNN_Express OBJECT ${MNN_EXPR_SRCS})
ENDIF()

View File

@ -536,6 +536,7 @@ ErrorCode Executor::ComputeCache::compute() {
if (mShapeDirty) {
auto code = resize();
if (NO_ERROR != code) {
mShapeDirty = true;
return code;
}
}

View File

@ -116,6 +116,9 @@ Variable::Info* Expr::outputInfo(int index) const {
void Expr::_addLinkForInputs(EXPRP expr) {
auto inputs = expr->inputs();
for (int i=0; i<inputs.size(); ++i) {
if (inputs[i].get() == nullptr) {
continue;
}
bool findEmpty = false;
auto inputExpr = inputs[i]->mFrom;
for (int j=0; j<inputExpr->mTo.size(); ++j) {
@ -290,6 +293,10 @@ bool Expr::requireInfo() {
}
for (int i = 0; i < mInputs.size(); ++i) {
auto& v = mInputs[i];
if (v->getInfo()->size == 0) {
// zero shape
continue;
}
if (mInside->mReq.shapeNeedContent[i]) {
// For ops whose shape computation needs the input content, that content must not be nullptr
auto ptr = v->readInternal(true);
@ -338,6 +345,9 @@ void Expr::replace(EXPRP old, EXPRP from) {
return;
}
for (auto input : old->inputs()) {
if (input.get() == nullptr) {
continue;
}
for (int j=0; j<input->mFrom->mTo.size(); ++j) {
auto ref = input->mFrom->mTo[j].lock();
if (ref.get() == old.get()) {
@ -346,6 +356,9 @@ void Expr::replace(EXPRP old, EXPRP from) {
}
}
for (auto input : from->inputs()) {
if (input.get() == nullptr) {
continue;
}
bool hasSet = false;
for (int j=0; j<input->mFrom->mTo.size(); ++j) {
auto ref = input->mFrom->mTo[j].lock();
@ -567,6 +580,9 @@ void Expr::visit(EXPRP expr, const std::function<bool(EXPRP)>& before, const std
return;
}
for (int i = 0; i < expr->inputs().size(); ++i) {
if (expr->inputs()[i].get() == nullptr) {
continue;
}
visit(expr->inputs()[i]->mFrom, before, after);
}
after(expr);
@ -721,6 +737,9 @@ void Expr::visitOutputs(const std::function<bool(EXPRP, int)>& visit) {
bool recurse = false;
auto inputs = expr->inputs();
for (int i=0; i<inputs.size(); ++i) {
if (inputs[i].get() == nullptr) {
continue;
}
if (inputs[i]->mFrom.get() == this) {
recurse = recurse || visit(expr, i);
}
@ -924,6 +943,10 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
op->name = expr->name();
op->inputIndexes.resize(expr->inputs().size());
for (int i = 0; i < op->inputIndexes.size(); ++i) {
if (expr->inputs()[i] == nullptr) {
op->inputIndexes[i] = -1;
continue;
}
auto inputExpr = expr->inputs()[i]->expr();
op->inputIndexes[i] = varIndexInfo[inputExpr.first] + inputExpr.second;
}

View File

@ -1119,6 +1119,14 @@ VARP _ScatterNd(VARP indices, VARP updates, VARP shape) {
return (Variable::create(Expr::create(std::move(op), {indices, updates, shape})));
}
VARP _ScatterNd(VARP indices, VARP updates, VARP shape, VARP input) {
std::unique_ptr<OpT> op(new OpT);
op->main.type = OpParameter_NONE;
op->type = OpType_ScatterNd;
op->main.value = nullptr;
return (Variable::create(Expr::create(std::move(op), {indices, updates, shape, input})));
}
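A minimal usage sketch of the new four-argument overload, assuming the extra input supplies the base tensor that updates are scattered onto (the three-argument form presumably scatters onto zeros); the concrete shapes and values below are illustrative only:
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

static VARP scatterOntoBaseExample() {
    // Rows 1 and 3 of the output are overwritten; indices shape is [2, 1]
    int idx[] = {1, 3};
    auto indices = _Const(idx, {2, 1}, NHWC, halide_type_of<int>());
    // Values written at those rows, shape [2]
    float upd[] = {9.f, 10.f};
    auto updates = _Const(upd, {2}, NHWC, halide_type_of<float>());
    // Output shape: a vector of length 6
    int dims[] = {6};
    auto shape = _Const(dims, {1}, NHWC, halide_type_of<int>());
    // Base tensor filled with ones; this is what the new overload scatters into
    auto base = _Fill(shape, _Scalar<float>(1.0f));
    auto out = _ScatterNd(indices, updates, shape, base);
    // Expected under the assumed semantics: 1 9 1 10 1 1
    return out;
}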
VARP _OneHot(VARP indices, VARP depth, VARP onValue, VARP offValue, int axis) {
std::unique_ptr<OpT> op(new OpT);
op->type = OpType_OneHot;

View File

@ -581,6 +581,22 @@ VARP _StridedSlice(VARP input, VARP begin, VARP end, VARP strided, int32_t begin
op->main.AsStridedSliceParam()->shrinkAxisMask = shrinkAxisMask;
return (Variable::create(Expr::create(op.get(), {input, begin, end, strided})));
}
VARP _StridedSliceWrite(VARP input, VARP begin, VARP end, VARP strided, VARP write, int32_t beginMask,
int32_t endMask, int32_t ellipsisMask, int32_t newAxisMask, int32_t shrinkAxisMask) {
std::unique_ptr<OpT> op(new OpT);
op->type = OpType_StridedSlice;
op->main.type = OpParameter_StridedSliceParam;
op->main.value = new StridedSliceParamT;
op->main.AsStridedSliceParam()->T = DataType_DT_FLOAT;
op->main.AsStridedSliceParam()->beginMask = beginMask;
op->main.AsStridedSliceParam()->endMask = endMask;
op->main.AsStridedSliceParam()->ellipsisMask = ellipsisMask;
op->main.AsStridedSliceParam()->newAxisMask = newAxisMask;
op->main.AsStridedSliceParam()->shrinkAxisMask = shrinkAxisMask;
return (Variable::create(Expr::create(op.get(), {input, begin, end, strided, write})));
}
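The exact semantics are not spelled out in this hunk, but the op looks analogous to a strided-slice assignment: the write operand replaces the slice of input selected by begin/end/strided. A hedged sketch with all masks left at zero (values illustrative, result shown only under that assumption):
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

static VARP stridedSliceWriteExample() {
    float in[] = {0.f, 1.f, 2.f, 3.f, 4.f};
    int   b[]  = {1};
    int   e[]  = {4};
    int   s[]  = {1};
    float w[]  = {9.f, 9.f, 9.f};
    auto input   = _Const(in, {5}, NHWC, halide_type_of<float>());
    auto begin   = _Const(b,  {1}, NHWC, halide_type_of<int>());
    auto end     = _Const(e,  {1}, NHWC, halide_type_of<int>());
    auto strided = _Const(s,  {1}, NHWC, halide_type_of<int>());
    auto write   = _Const(w,  {3}, NHWC, halide_type_of<float>());
    // Assumed result: {0, 9, 9, 9, 4} -- input with elements [1, 4) replaced by write
    return _StridedSliceWrite(input, begin, end, strided, write, 0, 0, 0, 0, 0);
}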
/*Transposes x.
Args:
x: A variable.
@ -1830,5 +1846,57 @@ VARP _Where(VARP x) {
return (Variable::create(Expr::create(std::move(op), {x})));
}
VARP _Sort(VARP x, int axis, bool arg, bool descend) {
std::unique_ptr<OpT> op(new OpT);
op->type = OpType_TopKV2;
op->main.type = OpParameter_TopKV2;
auto topk = new TopKV2T;
topk->largest = descend;
op->main.value = topk;
auto shape = x->getInfo()->dim;
axis = axis < 0 ? shape.size() + axis : axis;
int k = x->getInfo()->dim[axis];
std::vector<VARP> inputs {x, _Scalar(k)};
if (axis + 1 != shape.size()) {
inputs.push_back(_Scalar(axis));
}
auto expr = Expr::create(op.get(), inputs, 2);
return Variable::create(expr, arg);
}
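As the implementation above shows, _Sort is built on TopKV2 with k set to the full length of the sorted axis; descend maps to the largest flag, and arg selects output 1 of TopKV2 (the indices) instead of output 0 (the values). A small sketch, with values illustrative:
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

static void sortExample() {
    float data[] = {3.f, 1.f, 2.f};
    auto x = _Const(data, {3}, NHWC, halide_type_of<float>());
    auto valuesDesc = _Sort(x, -1, false, true);   // expected values: 3 2 1
    auto argAsc     = _Sort(x, -1, true,  false);  // expected indices: 1 2 0
    auto v = valuesDesc->readMap<float>();
    auto i = argAsc->readMap<int>();
    (void)v; (void)i;
}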
VARP _Raster(const std::vector<VARP>& vars, const std::vector<int>& region, const std::vector<int>& shape) {
std::unique_ptr<MNN::OpT> op(new MNN::OpT);
op->type = OpType_Raster;
auto extra = new ExtraT;
// set shape
std::unique_ptr<AttributeT> shapeAttr(new AttributeT);
shapeAttr->key = "shape";
shapeAttr->list.reset(new ListValueT);
shapeAttr->list->i = shape;
extra->attr.push_back(std::move(shapeAttr));
// set region
std::unique_ptr<AttributeT> regionAttr(new AttributeT);
regionAttr->key = "region";
regionAttr->list.reset(new ListValueT);
regionAttr->list->i = region;
extra->attr.push_back(std::move(regionAttr));
op->main.type = OpParameter_Extra;
op->main.value = extra;
return (Variable::create(Expr::create(std::move(op), vars)));
}
VARP _Nms(VARP boxes, VARP scores, int maxDetections, float iouThreshold, float scoreThreshold) {
std::unique_ptr<MNN::OpT> op(new MNN::OpT);
op->type = OpType_NonMaxSuppressionV2;
std::vector<VARP> vars {boxes, scores, _Scalar(maxDetections)};
if (iouThreshold >= 0) {
vars.push_back(_Scalar(iouThreshold));
}
if (scoreThreshold >= 0) {
vars.push_back(_Scalar(scoreThreshold));
}
return (Variable::create(Expr::create(std::move(op), vars)));
}
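_Nms is a thin wrapper over NonMaxSuppressionV2; negative thresholds mean "not set" and are simply not passed to the op. A sketch, assuming the usual (y1, x1, y2, x2) box layout and that the result holds the indices of the kept boxes:
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

static void nmsExample() {
    // Three boxes; box 1 heavily overlaps box 0, box 2 is disjoint
    float boxData[]   = {0, 0, 1, 1,   0, 0, 0.9f, 0.9f,   2, 2, 3, 3};
    float scoreData[] = {0.9f, 0.8f, 0.7f};
    auto boxes  = _Const(boxData,   {3, 4}, NHWC, halide_type_of<float>());
    auto scores = _Const(scoreData, {3},    NHWC, halide_type_of<float>());
    // Keep at most 3 boxes, suppress overlaps above IoU 0.5; scoreThreshold left at its -1 default
    auto kept = _Nms(boxes, scores, 3, 0.5f);
    auto idx  = kept->readMap<int>();  // expected under the assumed semantics: 0, 2
    (void)idx;
}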
} // namespace Express
} // namespace MNN

View File

@ -166,7 +166,8 @@ public:
return mModule->onForward(inputs);
}
virtual Module* clone(CloneContext* ctx) const override {
NetModule* module(new NetModule(mModule, mInfo));
std::shared_ptr<Module> submodule(mModule->clone(ctx));
NetModule* module(new NetModule(submodule, mInfo));
return this->cloneBaseTo(ctx, module);
}
const Module::Info* info() const {
@ -223,9 +224,9 @@ static void _loadInputs(Module::Info* info, const std::vector<std::string>& inpu
}
}
Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const std::shared_ptr<MNN::Express::Executor::RuntimeManager> rtMgr, const Module::Config* config) {
Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, const std::shared_ptr<MNN::Express::Executor::RuntimeManager> _rtMgr, const Module::Config* config) {
// Check if runtime is valid
if (nullptr != rtMgr && rtMgr->getRuntimeInfo().first.empty()) {
if (nullptr != _rtMgr && _rtMgr->getRuntimeInfo().first.empty()) {
MNN_ERROR("Invalid runtime\n");
return nullptr;
}
@ -269,6 +270,17 @@ Module* Module::load(const std::vector<std::string>& inputs, const std::vector<s
#endif // MNN_INTERNAL_ENABLED
std::shared_ptr<Info> info(new Info);
auto rtMgr = _rtMgr;
Module::Config defaultConfig;
if (nullptr == config) {
config = &defaultConfig;
}
if(nullptr == rtMgr && config->backend != nullptr) {
ScheduleConfig sche_config;
sche_config.type = config->backend->type;
sche_config.backendConfig = config->backend->config;
rtMgr.reset(Executor::RuntimeManager::createRuntimeManager(sche_config));
}
if ((!inputs.empty()) && (!outputs.empty())) {
_loadInputs(info.get(), inputs, net);
info->runTimeManager = rtMgr;
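With this change, a caller that passes no RuntimeManager but fills in config->backend now gets a RuntimeManager created implicitly from that backend description. A hedged sketch (the BackendInfo struct name, its fields, and the tensor names are assumptions inferred from the config->backend accesses above; the model path is hypothetical):
#include <MNN/MNNForwardType.h>
#include <MNN/expr/Module.hpp>
#include <memory>
using namespace MNN::Express;

std::shared_ptr<Module> loadOnCpu() {
    MNN::BackendConfig bnConfig;
    bnConfig.precision = MNN::BackendConfig::Precision_Low;
    Module::BackendInfo backendInfo;   // assumed nested struct, matching config->backend->type / ->config
    backendInfo.type   = MNN_FORWARD_CPU;
    backendInfo.config = &bnConfig;
    Module::Config config;
    config.backend = &backendInfo;     // no RuntimeManager passed, so one is built from this
    return std::shared_ptr<Module>(
        Module::load({"input"}, {"output"}, "model.mnn", &config));
}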

View File

@ -16,7 +16,7 @@ public:
// Do nothing
}
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
static NMSModule* create(const Op* op);
MNN_PUBLIC static NMSModule* create(const Op* op);
private:
NMSModule(){}

View File

@ -61,6 +61,7 @@ public:
/** edge wrapper */
Wrap wrap = CLAMP_TO_EDGE;
bool draw = false;
};
public:
@ -148,6 +149,18 @@ public:
void setPadding(uint8_t value) {
mPaddingValue = value;
}
/**
* @brief draw a color into regions of the image.
* @param img the image to draw on.
* @param w the image's width.
* @param h the image's height.
* @param c the image's channel count.
* @param regions the regions to draw; size is [num * 3], containing num triples of { y, xl, xr }.
* @param num the number of regions.
* @param color the color to draw.
* @return void.
*/
void draw(uint8_t* img, int w, int h, int c, const int* regions, int num, const uint8_t* color);
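A sketch of how the new draw interface might be called, following the comment above: each region is a { y, xl, xr } triple and color carries one value per channel. Whether xr is inclusive is not stated, so it is treated as exclusive here as an assumption, and the ImageProcess instance is taken as already created:
#include <MNN/ImageProcess.hpp>
#include <cstdint>
#include <vector>

// Fill rows 10..19 of a 64x64 RGB buffer with red, using an existing ImageProcess instance
void drawRedStrip(MNN::CV::ImageProcess* proc) {
    const int w = 64, h = 64, c = 3;
    std::vector<uint8_t> img(w * h * c, 0);
    std::vector<int> regions;
    for (int y = 10; y < 20; ++y) {
        regions.push_back(y);  // row index
        regions.push_back(0);  // xl: first column
        regions.push_back(w);  // xr: one past the last column (assumed exclusive)
    }
    const uint8_t red[3] = {255, 0, 0};
    proc->draw(img.data(), w, h, c, regions.data(), static_cast<int>(regions.size() / 3), red);
}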
private:
ImageProcess(const Config& config);
Matrix mTransform;

View File

@ -154,7 +154,7 @@ public:
* @param keySize deprecated, for future use.
*/
void setCacheFile(const char* cacheFile, size_t keySize = 128);
/**
* @brief The API should be called after the last resize of a session.
* If the resize generated new cache info, try to rewrite the cache file.
@ -357,6 +357,12 @@ public:
*/
const char* bizCode() const;
/**
* @brief get model UUID
* @return Model UUID.
*/
const char* uuid() const;
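Minimal usage of the new accessor alongside the existing bizCode() (the model path is hypothetical):
#include <MNN/Interpreter.hpp>
#include <cstdio>

void printModelIds() {
    auto net = MNN::Interpreter::createFromFile("model.mnn");  // hypothetical path
    if (nullptr != net) {
        printf("bizCode: %s, uuid: %s\n", net->bizCode(), net->uuid());
        delete net;
    }
}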
private:
static Interpreter* createFromBufferInternal(Content* net);

View File

@ -70,7 +70,7 @@ public:
return mDebug.get();
}
struct Cache;
class RuntimeManager {
class MNN_PUBLIC RuntimeManager {
public:
~RuntimeManager();
/**

View File

@ -124,6 +124,7 @@ MNN_PUBLIC VARP _ArgMin(VARP input, int axis = 0);
MNN_PUBLIC VARP _BatchMatMul(VARP x, VARP y, bool adj_x = false, bool adj_y = false);
MNN_PUBLIC VARP _UnravelIndex(VARP indices, VARP dims);
MNN_PUBLIC VARP _ScatterNd(VARP indices, VARP updates, VARP shape);
MNN_PUBLIC VARP _ScatterNd(VARP indices, VARP updates, VARP shape, VARP input);
MNN_PUBLIC VARP _OneHot(VARP indices, VARP depth, VARP onValue, VARP offValue, int axis = -1);
MNN_PUBLIC VARP _BroadcastTo(VARP a, VARP shape);
MNN_PUBLIC VARP _LinSpace(VARP start, VARP stop, VARP num);

View File

@ -63,8 +63,11 @@ MNN_PUBLIC VARP _Softsign(VARP features);
MNN_PUBLIC std::vector<VARP> _Split(VARP value, INTS size_splits, int axis = 0);
MNN_PUBLIC VARP _Slice(VARP x, VARP starts, VARP sizes);
MNN_PUBLIC VARP _StridedSlice(VARP input, VARP begin, VARP end, VARP strided,
int32_t beginMask, int32_t endMask, int32_t ellipsisMask,
int32_t newAxisMask, int32_t shrinkAxisMask);
int32_t beginMask, int32_t endMask, int32_t ellipsisMask,
int32_t newAxisMask, int32_t shrinkAxisMask);
MNN_PUBLIC VARP _StridedSliceWrite(VARP input, VARP begin, VARP end, VARP strided, VARP write,
int32_t beginMask, int32_t endMask, int32_t ellipsisMask,
int32_t newAxisMask, int32_t shrinkAxisMask);
MNN_PUBLIC VARP _Concat(VARPS values, int axis);
MNN_PUBLIC VARP _Convert(VARP input, Dimensionformat format);
MNN_PUBLIC VARP _Transpose(VARP x, INTS perm);
@ -155,6 +158,9 @@ MNN_PUBLIC VARP _Select(VARP select, VARP input0, VARP input1);
MNN_PUBLIC std::vector<VARP> _TopKV2(VARP input0, VARP input1);
MNN_PUBLIC VARP _ImageProcess(VARP input, CV::ImageProcess::Config config, CV::Matrix matrix, int oh, int ow, int oc, int dtype, uint8_t padVal = 0);
MNN_PUBLIC VARP _Where(VARP x);
MNN_PUBLIC VARP _Sort(VARP x, int axis = -1, bool arg = false, bool descend = false);
MNN_PUBLIC VARP _Raster(const std::vector<VARP>& vars, const std::vector<int>& regions, const std::vector<int>& shape);
MNN_PUBLIC VARP _Nms(VARP boxes, VARP scores, int maxDetections, float iouThreshold = -1, float scoreThreshold = -1);
} // namespace Express
} // namespace MNN

View File

@ -21,13 +21,13 @@ done
rm -rf $path && mkdir -p $path
PACKAGE_PATH=$(realpath $path)
CMAKE_ARGS="-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON"
CMAKE_ARGS="-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_SEP_BUILD=OFF -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf pymnn_build && mkdir pymnn_build
pushd pymnn_build
cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j24
cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert MNNOpenCV -j24
popd
pushd pymnn/pip_package

View File

@ -19,25 +19,27 @@ while getopts "o:p:v:b" opt; do
esac
done
export MACOSX_DEPLOYMENT_TARGET=10.11
./schema/generate.sh
rm -rf $path && mkdir -p $path
PACKAGE_PATH=$(realpath $path)
CMAKE_ARGS="-DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_EXPR_SHAPE_EAGER=ON -DMNN_TRAIN_DEBUG=ON"
CMAKE_ARGS="-DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_EXPR_SHAPE_EAGER=ON -DMNN_TRAIN_DEBUG=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON"
if [ ! -z $opencl ]; then
CMAKE_ARGS="$CMAKE_ARGS -DMNN_OPENCL=ON"
fi
rm -rf pymnn_build && mkdir pymnn_build
pushd pymnn_build
cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert -j8
cmake $CMAKE_ARGS .. && make MNN MNNTrain MNNConvert MNNOpenCV -j8
popd
pushd pymnn/pip_package
echo -e "__version__ = '$mnn_version'" > MNN/version.py
rm -rf build && mkdir build
rm -rf dist && mkdir dist
if [ -z $python_versions ]; then
if [ -z "$python_versions" ]; then
python build_wheel.py --version $mnn_version
else
for env in $python_versions; do

View File

@ -1,66 +1,63 @@
# MNNPyBridge
# |-- Debug
# | |--- MD
# | |--- MT
# | |--- Static
# |
# |-- Release
# |--- MD
# |--- MT
# |--- Static
# |-- include
# |-- wrapper
# |-- test (Release + Dynamic + MD)
# |-- x64
# |-- x86
# |-- lib
# |-- x64
# | |-- (Debug/Release x Dynamic/Static x MD/MT)
# |
# |-- x86
# |-- (Debug/Release x Dynamic/Static x MD/MT)
Param(
[Parameter(Mandatory=$true)][String]$version,
[Parameter(Mandatory=$true)][String]$pyc_env,
[Parameter(Mandatory=$true)][String]$mnn_path,
[Parameter(Mandatory=$true)][String]$python_path,
[Parameter(Mandatory=$true)][String]$numpy_path,
[Parameter(Mandatory=$true)][String]$path,
[Switch]$train_api,
[Switch]$x86
)
# build process may failed because of lnk1181, but be success when run again
# Run expr, return if success, otherwise try again until try_times
function Retry([String]$expr, [Int]$try_times) {
$cnt = 0
do {
$cnt++
try {
Invoke-Expression $expr
return
} catch { }
} while($cnt -lt $try_times)
throw "Failed: $expr"
# Build according to cmake_cmd; exit 1 when any error occurs
function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja") {
Invoke-Expression $cmake_cmd
# The build may fail because of LNK1181 but succeed when run again
$try_times = 2
if ($LastExitCode -eq 0) {
For ($cnt = 0; $cnt -lt $try_times; $cnt++) {
try {
Invoke-Expression $ninja_cmd
if ($LastExitCode -eq 0) {
return
}
} catch {}
}
}
popd
exit 1
}
$erroractionpreference = "stop"
mkdir -p $path -ErrorAction Ignore
$PACKAGE_PATH = $(Resolve-Path $path).Path
$PACKAGE_LIB_PATH = "$PACKAGE_PATH\lib"
if ($x86) {
$PACKAGE_LIB_PATH = "$PACKAGE_LIB_PATH\x86"
} else {
$PACKAGE_LIB_PATH = "$PACKAGE_LIB_PATH\x64"
}
$MNN_PACKAGE_PATH = $(Resolve-Path $mnn_path).Path
pushd pymnn\3rd_party
Remove-Item MNN -Recurse -ErrorAction Ignore
mkdir -p MNN\lib
cp -r $MNN_PACKAGE_PATH\* MNN\lib
cp -r ..\..\include MNN
popd
$arch = $(If($x86) {"x86"} Else {"x64"})
$PACKAGE_LIB_PATH = "$PACKAGE_PATH/lib/$arch"
$TEST_TOOL_PATH = "$PACKAGE_PATH/test/$arch"
#clear and create package directory
powershell ./schema/generate.ps1
pushd $PACKAGE_PATH
Remove-Item include -Recurse -ErrorAction Ignore
Remove-Item wrapper -Recurse -ErrorAction Ignore
mkdir -p include
mkdir -p wrapper
mkdir -p $PACKAGE_LIB_PATH\Debug\MD -ErrorAction SilentlyContinue
mkdir -p $PACKAGE_LIB_PATH\Debug\MT -ErrorAction SilentlyContinue
mkdir -p $PACKAGE_LIB_PATH\Debug\Static -ErrorAction SilentlyContinue
mkdir -p $PACKAGE_LIB_PATH\Release\MD -ErrorAction SilentlyContinue
mkdir -p $PACKAGE_LIB_PATH\Release\MT -ErrorAction SilentlyContinue
mkdir -p $PACKAGE_LIB_PATH\Release\Static -ErrorAction SilentlyContinue
Remove-Item -Path include, wrapper -Recurse -ErrorAction Ignore
mkdir -p include, wrapper
popd
Remove-Item -Path $PACKAGE_LIB_PATH, $TEST_TOOL_PATH -Recurse -ErrorAction Ignore
mkdir -p $PACKAGE_LIB_PATH, $TEST_TOOL_PATH
pushd $PACKAGE_LIB_PATH
mkdir -p Debug\Dynamic\MD, Debug\Dynamic\MT, Debug\Static\MD, Debug\Static\MT, Release\Dynamic\MD, Release\Dynamic\MT, Release\Static\MD, Release\Static\MT
popd
# assume $PACKAGE_PATH exist
@ -71,8 +68,16 @@ cp -r pymnn\pip_package\MNN pymnn_pyc_tmp
pushd pymnn_pyc_tmp
Remove-Item MNN -Include __pycache__ -Recurse
pushd MNN
rm -r -force tools
(Get-Content __init__.py).replace('from . import tools', '') | Set-Content __init__.py
function Remove([String]$module) {
rm -r -force $module
(Get-Content __init__.py).replace("from . import $module", "") | Set-Content __init__.py
}
Remove "tools"
if (!$train_api) {
Remove "data"
Remove "optim"
}
popd
popd
conda activate $pyc_env
@ -83,59 +88,108 @@ Set-Content -Path pymnn_pyc_tmp\version.py -Value "__version__ = '$version'"
cp -r .\pymnn_pyc_tmp\* $PACKAGE_PATH\wrapper -Force
rm -r -force pymnn_pyc_tmp
$CMAKE_ARGS = "-DPYMNN_USE_ALINNPYTHON=ON -DPYMNN_RUNTIME_CHECK_VM=ON -DPYMNN_EXPR_API=ON -DPYMNN_NUMPY_USABLE=ON -DPYMNN_TRAIN_API=ON"
$mnn_path = $(Resolve-Path $mnn_path).Path
$python_path = $(Resolve-Path $python_path).Path
$numpy_path = $(Resolve-Path $numpy_path).Path
$CMAKE_ARGS = "-DPYMNN_USE_ALINNPYTHON=ON -DPYMNN_RUNTIME_CHECK_VM=ON -DPYMNN_EXPR_API=ON -DPYMNN_NUMPY_USABLE=ON -DPYMNN_BUILD_TEST=OFF"
if ($train_api) {
$CMAKE_ARGS = "$CMAKE_ARGS -DPYMNN_TRAIN_API=ON"
}
$CMAKE_ARGS = "$CMAKE_ARGS -Dmnn_path=$mnn_path -Dpython_path=$python_path -Dnumpy_path=$numpy_path"
Remove-Item pymnn_build -Recurse -ErrorAction Ignore
mkdir pymnn_build
pushd pymnn_build
##### Debug/MT ####
#Remove-Item CMakeCache.txt -ErrorAction Ignore
#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON ../pymnn"
#Retry "ninja" 2
#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\MT
#cp mnnpybridge.dll $PACKAGE_LIB_PATH\Debug\MT
#cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\MT
#rm mnnpybridge.*
function exist([String]$build_type, [String]$lib_type, [String]$crt_type) {
function _exist([String]$lib) {
$lib_dir = "$lib/lib/$arch/$build_type/$lib_type/$crt_type"
return $((Test-Path -Path $lib_dir) -and ((Get-ChildItem -Path "$lib_dir/*" -Include "*.lib").Count -ne 0))
}
return $((_exist $mnn_path) -and (_exist $python_path) -and (_exist $numpy_path))
}
##### Debug/MD ####
#Remove-Item CMakeCache.txt -ErrorAction Ignore
#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF ../pymnn"
#Retry "ninja" 2
#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\MD
#cp mnnpybridge.dll $PACKAGE_LIB_PATH\Debug\MD
#cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\MD
#rm mnnpybridge.*
function log([String]$msg) {
echo "================================"
echo "Build MNNPyBridge $msg"
echo "================================"
}
##### Debug/Static ####
#Remove-Item CMakeCache.txt -ErrorAction Ignore
#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn"
#Retry "ninja" 2
#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\Static
#rm mnnpybridge.*
##### Debug/Dynamic/MT ####
if (exist Debug Dynamic MT) {
log "Debug/Dynamic/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON ../pymnn"
cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\Dynamic\MT
rm mnnpybridge.*
}
##### Release/MT ####
#Remove-Item CMakeCache.txt -ErrorAction Ignore
#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON ../pymnn"
#Retry "ninja" 2
#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\MT
#cp mnnpybridge.dll $PACKAGE_LIB_PATH\Release\MT
#cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\MT
#rm mnnpybridge.*
##### Debug/Dynamic/MD ####
if (exist Debug Dynamic MD) {
log "Debug/Dynamic/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug ../pymnn"
cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Debug\Dynamic\MD
rm mnnpybridge.*
}
##### Release/MD ####
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF ../pymnn"
Retry "ninja" 2
cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\MD
cp mnnpybridge.dll $PACKAGE_LIB_PATH\Release\MD
cp mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\MD
rm mnnpybridge.*
##### Debug/Static/MT ####
if (exist Debug Static MT) {
log "Debug/Static/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn"
cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\Static\MT
rm mnnpybridge.*
}
##### Release/Static ####
#Remove-Item CMakeCache.txt -ErrorAction Ignore
#Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn"
#Retry "ninja" 2
#cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\Static
##### Debug/Static/MD ####
if (exist Debug Static MD) {
log "Debug/Static/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn"
cp mnnpybridge.lib $PACKAGE_LIB_PATH\Debug\Static\MD
rm mnnpybridge.*
}
##### Release/Dynamic/MT ####
if (exist Release Dynamic MT) {
log "Release/Dynamic/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON ../pymnn"
cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MT
rm mnnpybridge.*
}
##### Release/Dynamic/MD ####
if (exist Release Dynamic MD) {
log "Release/Dynamic/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release ../pymnn"
cp mnnpybridge.lib,mnnpybridge.dll,mnnpybridge.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MD
#cp mnnpybridge_test.exe $TEST_TOOL_PATH
#cp $mnn_path/lib/$arch/Release/MD/MNN.dll $TEST_TOOL_PATH
#cp $python_path/lib/$arch/Release/MD/python.dll $TEST_TOOL_PATH
#cp $numpy_path/lib/$arch/Release/MD/numpy_python.dll $TEST_TOOL_PATH
rm mnnpybridge.*
}
##### Release/Static/MT ####
if (exist Release Static MT) {
log "Release/Static/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn"
cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\Static\MT
rm mnnpybridge.*
}
##### Release/Static/MD ####
if (exist Release Static MD) {
log "Release/Static/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF ../pymnn"
cp mnnpybridge.lib $PACKAGE_LIB_PATH\Release\Static\MD
rm mnnpybridge.*
}
popd

View File

@ -1,49 +1,47 @@
# MNN
# |-- Debug
# | |--- MD
# | |--- MT
# | |--- Static
# |
# |-- Release
# |--- MD
# |--- MT
# |--- Static
# |-- include
# |-- lib
# |-- Debug
# | |--- Dynamic
# | | |--- MD
# | | |--- MT
# | |
# | |--- Static
# | |--- MD
# | |--- MT
# |
# |-- Release
# |--- Dynamic
# | |--- MD
# | |--- MT
# |
# |--- Static
# |--- MD
# |--- MT
#
Param(
[Parameter(Mandatory=$true)][String]$path,
[String]$backends
[String]$backends,
[Switch]$x86
)
# build process may failed because of lnk1181, but be success when run again
# Run expr, return if success, otherwise try again until try_times
function Retry([String]$expr, [Int]$try_times) {
$cnt = 0
do {
$cnt++
try {
Invoke-Expression $expr
return
} catch { }
} while($cnt -lt $try_times)
throw "Failed: $expr"
}
$erroractionpreference = "stop"
Remove-Item $path -Recurse -ErrorAction Ignore
mkdir -p $path
New-Item -Path $path -ItemType Directory -ErrorAction Ignore
$PACKAGE_PATH = $(Resolve-Path $path).Path
$PACKAGE_LIB_PATH = "$PACKAGE_PATH/lib/$(If ($x86) {"x86"} Else {"x64"})"
Remove-Item -Path $PACKAGE_LIB_PATH -Recurse -ErrorAction Ignore
mkdir -p $PACKAGE_LIB_PATH
#clear and create package directory
powershell ./schema/generate.ps1
pushd $PACKAGE_PATH
mkdir -p Debug\MD
mkdir -p Debug\MT
mkdir -p Debug\Static
mkdir -p Release\MD
mkdir -p Release\MT
mkdir -p Release\Static
Remove-Item -Path $PACKAGE_PATH/include -Recurse -ErrorAction Ignore
cp -r include $PACKAGE_PATH
cp -r tools/cv/include/cv $PACKAGE_PATH/include
pushd $PACKAGE_LIB_PATH
mkdir -p Debug\Dynamic\MD, Debug\Dynamic\MT, Debug\Static\MD, Debug\Static\MT, Release\Dynamic\MD, Release\Dynamic\MT, Release\Static\MD, Release\Static\MT
popd
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON"
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_AVX512=ON"
if ($backends -ne $null) {
Foreach ($backend in $backends.Split(",")) {
if ($backend -eq "opencl") {
@ -58,53 +56,83 @@ Remove-Item build -Recurse -ErrorAction Ignore
mkdir build
pushd build
##### Debug/MT ####
function log([String]$msg) {
echo "================================"
echo "Build MNN (CPU $backends) $msg"
echo "================================"
}
# Build according to cmake_cmd; exit 1 when any error occurs
function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja MNN") {
Invoke-Expression $cmake_cmd
# The build may fail because of LNK1181 but succeed when run again
$try_times = 2
if ($LastExitCode -eq 0) {
For ($cnt = 0; $cnt -lt $try_times; $cnt++) {
try {
Invoke-Expression $ninja_cmd
if ($LastExitCode -eq 0) {
return
}
} catch {}
}
}
popd
exit 1
}
##### Debug/Dynamic/MT ####
log "Debug/Dynamic/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON .."
Retry "ninja" 2
cp MNN.lib $PACKAGE_PATH\Debug\MT
cp MNN.dll $PACKAGE_PATH\Debug\MT
cp MNN.pdb $PACKAGE_PATH\Debug\MT
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON .."
cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Debug\Dynamic\MT
rm MNN.*
##### Debug/MD ####
##### Debug/Dynamic/MD ####
log "Debug/Dynamic/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF .."
Retry "ninja" 2
cp MNN.lib $PACKAGE_PATH\Debug\MD
cp MNN.dll $PACKAGE_PATH\Debug\MD
cp MNN.pdb $PACKAGE_PATH\Debug\MD
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF .."
cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Debug\Dynamic\MD
rm MNN.*
##### Debug/Static ####
##### Debug/Static/MT ####
log "Debug/Static/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .."
Retry "ninja" 2
cp MNN.lib $PACKAGE_PATH\Debug\Static
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF .."
cp MNN.lib $PACKAGE_LIB_PATH\Debug\Static\MT
rm MNN.*
##### Release/MT ####
##### Debug/Static/MD ####
log "Debug/Static/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON .."
Retry "ninja" 2
cp MNN.lib $PACKAGE_PATH\Release\MT
cp MNN.dll $PACKAGE_PATH\Release\MT
cp MNN.pdb $PACKAGE_PATH\Release\MT
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .."
cp MNN.lib $PACKAGE_LIB_PATH\Debug\Static\MD
rm MNN.*
##### Release/MD ####
##### Release/Dynamic/MT ####
log "Release/Dynamic/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF .."
Retry "ninja" 2
cp MNN.lib $PACKAGE_PATH\Release\MD
cp MNN.dll $PACKAGE_PATH\Release\MD
cp MNN.pdb $PACKAGE_PATH\Release\MD
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON .."
cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MT
rm MNN.*
##### Release/Static ####
##### Release/Dynamic/MD ####
log "Release/Dynamic/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .."
Retry "ninja" 2
cp MNN.lib $PACKAGE_PATH\Release\Static
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF .."
cp MNN.lib, MNN.dll, MNN.pdb $PACKAGE_LIB_PATH\Release\Dynamic\MD
rm MNN.*
##### Release/Static/MT ####
log "Release/Static/MT"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF .."
cp MNN.lib $PACKAGE_LIB_PATH\Release\Static\MT
##### Release/Static/MD ####
log "Release/Static/MD"
Remove-Item CMakeCache.txt -ErrorAction Ignore
Build "cmake -G Ninja $CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_BUILD_SHARED_LIBS=OFF .."
cp MNN.lib $PACKAGE_LIB_PATH\Release\Static\MD
popd

View File

@ -1,5 +1,6 @@
Param(
[Parameter(Mandatory=$true)][String]$path,
[Switch]$dynamic_link,
[String]$backends,
[Switch]$build_all,
[Switch]$build_train, # MNN_BUILD_TRAIN
@ -23,20 +24,6 @@ if ($build_all) {
$build_demo = $true
}
# build process may failed because of lnk1181, but be success when run again
# Run expr, return if success, otherwise try again until try_times
function Retry([String]$expr, [Int]$try_times) {
$cnt = 0
do {
$cnt++
try {
Invoke-Expression $expr
return
} catch { }
} while($cnt -lt $try_times)
throw "Failed: $expr"
}
$erroractionpreference = "stop"
Remove-Item $path -Recurse -ErrorAction Ignore
mkdir -p $path
@ -44,7 +31,12 @@ $TOOLS_PATH = $(Resolve-Path $path).Path
powershell ./schema/generate.ps1
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_SHARED_LIBS=OFF"
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_AVX512=ON"
if ($dynamic_link) {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_SHARED_LIBS=ON"
} else {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON"
}
if ($build_train) {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_TRAIN=ON"
}
@ -59,6 +51,11 @@ if ($build_evaluation) {
}
if ($build_converter) {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_CONVERTER=ON"
if ($dynamic_link) {
$CMAKE_ARGS = "$CMAKE_ARGS -Dprotobuf_BUILD_SHARED_LIBS=ON"
} else {
$CMAKE_ARGS = "$CMAKE_ARGS -Dprotobuf_BUILD_SHARED_LIBS=OFF"
}
}
if ($build_benchmark) {
$CMAKE_ARGS = "$CMAKE_ARGS -DMNN_BUILD_BENCHMARK=ON"
@ -83,37 +80,37 @@ Remove-Item build -Recurse -ErrorAction Ignore
mkdir build
pushd build
# Build according to cmake_cmd; exit 1 when any error occurs
function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja") {
Invoke-Expression $cmake_cmd
# The build may fail because of LNK1181 but succeed when run again
$try_times = 2
if ($LastExitCode -eq 0) {
For ($cnt = 0; $cnt -lt $try_times; $cnt++) {
try {
Invoke-Expression $ninja_cmd
if ($LastExitCode -eq 0) {
return
}
} catch {}
}
}
popd
exit 1
}
Remove-Item CMakeCache.txt -ErrorAction Ignore
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS .."
Retry "ninja" 2
Build "cmake -G Ninja $CMAKE_ARGS .."
$PRODUCTS = ""
if ($build_train) {
$PRODUCTS = "$PRODUCTS transformer.out.exe train.out.exe rawDataTransform.out.exe dataTransformer.out.exe runTrainDemo.out.exe"
}
if ($build_tools) {
$PRODUCTS = "$PRODUCTS MNNV2Basic.out.exe mobilenetTest.out.exe backendTest.out.exe testModel.out.exe testModelWithDescrisbe.out.exe getPerformance.out.exe checkInvalidValue.out.exe timeProfile.out.exe"
}
if ($build_quantools) {
$PRODUCTS = "$PRODUCTS quantized.out.exe quantized_model_optimize.out.exe"
}
if ($build_evaluation) {
$PRODUCTS = "$PRODUCTS classficationTopkEval.out.exe"
}
if ($build_converter) {
$PRODUCTS = "$PRODUCTS MNNDump2Json.exe MNNConvert.exe"
}
if ($build_benchmark) {
$PRODUCTS = "$PRODUCTS benchmark.out.exe benchmarkExprModels.out.exe"
}
if ($build_test) {
$PRODUCTS = "$PRODUCTS run_test.out.exe"
}
if ($build_demo) {
$PRODUCTS = "$PRODUCTS pictureRecognition.out.exe pictureRotate.out.exe multiPose.out.exe segment.out.exe expressDemo.out.exe transformerDemo.out.exe rasterDemo.out.exe"
$PRODUCTS = $(Get-ChildItem -Path . -Filter "*.exe" -Name)
if ($dynamic_link) {
$PRODUCTS = "$PRODUCTS MNN.dll"
if ($build_converter) {
$PRODUCTS = "$PRODUCTS ./3rd_party/protobuf/cmake/libprotobuf.dll"
}
}
Foreach ($PRODUCT in $PRODUCTS.Split(" ")) {
Foreach ($PRODUCT in $PRODUCTS.Trim().Split()) {
Invoke-Expression "cp $PRODUCT $TOOLS_PATH"
}

View File

@ -6,25 +6,28 @@ Param(
[Switch]$x86
)
# build process may failed because of lnk1181, but be success when run again
# Run expr, return if success, otherwise try again until try_times
function Retry([String]$expr, [Int]$try_times) {
$cnt = 0
do {
$cnt++
try {
Invoke-Expression $expr
return
} catch { }
} while($cnt -lt $try_times)
throw "Failed: $expr"
# Build according to cmake_cmd; exit 1 when any error occurs
function Build([String]$cmake_cmd, [String]$ninja_cmd = "ninja") {
Invoke-Expression $cmake_cmd
# The build may fail because of LNK1181 but succeed when run again
$try_times = 2
if ($LastExitCode -eq 0) {
For ($cnt = 0; $cnt -lt $try_times; $cnt++) {
try {
Invoke-Expression $ninja_cmd
if ($LastExitCode -eq 0) {
return
}
} catch {}
}
}
exit 1
}
$erroractionpreference = "stop"
$python_versions = $pyenvs.Split(",")
Remove-Item $path -Recurse -ErrorAction Ignore
mkdir -p $path
New-Item -Path $path -ItemType Directory -ErrorAction Ignore
$PACKAGE_PATH = $(Resolve-Path $path).Path
$ARGS = "--version $version"
if ($x86) {
@ -37,7 +40,7 @@ powershell ./schema/generate.ps1
Remove-Item pymnn_build -Recurse -ErrorAction Ignore
mkdir pymnn_build
pushd pymnn_build
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON "
$CMAKE_ARGS = "-DMNN_SEP_BUILD=OFF -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON -DMNN_AVX512=ON"
if ($backends -ne $null) {
Foreach($backend in $backends.Split(",")) {
if ($backend -eq "opencl") {
@ -47,8 +50,7 @@ if ($backends -ne $null) {
}
}
}
Invoke-Expression "cmake -G Ninja $CMAKE_ARGS .."
Retry "ninja MNN MNNTrain MNNConvert" 2
Build "cmake -G Ninja $CMAKE_ARGS .." "ninja MNN MNNTrain MNNConvert MNNOpenCV"
popd
pushd pymnn/pip_package
@ -59,12 +61,15 @@ mkdir dist
mkdir build
if ($pyenvs -eq $null) {
Retry "python build_wheel.py $ARGS" 2
Invoke-Expression "python build_wheel.py $ARGS"
} else {
Foreach ($env in $pyenvs.Split(",")) {
Invoke-Expression "conda activate $env"
Retry "python build_wheel.py $ARGS" 2
Invoke-Expression "conda deactivate"
Invoke-Expression "python build_wheel.py $ARGS"
conda deactivate
if ($LastExitCode -ne 0) {
exit 1
}
}
}

View File

@ -748,6 +748,7 @@
EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EBECA38924643D310062C7A3 /* Arm82Backend.cpp */; };
EBECA3A124643D4E0062C7A3 /* MNNAsmGlobal.h in Headers */ = {isa = PBXBuildFile; fileRef = EBECA3A024643D4E0062C7A3 /* MNNAsmGlobal.h */; };
EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */ = {isa = PBXBuildFile; fileRef = EBECA3A324643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; };
F41497D7278D8A21004A363A /* RuntimeAttr.hpp in Headers */ = {isa = PBXBuildFile; fileRef = F41497D6278D8A21004A363A /* RuntimeAttr.hpp */; };
F4FB5AD7274E6CC100EAF0C1 /* MNNAESCipher.h in Headers */ = {isa = PBXBuildFile; fileRef = F4FB5AAF274E6CC100EAF0C1 /* MNNAESCipher.h */; };
F4FB5AD8274E6CC100EAF0C1 /* ModelAuth.mm in Sources */ = {isa = PBXBuildFile; fileRef = F4FB5AB0274E6CC100EAF0C1 /* ModelAuth.mm */; };
F4FB5AD9274E6CC100EAF0C1 /* MNNAESCipher.m in Sources */ = {isa = PBXBuildFile; fileRef = F4FB5AB1274E6CC100EAF0C1 /* MNNAESCipher.m */; };
@ -1542,6 +1543,7 @@
EBECA38924643D310062C7A3 /* Arm82Backend.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Arm82Backend.cpp; path = ../arm82/Arm82Backend.cpp; sourceTree = "<group>"; };
EBECA3A024643D4E0062C7A3 /* MNNAsmGlobal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = MNNAsmGlobal.h; path = ../arm82/asm/MNNAsmGlobal.h; sourceTree = "<group>"; };
EBECA3A324643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNQuantizeFP16_UNIT4.S; path = ../arm82/asm/arm64/MNNQuantizeFP16_UNIT4.S; sourceTree = "<group>"; };
F41497D6278D8A21004A363A /* RuntimeAttr.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = RuntimeAttr.hpp; sourceTree = "<group>"; };
F4FB5AAF274E6CC100EAF0C1 /* MNNAESCipher.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MNNAESCipher.h; sourceTree = "<group>"; };
F4FB5AB0274E6CC100EAF0C1 /* ModelAuth.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = ModelAuth.mm; sourceTree = "<group>"; };
F4FB5AB1274E6CC100EAF0C1 /* MNNAESCipher.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = MNNAESCipher.m; sourceTree = "<group>"; };
@ -1679,6 +1681,7 @@
48593FB423A89B2F0069452A /* express */ = {
isa = PBXGroup;
children = (
F41497D6278D8A21004A363A /* RuntimeAttr.hpp */,
489D7AC42550FF9F00AD896A /* ExecutorScope.cpp */,
48C84B6F250F711600EE7666 /* module */,
48FA474C23AA136300172C3B /* MergeOptimizer.cpp */,
@ -2951,6 +2954,7 @@
92FF04B423AA0BFB00AC97F6 /* MNNMemoryUtils.h in Headers */,
4D9A937426255BDA00F9B43C /* CoreMLReduction.hpp in Headers */,
48C84B8B250F711700EE7666 /* PipelineModule.hpp in Headers */,
F41497D7278D8A21004A363A /* RuntimeAttr.hpp in Headers */,
92FF04C123AA0BFB00AC97F6 /* Backend.hpp in Headers */,
489D7A812550FDC900AD896A /* MetalPooling.hpp in Headers */,
92FF02A623AA0B5A00AC97F6 /* CPUQuantizedMaxPool.hpp in Headers */,
@ -2985,6 +2989,7 @@
buildConfigurationList = 0F1465BF1FA18D1000F9860A /* Build configuration list for PBXNativeTarget "MNN" */;
buildPhases = (
0F1465B41FA18D1000F9860A /* Headers */,
F48DED4627742886004B8DB0 /* ShellScript */,
0F1465B21FA18D1000F9860A /* Sources */,
0F1465B31FA18D1000F9860A /* Frameworks */,
0F1465B51FA18D1000F9860A /* Resources */,
@ -3091,6 +3096,23 @@
shellPath = /bin/sh;
shellScript = "\necho \"==========\"\necho ${TARGET_NAME}\necho ${PROJECT_FILE_PATH}\necho ${TARGET_BUILD_DIR}\n\ntouch ${TARGET_BUILD_DIR}/MNN.framework/mnn.metallib\ncp ${TARGET_BUILD_DIR}/MNN.framework/mnn.metallib ${TARGET_BUILD_DIR}/Playground.app/\n";
};
F48DED4627742886004B8DB0 /* ShellScript */ = {
isa = PBXShellScriptBuildPhase;
buildActionMask = 2147483647;
files = (
);
inputFileListPaths = (
);
inputPaths = (
);
outputFileListPaths = (
);
outputPaths = (
);
runOnlyForDeploymentPostprocessing = 0;
shellPath = /bin/sh;
shellScript = "# Type a script or drag a script file from your workspace to insert its path.\nMNN_REVISION=`git rev-parse HEAD`\necho \"#define MNN_REVISION \\\"${MNN_REVISION}\\\"\" > ${SRCROOT}/../../include/MNN/VCS.h\n";
};
/* End PBXShellScriptBuildPhase section */
/* Begin PBXSourcesBuildPhase section */
@ -3808,7 +3830,7 @@
CODE_SIGN_STYLE = Automatic;
DEAD_CODE_STRIPPING = YES;
DEFINES_MODULE = YES;
DEVELOPMENT_TEAM = 6T3QR3X696;
DEVELOPMENT_TEAM = UMNWSVYR5X;
DYLIB_COMPATIBILITY_VERSION = 1;
DYLIB_CURRENT_VERSION = 1;
DYLIB_INSTALL_NAME_BASE = "@rpath";
@ -3854,7 +3876,7 @@
METAL_LIBRARY_FILE_BASE = mnn;
ONLY_ACTIVE_ARCH = YES;
OTHER_CFLAGS = "";
PRODUCT_BUNDLE_IDENTIFIER = com.zhaodewang.MNN.yyavdsavds;
PRODUCT_BUNDLE_IDENTIFIER = com.alibaba.MNN;
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
PROVISIONING_PROFILE_SPECIFIER = "";
"PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = "";
@ -3875,7 +3897,7 @@
CODE_SIGN_STYLE = Automatic;
DEAD_CODE_STRIPPING = YES;
DEFINES_MODULE = YES;
DEVELOPMENT_TEAM = 6G7464HHUS;
DEVELOPMENT_TEAM = UMNWSVYR5X;
DYLIB_COMPATIBILITY_VERSION = 1;
DYLIB_CURRENT_VERSION = 1;
DYLIB_INSTALL_NAME_BASE = "@rpath";
@ -3919,7 +3941,7 @@
MACH_O_TYPE = staticlib;
METAL_LIBRARY_FILE_BASE = mnn;
OTHER_CFLAGS = "";
PRODUCT_BUNDLE_IDENTIFIER = com.zhaodewang.MNN.yyavdsavds;
PRODUCT_BUNDLE_IDENTIFIER = com.alibaba.MNN;
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
PROVISIONING_PROFILE_SPECIFIER = "";
"PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = "";
@ -3938,7 +3960,7 @@
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage;
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = 6G7464HHUS;
DEVELOPMENT_TEAM = UMNWSVYR5X;
GCC_ENABLE_CPP_EXCEPTIONS = NO;
GCC_ENABLE_CPP_RTTI = NO;
HEADER_SEARCH_PATHS = (
@ -3963,7 +3985,7 @@
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_LAUNCHIMAGE_NAME = LaunchImage;
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = 6G7464HHUS;
DEVELOPMENT_TEAM = UMNWSVYR5X;
GCC_ENABLE_CPP_EXCEPTIONS = NO;
GCC_ENABLE_CPP_RTTI = NO;
HEADER_SEARCH_PATHS = (

View File

@ -9,37 +9,50 @@
#import "AppDelegate.h"
#import "MNNTestSuite.h"
#include <MNN/MNNForwardType.h>
#include <MNN/Interpreter.hpp>
#import <MNN/expr/Executor.hpp>
#import "benchmark.h"
@implementation AppDelegate
- (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
#define UNITTEST
#ifdef UNITTEST
// unittest
{
MNN::BackendConfig config;
// If want to test metal, change MNN_FORWARD_CPU to MNN_FORWARD_METAL
MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 1);
int precisionInTestUtil =
getTestPrecision(MNN_FORWARD_CPU, config.precision, MNN::Express::Executor::getGlobalExecutor()->getCurrentRuntimeStatus(MNN::STATUS_SUPPORT_FP16));
MNNTestSuite::runAll(precisionInTestUtil);
}
#endif
#ifdef BENCHMARK
// benchmark
{
auto bundle = CFBundleGetMainBundle();
auto url = CFBundleCopyBundleURL(bundle);
auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle);
CFRelease(url);
auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8);
auto res = std::string(cstring) + "/models";
CFRelease(string);
iosBenchAll(res.c_str());
}
#endif
//#define UNITTEST
//#ifdef UNITTEST
// // unittest
// {
// MNN::BackendConfig config;
// // If want to test metal, change MNN_FORWARD_CPU to MNN_FORWARD_METAL
// MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 1);
// int precisionInTestUtil =
// getTestPrecision(MNN_FORWARD_CPU, config.precision, MNN::Express::Executor::getGlobalExecutor()->getCurrentRuntimeStatus(MNN::STATUS_SUPPORT_FP16));
// MNNTestSuite::runAll(precisionInTestUtil);
// }
//#endif
//#ifdef BENCHMARK
// // benchmark
// {
// auto bundle = CFBundleGetMainBundle();
// auto url = CFBundleCopyBundleURL(bundle);
// auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle);
// CFRelease(url);
// auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8);
// auto res = std::string(cstring) + "/models";
// CFRelease(string);
// iosBenchAll(res.c_str());
// }
//#endif
auto bundle = CFBundleGetMainBundle();
auto url = CFBundleCopyBundleURL(bundle);
auto string = CFURLCopyFileSystemPath(url, kCFURLPOSIXPathStyle);
CFRelease(url);
auto cstring = CFStringGetCStringPtr(string, kCFStringEncodingUTF8);
auto res = std::string(cstring) + "/models/mobilenet_v2_auth.mnn";
MNN::Interpreter* interpreter = MNN::Interpreter::createFromFile(res.c_str());
MNN::ScheduleConfig config;
interpreter->createSession(config);
return YES;
}

View File

@ -3,6 +3,7 @@
cmake_minimum_required(VERSION 3.4.1)
project(mnnpybridge)
# python_path / numpy_path / mnn_path
option(DEPEND_AAPL_FMWK "use dependency library .framework instead of traditional .a/.dylib" OFF)
option(MNN_BUILD_SHARED_LIBS "MNN build shared or static lib" ON)
option(MNN_WIN_RUNTIME_MT "MNN use /MT on Windows dll" OFF)
@ -12,8 +13,17 @@ option(PYMNN_NEW_PYTHON "AliNNPython new version (when PYMNN_RUNTIME_CHECK_VM=OF
option(PYMNN_EXPR_API "MNN expr API be exposed" ON)
option(PYMNN_NUMPY_USABLE "Build based on numpy" ON)
option(PYMNN_TRAIN_API "MNN train API be exposed" OFF)
option(PYMNN_INTERNAL_SERVING "Internal use only." OFF)
if(PYMNN_INTERNAL_SERVING)
file(GLOB_RECURSE SRC ${CMAKE_CURRENT_LIST_DIR}/src/MNN.cc
${CMAKE_CURRENT_LIST_DIR}/src/internal/monitor_service.cc
${CMAKE_CURRENT_LIST_DIR}/src/internal/verify_service.cc
${CMAKE_CURRENT_LIST_DIR}/src/internal/http_util.cc)
else()
file(GLOB_RECURSE SRC ${CMAKE_CURRENT_LIST_DIR}/src/MNN.cc)
endif()
file(GLOB_RECURSE SRC ${CMAKE_CURRENT_LIST_DIR}/src/MNN.cc)
if (MNN_BUILD_SHARED_LIBS)
add_library(mnnpybridge SHARED ${SRC})
else()
@ -39,6 +49,11 @@ if(PYMNN_TRAIN_API)
target_compile_definitions(mnnpybridge PRIVATE PYMNN_TRAIN_API)
endif()
if(PYMNN_INTERNAL_SERVING)
message(STATUS "mnnpybridge define PYMNN_INTERNAL_SERVING")
target_compile_definitions(mnnpybridge PRIVATE PYMNN_INTERNAL_SERVING)
endif()
if(CMAKE_SYSTEM_NAME MATCHES "^Android")
add_definitions(-DMNN_USE_LOGCAT)
endif()
@ -59,8 +74,8 @@ if(MSVC)
endif()
endif ()
endforeach()
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4005 /wd4267")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4005 /wd4267")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4005 /wd4267 /experimental:preprocessor")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4005 /wd4267 /experimental:preprocessor")
SET(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
SET(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
SET(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
@ -73,20 +88,24 @@ endif()
if(PYMNN_TRAIN_API)
set(MNN_DIR ${CMAKE_CURRENT_LIST_DIR}/..)
target_include_directories(mnnpybridge PRIVATE
${MNN_DIR}/tools/train/source/grad ${MNN_DIR}/tools/train/source/optimizer ${MNN_DIR}/tools/train/source/transformer
${MNN_DIR}/tools/train/source/data ${MNN_DIR}/schema/current ${MNN_DIR}/3rd_party/flatbuffers/include)
${MNN_DIR}/tools/train/source/grad ${MNN_DIR}/tools/train/source/optimizer ${MNN_DIR}/tools/train/source/transformer ${MNN_DIR}/tools/train/source/nn
${MNN_DIR}/tools/train/source/data ${MNN_DIR}/schema/current ${MNN_DIR}/3rd_party/flatbuffers/include ${MNN_DIR}/tools/cv/include
${MNN_DIR}/express ${MNN_DIR}/express/module ${MNN_DIR}/tools)
endif()
if(WIN32 OR APPLE OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
set(DEPEND_PATH "${CMAKE_CURRENT_LIST_DIR}/3rd_party")
set(LIB_SUBPATH "")
if(WIN32)
if(NOT MNN_BUILD_SHARED_LIBS)
set(LIB_SUBPATH "Static")
elseif(MNN_WIN_RUNTIME_MT)
set(LIB_SUBPATH "MT")
if (MNN_BUILD_SHARED_LIBS)
set(LIB_SUBPATH "Dynamic")
else()
set(LIB_SUBPATH "MD")
set(LIB_SUBPATH "Static")
endif()
if (MNN_WIN_RUNTIME_MT)
set(LIB_SUBPATH "${LIB_SUBPATH}/MT")
else()
set(LIB_SUBPATH "${LIB_SUBPATH}/MD")
endif()
elseif(APPLE)
if(MNN_BUILD_SHARED_LIBS)
@ -108,34 +127,23 @@ if(WIN32 OR APPLE OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
endif()
endif()
target_include_directories(mnnpybridge PRIVATE ${CMAKE_CURRENT_LIST_DIR}/src ${DEPEND_PATH}/MNN/include)
target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/MNN/lib/${LIB_SUBPATH})
if(APPLE AND DEPEND_AAPL_FMWK)
target_link_libraries(mnnpybridge PRIVATE "-framework MNN")
set_target_properties(mnnpybridge PROPERTIES LINK_FLAGS "-Wl,-F${DEPEND_PATH}/MNN/lib/${LIB_SUBPATH}")
else()
target_link_libraries(mnnpybridge PRIVATE MNN)
find_library(MNN NAMES MNN REQUIRED PATHS ${mnn_path}/lib/${LIB_SUBPATH})
if(NOT DEPEND_AAPL_FMWK)
target_include_directories(mnnpybridge PUBLIC ${mnn_path}/include)
endif()
target_link_libraries(mnnpybridge PUBLIC ${MNN})
if(PYMNN_USE_ALINNPYTHON)
target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/include)
target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/AliNNPython/lib/${LIB_SUBPATH})
if(APPLE AND DEPEND_AAPL_FMWK)
target_link_libraries(mnnpybridge PRIVATE "-framework python")
set_target_properties(mnnpybridge PROPERTIES LINK_FLAGS "-Wl,-F${DEPEND_PATH}/AliNNPython/lib/${LIB_SUBPATH}")
else()
target_link_libraries(mnnpybridge PRIVATE python)
endif()
find_library(python NAMES python REQUIRED PATHS ${python_path}/lib/${LIB_SUBPATH})
if(NOT DEPEND_AAPL_FMWK)
target_include_directories(mnnpybridge PUBLIC ${python_path}/include)
endif()
target_link_libraries(mnnpybridge PUBLIC ${python})
if(PYMNN_NUMPY_USABLE)
target_include_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/numpy/include)
target_link_directories(mnnpybridge PRIVATE ${DEPEND_PATH}/numpy/lib/${LIB_SUBPATH})
if(APPLE AND DEPEND_AAPL_FMWK)
target_link_libraries(mnnpybridge PRIVATE "-framework numpy_python")
set_target_properties(mnnpybridge PROPERTIES LINK_FLAGS "-Wl,-F${DEPEND_PATH}/numpy/lib/${LIB_SUBPATH}")
else()
target_link_libraries(mnnpybridge PRIVATE numpy_python)
find_library(numpy NAMES numpy_python REQUIRED PATHS ${numpy_path}/lib/${LIB_SUBPATH})
if(NOT DEPEND_AAPL_FMWK)
target_include_directories(mnnpybridge PUBLIC ${numpy_path}/include)
endif()
target_link_libraries(mnnpybridge PUBLIC ${numpy})
endif()
else()
target_include_directories(mnnpybridge PRIVATE ${MNN_DIR}/pymnn/src ${MNN_DIR}/pymnn/android/src/main/c/include)

View File

@ -13,7 +13,7 @@ def inference():
config['precision'] = 'low'
session = interpreter.createSession()
input_tensor = interpreter.getSessionInput(session)
image = cv2.imread('ILSVRC2012_val_00049999.JPEG')
image = cv2.imread('0000.jpg')
# cv2 reads images in BGR format
image = image[..., ::-1]
# convert to RGB format

View File

@ -1,11 +1,22 @@
from _mnncengine.cv import *
import _mnncengine.cv as _F
import MNN.numpy as _np
import MNN
def __to_int(x):
dtype = x.dtype
if dtype == _np.int32:
return x
return x.astype(_np.int32)
def resize(src, dsize=None, fx=None, fy=None, interpolation=INTER_LINEAR, code = None, mean=[], norm=[]):
if dsize is None and fx is None and fy is None:
        raise ValueError('resize must set dsize or fx, fy.')
if dsize is None: dsize = [0, 0]
if fx is None: fx = 0
if fy is None: fy = 0
if code is None: code = -1
else: code = hash(code)
return _F.resize(src, dsize, fx, fy, interpolation, code, mean, norm)
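# A minimal usage sketch (the file name is illustrative; imread assumes an
# imgcodecs-enabled build):
#   >>> import MNN.cv as cv
#   >>> img = cv.imread('input.jpg')
#   >>> out = cv.resize(img, [224, 224])             # resize to an explicit size
#   >>> out = cv.resize(img, None, fx=0.5, fy=0.5)   # or scale by factors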
def copyTo(src, mask=None, dst=None):
if mask is None: return src.copy()
origin_dtype = src.dtype
@ -45,3 +56,33 @@ def hconcat(src):
return _np.concatenate(src, 1)
def vconcat(src):
return _np.concatenate(src, 0)
def mean(src, mask=None):
if mask is not None:
src = copyTo(src, mask)
res = _np.mean(src, [0, 1])
if res.ndim == 0: size = 0
else: size = res.shape[0]
if size < 4:
res = _np.pad(res, [0, 4 - size])
return res
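# Like OpenCV's Scalar, the result is zero-padded to 4 values; a minimal sketch
# (img is assumed to be an HWC image Var, e.g. from cv.imread):
#   >>> m = cv.mean(img)    # 4-element Var of per-channel means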
def flip(src, flipCode):
h, w, c = src.shape
m = MNN.CVMatrix()
if flipCode < 0:
m.write([-1., 0., w-1., 0., -1., h-1.])
elif flipCode == 0:
m.write([1., 0., 0., 0., -1., h-1.])
else:
m.write([-1., 0., w-1., 0., 1., 0.])
return warpAffine(src, m, [w, h])
ROTATE_90_CLOCKWISE = 0
ROTATE_180 = 1
ROTATE_90_COUNTERCLOCKWISE = 2
def rotate(src, rotateMode):
if rotateMode == ROTATE_90_CLOCKWISE:
return flip(src.transpose([1, 0, 2]), 1)
if rotateMode == ROTATE_180:
return flip(src, -1)
if rotateMode == ROTATE_90_COUNTERCLOCKWISE:
return flip(src.transpose([1, 0, 2]), 0)
return src
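# A minimal sketch of the affine-based flip/rotate helpers above
# (img is assumed to be an HWC image Var, e.g. from cv.imread):
#   >>> flipped = cv.flip(img, 1)                         # flipCode > 0: horizontal flip
#   >>> rotated = cv.rotate(img, cv.ROTATE_90_CLOCKWISE)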

View File

@ -9,23 +9,26 @@ import _mnncengine._expr as _F
_numpy_supported = False
try:
import numpy as np
_numpy_supported = True
_numpy_supported = (type(np.arange(10)) == np.ndarray)
except Exception:
print ("Numpy not found. Using MNN without numpy.")
def scalar(value, dtype=None):
if dtype == _F.int:
value = _Int(value)
elif dtype == _F.float:
value = _Float(value)
if dtype is not None:
if dtype == _F.int or dtype == _F.uint8:
value = _Int(value)
elif dtype == _F.float:
value = _Float(value)
return _F.const([value], [], _F.NCHW, dtype)
if type(value) == type(1):
res = _F.const([value], [], _F.NCHW, _F.int)
return res
return _F.const([value], [], _F.NCHW, _F.int)
elif type(value) == type(1.):
res = _F.const([value], [], _F.NCHW, _F.float)
return res
return _F.const([value], [], _F.NCHW, _F.float)
else:
raise NotImplementedError("not supported data type for creating scalar variable")
def _list_shape_type(object, shape=()):
if isinstance(object, _Sequence) and len(object) == 0:
return [0], _F.float
if not isinstance(object, _Sequence):
if type(object) in (type(1), type(1<<64)):
dst_type = _F.int
@ -54,6 +57,7 @@ def _can_broadcast(src_shape, dst_shape):
return True
def _match_dtype(x, y, dtype=None):
def type_val(x):
if x is None: return -1
if x == _F.double: return 4
if x == _F.float: return 3
if x == _F.int64: return 2
@ -76,15 +80,18 @@ def _to_var(x, dtype=None):
return scalar(x, dtype)
# 2. numpy
if _numpy_supported:
if isinstance(x, np.ndarray): # convert numpy ndarray to MNN var
if x.dtype.kind == 'i':
x = x.astype(np.int32)
x = _F.const(x, x.shape, dtype=_F.int)
elif x.dtype.kine == 'f':
x = x.astype(np.float32)
x = _F.const(x, x.shape, dtype=_F.float)
else:
raise ValueError('Just support i/f dtype numpy.')
try:
if isinstance(x, np.ndarray): # convert numpy ndarray to MNN var
if x.dtype.kind == 'i':
x = x.astype(np.int32)
x = _F.const(x, x.shape, dtype=_F.int)
elif x.dtype.kind == 'f':
x = x.astype(np.float32)
x = _F.const(x, x.shape, dtype=_F.float)
else:
raise ValueError('Just support i/f dtype numpy.')
except:
pass
# 3. Sequence
if isinstance(x, _Sequence) and x:
dst_shape, item_type = _list_shape_type(x)
@ -202,7 +209,7 @@ def floor(x):
>>> expr.floor([-5.1, 4.5])
var([-6., 4.])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.floor(x)
def round(x):
'''
@ -223,7 +230,7 @@ def round(x):
>>> expr.round([-5.1, 4.5])
var([-5., 5.])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.round(x)
def ceil(x):
'''
@ -243,7 +250,7 @@ def ceil(x):
>>> expr.ceil([-4.9, 4.5])
var([-4., 5.])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.ceil(x)
def square(x):
'''
@ -283,7 +290,7 @@ def sqrt(x):
>>> expr.sqrt([9., 4.5])
var([3., 2.1213202])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.sqrt(x)
def rsqrt(x):
'''
@ -303,7 +310,7 @@ def rsqrt(x):
>>> expr.rsqrt([9., 4.5])
var([0.33333334, 0.47140455])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.rsqrt(x)
def exp(x):
'''
@ -323,7 +330,7 @@ def exp(x):
>>> expr.exp([9., 4.5])
var([8102.449, 90.01698])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.exp(x)
def log(x):
'''
@ -343,7 +350,7 @@ def log(x):
>>> expr.log([9., 4.5])
var([2.1972246, 1.5040774])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.log(x)
def sin(x):
'''
@ -363,7 +370,7 @@ def sin(x):
>>> expr.sin([9., 4.5])
var([0.4121185, -0.9775301])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.sin(x)
def sinh(x):
'''
@ -384,7 +391,7 @@ def sinh(x):
>>> expr.sinh([9., 4.5])
var([4051.542, 45.00301])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.sinh(x)
def cos(x):
'''
@ -404,7 +411,7 @@ def cos(x):
>>> expr.cos([9., 4.5])
var([-0.91113025, -0.2107958])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.cos(x)
def cosh(x):
'''
@ -425,7 +432,7 @@ def cosh(x):
>>> expr.cosh([9., 4.5])
var([4051.542, 45.014122])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.cosh(x)
def tan(x):
'''
@ -445,7 +452,7 @@ def tan(x):
>>> expr.tan([9., 4.5])
var([-0.45231566, 4.637332])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.tan(x)
def tanh(x):
'''
@ -466,7 +473,7 @@ def tanh(x):
>>> expr.tanh([9., 4.5])
var([1., 0.9997533])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.tanh(x)
def asin(x):
'''
@ -487,7 +494,7 @@ def asin(x):
>>> expr.asin([9., 0.5])
var([nan, 0.5235988])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.asin(x)
def asinh(x):
'''
@ -508,7 +515,7 @@ def asinh(x):
>>> expr.asinh([9., 0.5])
var([2.893444, 0.4812118])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.asinh(x)
def acos(x):
'''
@ -529,7 +536,7 @@ def acos(x):
>>> expr.asin([9., 0.5])
var([nan, 1.0471975])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.acos(x)
def acosh(x):
'''
@ -550,7 +557,7 @@ def acosh(x):
>>> expr.acosh([9., 0.5])
var([2.887271, nan])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.acosh(x)
def atan(x):
'''
@ -571,7 +578,7 @@ def atan(x):
>>> expr.atan([9., 0.5])
var([1.4601392, 0.4636476])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.atan(x)
def atanh(x):
'''
@ -592,7 +599,7 @@ def atanh(x):
>>> expr.atanh([9., 0.5])
var([1.4601392, 0.4636476])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.atanh(x)
def reciprocal(x):
'''
@ -612,7 +619,7 @@ def reciprocal(x):
>>> expr.reciprocal([9., 0.5])
var([0.11111111, 2.])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.reciprocal(x)
def log1p(x):
'''
@ -632,7 +639,7 @@ def log1p(x):
>>> expr.log1p([9., 0.5])
var([2.3025851, 0.4054651])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.log1p(x)
def gelu(x):
'''
@ -652,7 +659,7 @@ def gelu(x):
>>> expr.gelu([9., 0.5])
var([9., 0.345714])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.gelu(x)
def sigmoid(x):
'''
@ -672,16 +679,16 @@ def sigmoid(x):
>>> expr.sigmoid([9., 0.5])
var([0.9998766, 0.62246716])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.sigmoid(x)
def erf(x):
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.erf(x)
def erfc(x):
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.erfc(x)
def erfinv(x):
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.erfinv(x)
def expm1(x):
'''
@ -701,7 +708,7 @@ def expm1(x):
>>> expr.expm1([9., 0.5])
var([8.1014492e+03, 6.4869785e-01])
'''
x = _to_var(x)
x = _to_var(x, _F.float)
return _F.expm1(x)
def add(x, y):
'''
@ -1479,8 +1486,8 @@ def matmul(a, b, transposeA=False, transposeB=False):
var([[0., 1.],
[0., 3.]], dtype=float32)
'''
a = _to_var(a, True)
b = _to_var(b, True)
a = _to_var(a, _F.float)
b = _to_var(b, _F.float)
return _F.matmul(a, b, transposeA, transposeB)
def normalize(x, acrossSpatial, channelShared, eps, scale):
'''
@ -3055,7 +3062,7 @@ def zeros_like(x):
Example:
-------
>>> expr.zeros_like([[1, 2], [3, 4]])
array([[0, 0],
var([[0, 0],
[0, 0]], dtype=int32)
'''
x = _to_var(x)
@ -3078,14 +3085,72 @@ def range(start, limit, delta):
Example:
-------
>>> expr.range(1.0, 7.0, 2.0)
array([1., 3., 5.], dtype=float32)
var([1., 3., 5.], dtype=float32)
'''
start = _to_var(start)
limit = _to_var(limit)
delta = _to_var(delta)
if limit.dtype != start.dtype or delta.dtype != start.dtype:
print(start, limit, delta)
raise RuntimeError("parameter start/limit/delta must use same data type, either all int or all float")
return _F.range(start, limit, delta)
def sort(x, axis=-1, arg=False, descend=False):
'''
sort(x, axis=-1, arg=False, descend=False)
Return the sorted array of ``x``.
Parameters
----------
x : var_like, input value.
    axis : int, the axis along which to sort.
    arg : bool, return the sorting indices (argsort) instead of the sorted values; default is False.
    descend : bool, sort in descending order; default is False.
Returns
-------
sorted_res : Var.
Example:
-------
>>> expr.sort([[5, 0], [1, 3]])
var([[1, 0],
[5, 3]], dtype=int32)
'''
x = _to_var(x)
    # sort modifies its input, so operate on a copy
x = clone(x, True)
return _F.sort(x, axis, arg, descend)
def nms(boxes, scores, max_detections, iou_threshold=-1.0, score_threshold=-1.0):
'''
nms(boxes, scores, max_detections, iou_threshold=-1.0, score_threshold=-1.0)
Return the nms array of ``boxes``.
Parameters
----------
boxes : var_like, input value, shape must be [num, 4].
scores : var_like, input value, shape must be [num].
max_detections : int.
iou_threshold : float, default is 0.
score_threshold : float, default is float_min.
Returns
-------
nms_res : Var.
Example:
-------
>>> expr.nms([[1, 1, 4, 4], [0, 0, 3, 3], [5, 5, 7, 7]], [0.9, 0.5, 0.1], 3, 0.1)
var([0, 2], dtype=int32)
'''
boxes = _to_var(boxes, _F.float)
scores = _to_var(scores, _F.float)
max_detections = _to_int(max_detections)
iou_threshold = _to_float(iou_threshold)
score_threshold = _to_float(score_threshold)
res = _F.nms(boxes, scores, max_detections, iou_threshold, score_threshold)
idx = res >= 0
idx.fix_as_const()
if _F.reduce_any(idx).read_as_tuple()[0] == 0:
return _F.const([], [0], NCHW, _F.int)
return res[idx]
# TODO: detection_post_process
# wrapper for builtin functions end

View File

@ -19,6 +19,16 @@ inf = float('inf')
# helper functions
def __not_impl(*args):
    raise NotImplementedError('MNN.numpy does not implement this function yet.')
def __get_arg(kargs, key, default=None):
if key in kargs: return kargs[key]
return default
def __get_shape(args):
if type(args) not in (tuple, list):
return [args]
elif len(args) == 1 and type(args[0]) in (tuple, list):
return args[0]
else:
return args
def __order_assert(order):
if order is not None and order not in 'CK':
raise RuntimeError("MNN.numpy just support order=\"C|K\"")
@ -89,6 +99,7 @@ def identity(n, dtype=float32):
return eye(n, dtype=dtype)
def full(shape, fill_value, dtype=None, order='C'):
__order_assert(order)
shape = __get_shape(shape)
return _F.fill(_F._to_var(shape), _F.scalar(fill_value, dtype))
def full_like(a, fill_value, dtype=None, order='K', subok=True, shape=None):
dst_dtype, dst_shape = __array_like_type(a, dtype, order, shape)
@ -165,10 +176,14 @@ def __arange_3(start, stop, step=1, dtype=None):
def __arange_1(stop, dtype=None):
return __arange_3(0, stop, 1, dtype)
def arange(*args, **kargs):
if 'dtype' in kargs: dtype=kargs['dtype']
else: dtype = None
if len(args) == 1:
dtype = __get_arg(kargs, 'dtype')
step = __get_arg(kargs, 'step')
stop = __get_arg(kargs, 'stop')
start = __get_arg(kargs, 'start')
if len(args) == 1 and stop is None and step is None:
return __arange_1(args[0], dtype)
if len(args) == 2 and step is not None:
return __arange_3(*args, step=step, dtype=dtype)
if len(args) == 4:
return __arange_3(*args)
return __arange_3(*args, dtype=dtype)
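# A minimal sketch of the accepted call forms (np is MNN.numpy):
#   >>> np.arange(5)              # 0, 1, 2, 3, 4
#   >>> np.arange(1., 7., 2.)     # 1., 3., 5.
#   >>> np.arange(2, 10, step=2)  # 2, 4, 6, 8 (keyword step)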
@ -189,7 +204,26 @@ def geomspace(start, stop, num=50, endpoint=True, dtype=None, axis=0):
base = pow(stop / _F._Float(start), 1./ num)
start = math.log(start, base)
return logspace(start, _F._Float(num), num, endpoint, base, dtype, axis)
def meshgrid(xi, copy=True, sparse=False, indexing='xy'): __not_impl()
def meshgrid(*xi, **kwargs):
copy = __get_arg(kwargs, 'copy', True)
sparse = __get_arg(kwargs, 'sparse', False)
indexing = __get_arg(kwargs, 'indexing', 'xy')
ndim = len(xi)
if indexing not in ['xy', 'ij']:
raise ValueError("Valid values for `indexing` are 'xy' and 'ij'.")
s0 = (1,) * ndim
output = [asanyarray(x).reshape(s0[:i] + (-1,) + s0[i + 1:]) for i, x in enumerate(xi)]
if indexing == 'xy' and ndim > 1:
# switch first and second axis
output[0] = swapaxes(output[0], 0, 1)
output[1] = swapaxes(output[1], 0, 1)
if not sparse:
# Return the full N-D matrix (not only the 1-D vector)
output = broadcast_arrays(*output)
if copy:
output = [x.copy() for x in output]
return output
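# A minimal sketch, mirroring numpy.meshgrid (np is MNN.numpy):
#   >>> x, y = np.meshgrid(np.arange(3), np.arange(2))
#   >>> # with the default 'xy' indexing, x and y are both broadcast to shape (2, 3)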
# 4. Building matrices
def diag(v, k=0):__not_impl()
def diagflat(v, k=0):__not_impl()
@ -212,11 +246,11 @@ def copyto(dst, src, casting='same_kind', where=True):
def shape(a):
return tuple(a.shape)
# 2. Changing array shape
def reshape(a, newshape, order='C'):
__order_assert(order)
def reshape(a, *newshape):
newshape = __get_shape(newshape)
return _F.reshape(a, newshape)
def ravel(a, order='C'):
return reshape(a, [-1], order)
return reshape(a, [-1])
# 3. Transpose-like operations
def moveaxis(a, source, destination):
ndim = a.ndim
@ -431,7 +465,9 @@ right_shift = packbits = unpackbits = binary_repr = base_repr = __not_impl
# String operations [Not Impl]
# Indexing routines
# 1. Generating index arrays
def where(condition, x, y):
def where(condition, x=None, y=None):
if x is None and y is None:
return nonzero(condition)
return _F.select(condition, x, y)
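# A minimal sketch of both call forms (np is MNN.numpy; np.array is assumed to
# behave like its NumPy counterpart):
#   >>> a = np.array([0, 3, 0, 5])
#   >>> np.where(a)              # single argument: behaves like nonzero(a)
#   >>> np.where(a > 0, a, -a)   # three arguments: elementwise select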
def indices(dimensions, dtype=int32, sparse=False):__not_impl()
def ix_(*args):__not_impl()
@ -546,6 +582,7 @@ arccosh = _F.acosh
arctanh = _F.atanh
around = _F.round
round_ = _F.round
round = _F.round
rint = _F.round
fix = _F.round
floor = _F.floor
@ -685,9 +722,12 @@ def pad(array, pad_width, mode='constant'):
return _F.pad(array, pad_width, mode)
# Sorting, searching, and counting
# 1. Sorting
def sort(a, axis=- 1, kind=None, order=None):__not_impl()
def lexsort(keys, axis=-1):__not_impl()
def argsort(a, axis=-1, kind=None, order=None): __not_impl()
def sort(a, axis=- 1, kind=None, order=None):
return _F.sort(a, axis)
def lexsort(keys, axis=-1):
return sort(keys, axis)
def argsort(a, axis=-1, kind=None, order=None):
return _F.sort(a, axis, True)
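# A minimal sketch (np is MNN.numpy; np.array is assumed available):
#   >>> a = np.array([[5, 0], [1, 3]])
#   >>> np.sort(a)       # sorted values along the last axis
#   >>> np.argsort(a)    # indices that would sort along the last axis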
def msort(a): return sort(a, axis=0)
def sort_complex(a): __not_impl()
def partition(a, kth, axis=- 1, kind='introselect', order=None): __not_impl()
@ -704,6 +744,7 @@ def argwhere(a):
mask = not_equal(a, _F.scalar(0, a.dtype))
return _F.where(mask)
def nonzero(a):
res = _F.where(a)
res = argwhere(a)
if a.ndim == 1:
return (ravel(res),)
@ -762,6 +803,13 @@ corrcoef = correlate = cov = __not_impl
histogram = histogram2d = histogramdd = bincount = histogram_bin_edges = digitize = __not_impl
# numpy ndarray functions
def __item(self, idx):
if type(idx) == type(1):
return ravel(self)[idx]
elif type(idx) == tuple:
return self[idx]
else:
raise ValueError('item arg must be int or tuple.')
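# A minimal sketch of the item() accessor registered below (np is MNN.numpy;
# np.array is assumed available):
#   >>> v = np.array([[1, 2], [3, 4]])
#   >>> v.item(3)        # flat index -> 4
#   >>> v.item((1, 0))   # tuple index -> 3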
__override_operator(_F.Var, "all", all)
__override_operator(_F.Var, "any", any)
__override_operator(_F.Var, "argmax", argmax)
@ -793,6 +841,7 @@ __override_operator(_F.Var, "sum", sum)
__override_operator(_F.Var, "swapaxes", swapaxes)
__override_operator(_F.Var, "transpose", transpose)
__override_operator(_F.Var, "var", var)
__override_operator(_F.Var, "item", __item)
from . import random
from . import linalg

View File

@ -15,6 +15,10 @@ USE_TRT=False
if len(sys.argv) > 1 and sys.argv[1] == '-trt':
USE_TRT=True
IS_INTERNAL_BUILD = False
if os.path.isdir('../../schema/private'):
IS_INTERNAL_BUILD = True
def build_deps():
""" build depency """
root_dir = os.path.dirname(os.path.dirname(os.getcwd()))
@ -31,15 +35,16 @@ def build_deps():
elif IS_LINUX:
extra_opts = '-DMNN_TENSORRT=ON \
-DCMAKE_LIBRARY_PATH=/usr/local/cuda/lib64/stubs/ ' if USE_TRT else ' '
extra_opts += ' -DMNN_INTERNAL=ON ' if IS_INTERNAL_BUILD else ' '
os.system('cmake ' + extra_opts +
'-DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release\
-DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \
-DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF .. && make MNN MNNTrain MNNConvert MNNOpenCV -j4')
-DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF .. && make MNN MNNTrain MNNConvert -j4')
else:
os.system('cmake -DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TRAIN=ON -DCMAKE_BUILD_TYPE=Release\
-DMNN_BUILD_SHARED_LIBS=OFF -DMNN_AAPL_FMWK=OFF -DMNN_SEP_BUILD=OFF -DMNN_EXPR_SHAPE_EAGER=ON -DMNN_TRAIN_DEBUG=ON\
-DMNN_BUILD_OPENCV=ON -DMNN_IMGCODECS=ON \
.. && make MNN MNNTrain MNNConvert MNNOpenCV -j4')
.. && make MNN MNNTrain MNNConvert -j4')
################################################################################
# Building dependent libraries
################################################################################

View File

@ -8,6 +8,10 @@ parser.add_argument('--x86', dest='x86', action='store_true', default=False,
help='build wheel for 32bit arch, only usable on windows')
parser.add_argument('--version', dest='version', type=str, required=True,
help='MNN dist version')
parser.add_argument('--serving', dest='serving', action='store_true', default=False,
help='build for internal serving, default False')
parser.add_argument('--env', dest='env', type=str, required=False,
                    help='build environment, e.g. daily/pre/production')
args = parser.parse_args()
import os
@ -23,6 +27,8 @@ if __name__ == '__main__':
comm_args = '--version ' + args.version
if IS_LINUX:
comm_args += ' --plat-name=manylinux1_x86_64'
comm_args += ' --env ' + args.env if args.env else ''
comm_args += ' --serving' if args.serving else ''
if IS_WINDOWS:
os.putenv('DISTUTILS_USE_SDK', '1')
os.putenv('MSSdk', '1')

View File

@ -10,6 +10,10 @@ parser.add_argument('--x86', dest='x86', action='store_true', default=False,
help='build wheel for 32bit arch, only usable on windows')
parser.add_argument('--version', dest='version', type=str, required=True,
help='MNN dist version')
parser.add_argument('--serving', dest='serving', action='store_true', default=False,
help='build for internal serving, default False')
parser.add_argument('--env', dest='env', type=str, required=False,
                    help='build environment, e.g. daily/pre/production')
args, unknown = parser.parse_known_args()
sys.argv = [sys.argv[0]] + unknown
@ -27,7 +31,7 @@ IS_WINDOWS = (platform.system() == 'Windows')
IS_DARWIN = (platform.system() == 'Darwin')
IS_LINUX = (platform.system() == 'Linux')
BUILD_DIR = 'pymnn_build'
BUILD_TYPE = 'RELEASE'
BUILD_TYPE = 'REL_WITH_DEB_INFO'
BUILD_ARCH = 'x64'
if args.x86:
BUILD_ARCH = ''
@ -42,10 +46,12 @@ def report(*args):
package_name = 'MNN'
USE_TRT=check_env_flag('USE_TRT')
IS_INTERNAL_BUILD = False
print ("USE_TRT ", USE_TRT)
if os.path.isdir('../../schema/private'):
IS_INTERNAL_BUILD = True
if USE_TRT:
print("Build Internal NNN with TRT")
package_name = 'MNN_Internal_TRT'
@ -81,16 +87,19 @@ def configure_extension_build():
# extra_link_args = ['/NODEFAULTLIB:LIBCMT.LIB']
# /MD links against DLL runtime
# and matches the flags set for protobuf and ONNX
# /Z7 turns on symbolic debugging information in .obj files
    # /Zi turns on symbolic debugging information in a separate .pdb (which is the same as MNN.pdb)
# /EHa is about native C++ catch support for asynchronous
# structured exception handling (SEH)
# /DNOMINMAX removes builtin min/max functions
# /wdXXXX disables warning no. XXXX
extra_compile_args = ['/MT', '/Z7',
    # Some macros (related to __VA_ARGS__) defined in pymnn/src/util.h cannot be processed correctly
    # because of an MSVC bug; enabling /experimental:preprocessor fixes it (requires Windows SDK >= 10.0.18362.1)
extra_compile_args = ['/MT', '/Zi',
'/EHa', '/DNOMINMAX',
'/wd4267', '/wd4251', '/wd4522', '/wd4522', '/wd4838',
'/wd4305', '/wd4244', '/wd4190', '/wd4101', '/wd4996',
'/wd4275']
'/wd4275', '/experimental:preprocessor']
extra_link_args = []
else:
extra_link_args = []
extra_compile_args = [
@ -115,7 +124,11 @@ def configure_extension_build():
]
if check_env_flag('WERROR'):
extra_compile_args.append('-Werror')
extra_compile_args += ['-DPYMNN_EXPR_API', '-DPYMNN_NUMPY_USABLE', '-DPYMNN_OPENCV_API', '-DPYMNN_IMGCODECS']
if IS_LINUX and IS_INTERNAL_BUILD and args.serving:
extra_compile_args += ['-DPYMNN_INTERNAL_SERVING']
if args.env == 'daily':
extra_compile_args += ['-DPYMNN_INTERNAL_SERVING_DAILY']
root_dir = os.getenv('PROJECT_ROOT', os.path.dirname(os.path.dirname(os.getcwd())))
engine_compile_args = ['-DBUILD_OPTYPE', '-DPYMNN_TRAIN_API']
engine_libraries = []
@ -123,13 +136,21 @@ def configure_extension_build():
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "train")]
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "tools", "cv")]
engine_library_dirs += [os.path.join(root_dir, BUILD_DIR, "source", "backend", "tensorrt")]
print(engine_library_dirs)
if USE_TRT:
# Note: TensorRT-5.1.5.0/lib should be set in $LIBRARY_PATH of the build system.
engine_library_dirs += ['/usr/local/cuda/lib64/']
# Logging is enabled on Linux. Add the dependencies.
if IS_LINUX and IS_INTERNAL_BUILD:
engine_library_dirs += ['/usr/include/curl/']
print(engine_library_dirs)
engine_link_args = []
engine_sources = [os.path.join(root_dir, "pymnn", "src", "MNN.cc")]
if IS_LINUX and IS_INTERNAL_BUILD and args.serving:
engine_sources += [os.path.join(root_dir, "pymnn", "src", "internal", "monitor_service.cc")]
engine_sources += [os.path.join(root_dir, "pymnn", "src", "internal", "verify_service.cc")]
engine_sources += [os.path.join(root_dir, "pymnn", "src", "internal", "http_util.cc")]
engine_include_dirs = [os.path.join(root_dir, "include")]
engine_include_dirs += [os.path.join(root_dir, "express")]
engine_include_dirs += [os.path.join(root_dir, "express", "module")]
@ -146,13 +167,19 @@ def configure_extension_build():
engine_include_dirs += [os.path.join(root_dir, "schema", "current")]
engine_include_dirs += [os.path.join(root_dir, "3rd_party",\
"flatbuffers", "include")]
if IS_LINUX and IS_INTERNAL_BUILD and args.serving:
engine_include_dirs += [os.path.join(root_dir, "3rd_party", "rapidjson")]
# cv include
engine_include_dirs += [os.path.join(root_dir, "tools", "cv", "include")]
engine_include_dirs += [np.get_include()]
trt_depend = ['-lTRT_CUDA_PLUGIN', '-lnvinfer', '-lnvparsers', '-lnvinfer_plugin', '-lcudart']
engine_depend = ['-lMNN']
engine_depend = ['-lMNN', '-lMNNOpenCV']
# enable logging & model authentication on linux.
if IS_LINUX and IS_INTERNAL_BUILD:
engine_depend += ['-lcurl', '-lssl', '-lcrypto']
if USE_TRT:
engine_depend += trt_depend
@ -167,6 +194,9 @@ def configure_extension_build():
# Note: TensorRT-5.1.5.0/lib should be set in $LIBRARY_PATH of the build system.
tools_library_dirs += ['/usr/local/cuda/lib64/']
if IS_LINUX and IS_INTERNAL_BUILD:
tools_library_dirs += ['/usr/include/curl/']
tools_link_args = []
tools_sources = [os.path.join(root_dir, "pymnn", "src", "MNNTools.cc")]
tools_sources += [os.path.join(root_dir, "tools", "quantization",\
@ -195,61 +225,67 @@ def configure_extension_build():
tools_include_dirs += [os.path.join(root_dir, "source")]
tools_include_dirs += [np.get_include()]
tools_depend = ['-lMNN', '-lMNNConvertDeps', '-lprotobuf']
# enable logging and model authentication on linux.
if IS_LINUX and IS_INTERNAL_BUILD:
tools_depend += ['-lcurl', '-lssl', '-lcrypto']
if USE_TRT:
tools_depend += trt_depend
engine_extra_link_args = []
tools_extra_link_args = []
if IS_DARWIN:
engine_extra_link_args += ['-Wl,-all_load']
engine_extra_link_args += engine_depend
engine_extra_link_args += ['-Wl,-noall_load']
engine_link_args += ['-Wl,-all_load']
engine_link_args += engine_depend
engine_link_args += ['-Wl,-noall_load']
if IS_LINUX:
engine_extra_link_args += ['-Wl,--whole-archive']
engine_extra_link_args += engine_depend
engine_extra_link_args += ['-fopenmp']
engine_extra_link_args += ['-Wl,--no-whole-archive']
engine_link_args += ['-Wl,--whole-archive']
engine_link_args += engine_depend
engine_link_args += ['-fopenmp']
engine_link_args += ['-Wl,--no-whole-archive']
if IS_WINDOWS:
engine_extra_link_args += ['/WHOLEARCHIVE:MNN.lib']
engine_link_args += ['/WHOLEARCHIVE:MNN.lib']
if IS_DARWIN:
tools_extra_link_args += ['-Wl,-all_load']
tools_extra_link_args += tools_depend
tools_extra_link_args += ['-Wl,-noall_load']
tools_link_args += ['-Wl,-all_load']
tools_link_args += tools_depend
tools_link_args += ['-Wl,-noall_load']
if IS_LINUX:
tools_extra_link_args += ['-Wl,--whole-archive']
tools_extra_link_args += tools_depend
tools_extra_link_args += ['-fopenmp']
tools_extra_link_args += ['-Wl,--no-whole-archive']
tools_extra_link_args += ['-lz']
tools_link_args += ['-Wl,--whole-archive']
tools_link_args += tools_depend
tools_link_args += ['-fopenmp']
tools_link_args += ['-Wl,--no-whole-archive']
tools_link_args += ['-lz']
if IS_WINDOWS:
tools_extra_link_args += ['/WHOLEARCHIVE:MNN.lib']
tools_extra_link_args += ['/WHOLEARCHIVE:MNNConvertDeps.lib']
tools_link_args += ['/WHOLEARCHIVE:MNN.lib']
tools_link_args += ['/WHOLEARCHIVE:MNNConvertDeps.lib']
        tools_link_args += ['libprotobuf.lib'] # using /WHOLEARCHIVE here would cause LNK1241 (version.rc specified)
if BUILD_TYPE == 'DEBUG':
        # Need pythonxx_d.lib, which does not seem to exist in miniconda?
if IS_WINDOWS:
extra_link_args.append('/DEBUG:FULL')
extra_compile_args += ['/DEBUG', '/UNDEBUG', '/DDEBUG', '/Od', '/Ob0', '/MTd']
extra_link_args += ['/DEBUG', '/UNDEBUG', '/DDEBUG', '/Od', '/Ob0', '/MTd']
else:
extra_compile_args += ['-O0', '-g']
extra_link_args += ['-O0', '-g']
if BUILD_TYPE == 'REL_WITH_DEB_INFO':
if IS_WINDOWS:
extra_link_args.append('/DEBUG:FULL')
extra_compile_args += ['/DEBUG']
extra_link_args += ['/DEBUG', '/OPT:REF', '/OPT:ICF']
else:
extra_compile_args += ['-g']
extra_link_args += ['-g']
# compat with py39
def make_relative_rpath(path):
""" make rpath """
if IS_DARWIN:
return '-Wl,-rpath,@loader_path/' + path
return ['-Wl,-rpath,@loader_path/' + path]
elif IS_WINDOWS:
return ''
return []
else:
return '-Wl,-rpath,$ORIGIN/' + path
return ['-Wl,-rpath,$ORIGIN/' + path]
################################################################################
# Declare extensions and package
@ -263,8 +299,8 @@ def configure_extension_build():
extra_compile_args=engine_compile_args + extra_compile_args,\
include_dirs=engine_include_dirs,\
library_dirs=engine_library_dirs,\
extra_link_args=engine_extra_link_args + engine_link_args\
+ [make_relative_rpath('lib')])
extra_link_args=engine_link_args + extra_link_args\
+ make_relative_rpath('lib'))
extensions.append(engine)
tools = Extension("_tools",\
libraries=tools_libraries,\
@ -273,8 +309,8 @@ def configure_extension_build():
extra_compile_args=tools_compile_args + extra_compile_args,\
include_dirs=tools_include_dirs,\
library_dirs=tools_library_dirs,\
extra_link_args=tools_extra_link_args +tools_link_args\
+ [make_relative_rpath('lib')])
extra_link_args=tools_link_args + extra_link_args\
+ make_relative_rpath('lib'))
extensions.append(tools)
# These extensions are built by cmake and copied manually in build_extensions()
# inside the build_ext implementation

View File

@ -19,7 +19,9 @@ static int tls_key_2 = 0;
#include <MNN/expr/ExecutorScope.hpp>
#include <MNN/expr/Module.hpp>
using namespace MNN::Express;
#ifdef PYMNN_OPENCV_API
#include "cv/cv.hpp"
#endif
#endif // PYMNN_EXPR_API
#ifdef BUILD_OPTYPE
@ -64,6 +66,12 @@ using RegularizationMethod = ParameterOptimizer::RegularizationMethod;
#endif
#endif
#ifdef PYMNN_INTERNAL_SERVING
#include <MNN/AutoTime.hpp>
#include "internal/monitor_service.h"
#include "internal/verify_service.h"
#endif
struct MNN_TLSData {
PyObject *PyMNNHalideTypeInt = NULL;
PyObject *PyMNNHalideTypeInt64 = NULL;
@ -187,6 +195,10 @@ static PyObject* PyMNNInterpreter_new(struct _typeobject *type, PyObject *args,
static int PyMNNInterpreter_init(PyMNNInterpreter *self, PyObject *args, PyObject *kwds);
static void PyMNNInterpreter_dealloc(PyMNNInterpreter *);
#ifdef PYMNN_INTERNAL_SERVING
static PyObject* PyMNNInterpreter_createSessionWithToken(PyMNNInterpreter *self, PyObject *args);
#endif
static PyMethodDef PyMNNInterpreter_methods[] = {
{"createRuntime", (PyCFunction)PyMNNInterpreter_createRuntime, METH_VARARGS | METH_STATIC, "create runtime"},
{"createSession", (PyCFunction)PyMNNInterpreter_createSession, METH_VARARGS, "create session"},
@ -205,6 +217,9 @@ static PyMethodDef PyMNNInterpreter_methods[] = {
{"cache", (PyCFunction)PyMNNInterpreter_cache, METH_VARARGS, "cache current net instance"},
{"removeCache", (PyCFunction)PyMNNInterpreter_removeCache, METH_VARARGS, "remove cache with given path"},
{"updateSessionToModel", (PyCFunction)PyMNNInterpreter_updateSessionToModel, METH_VARARGS, "updateSessionToModel"},
#ifdef PYMNN_INTERNAL_SERVING
{"createSessionWithToken", (PyCFunction)PyMNNInterpreter_createSessionWithToken, METH_VARARGS, "create session with token"},
#endif
{NULL} /* Sentinel */
};
@ -681,13 +696,7 @@ static PyObject* PyMNNInterpreter_createRuntime(PyObject* self, PyObject* args)
return res;
}
static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject *args) {
PyMNNInterpreter* instance = (PyMNNInterpreter *)self;
PyObject* dict = NULL, *rtinfo_py = NULL;
if (!PyArg_ParseTuple(args, "|OO", &dict, &rtinfo_py)) {
return NULL;
}
static PyObject* createSession(PyMNNInterpreter *self, PyObject* dict, PyObject *rtinfo_py) {
PyObject *f = importName("MNN", "Session");
if (!f || !PyCallable_Check(f)) {
PyErr_SetString(PyExc_Exception,
@ -715,10 +724,10 @@ static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject
}
Session* s;
if (rtinfo_py == NULL) {
s = instance->interpreter->createSession(config.second.first);
s = self->interpreter->createSession(config.second.first);
} else {
auto runtimeinfo = *(RuntimeInfo*)PyCapsule_GetPointer(rtinfo_py, NULL);
s = instance->interpreter->createSession(config.second.first, runtimeinfo);
s = self->interpreter->createSession(config.second.first, runtimeinfo);
}
if (!s) {
PyErr_SetString(PyExc_Exception,
@ -727,11 +736,54 @@ static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject
}
session->session = s;
session->modelPath = instance->modelPath;
session->modelPath = self->modelPath;
return (PyObject *)session;
}
static PyObject* PyMNNInterpreter_createSession(PyMNNInterpreter *self, PyObject *args) {
#ifdef PYMNN_INTERNAL_SERVING
PyErr_SetString(PyExc_Exception,
"PyMNNInterpreter_createSession: unsupported interface, should use createSessionWithToken.");
return NULL;
#endif
PyMNNInterpreter* instance = (PyMNNInterpreter *)self;
PyObject* dict = NULL, *rtinfo_py = NULL;
if (!PyArg_ParseTuple(args, "|OO", &dict, &rtinfo_py)) {
return NULL;
}
return createSession(instance, dict, rtinfo_py);
}
#ifdef PYMNN_INTERNAL_SERVING
static PyObject* PyMNNInterpreter_createSessionWithToken(PyMNNInterpreter *self, PyObject *args) {
PyMNNInterpreter* instance = (PyMNNInterpreter *)self;
PyObject* dict = NULL, *rtinfo_py = NULL;
char *token = NULL;
char *scene = NULL;
char *app_key = NULL;
if (!PyArg_ParseTuple(args, "sss|OO", &token, &scene, &app_key, &dict, &rtinfo_py)) {
return NULL;
}
if (!token || !scene || !app_key) {
PyErr_SetString(PyExc_Exception,
"PyMNNInterpreter_createSessionWithToken: input invalid, token, scene or app_key is null.");
return NULL;
}
bool ret = VerifyService::GetInstance().VerifyToken(std::string(token), std::string(scene), std::string(app_key));
if (!ret) {
PyErr_SetString(PyExc_Exception,
"PyMNNNN_load_module_from_file_with_token: check token failed, return null session.");
return NULL;
}
return createSession(instance, dict, rtinfo_py);
}
#endif
static PyObject* PyMNNInterpreter_resizeSession(PyMNNInterpreter *self, PyObject *args) {
PyMNNSession* session = NULL;
if (!PyArg_ParseTuple(args, "O", &session)) {
@ -826,12 +878,27 @@ static PyObject* PyMNNInterpreter_runSession(PyMNNInterpreter *self, PyObject *a
}
ErrorCode r = NO_ERROR;
Py_BEGIN_ALLOW_THREADS
#ifdef PYMNN_INTERNAL_SERVING
Timer timer;
r = self->interpreter->runSession(session->session);
float cost_time = (float)timer.durationInUs() / (float)1000;
MNN::Interpreter::SessionInfoCode info_type = MNN::Interpreter::BACKENDS;
int backendType[MNN_FORWARD_ALL];
self->interpreter->getSessionInfo(session->session, info_type, backendType);
std::string mBizCode = self->interpreter->bizCode() ? self->interpreter->bizCode() : "";
std::string mUuid = self->interpreter->uuid() ? self->interpreter->uuid() : "";
MonitorService::GetInstance().Track(cost_time, std::to_string(*backendType), "RUN_SESSION",
"PyMNNInterpreter_runSession", std::to_string(r), mBizCode, mUuid);
#else
r = self->interpreter->runSession(session->session);
#endif
Py_END_ALLOW_THREADS
return PyLong_FromLong(r);
}
static PyMNNTensor* getTensor() {
PyMNNTensor *tensor = (PyMNNTensor *)PyObject_Call((PyObject*)&PyMNNTensorType, PyTuple_New(0), NULL);
PyMNNTensor *tensor = (PyMNNTensor *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNNTensorType), PyTuple_New(0), NULL);
if (tensor) {
tensor->tensor = nullptr;
}
@ -1222,6 +1289,12 @@ static int PyMNNInterpreter_init(PyMNNInterpreter *self, PyObject *args, PyObjec
return -1;
}
#ifdef PYMNN_INTERNAL_SERVING
    // initialize MonitorService and VerifyService
MonitorService::GetInstance().Start();
VerifyService::GetInstance().Start();
#endif
return 0;
}
@ -1315,7 +1388,7 @@ static PyObject* PyMNNSession_removeCache(PyMNNSession *self, PyObject *args) {
/// MNN Tensor implementation
bool isTensor(PyObject* t) {
return PyObject_IsInstance(t, (PyObject*)&PyMNNTensorType);
return PyObject_IsInstance(t, (PyObject*)PyType_FindTLSType(&PyMNNTensorType));
}
Tensor* toTensor(PyObject* t) {
return ((PyMNNTensor*)t)->tensor;
@ -1337,17 +1410,32 @@ static void PyMNNTensor_dealloc(PyMNNTensor *self) {
static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) {
int argc = PyTuple_Size(args);
PyObject *shape, *dataType, *data = nullptr, *input_tensor = nullptr;
long dimensionType;
PyObject *shape, *dataType, *data = nullptr, *input_tensor = nullptr, *input_var = nullptr;
long dimensionType = -1;
bool parse_res = false;
switch (argc) {
case 0:
            // just return; used in `PyMNNInterpreter_getSessionInputAll`
return 0;
#ifdef PYMNN_EXPR_API
case 1:
parse_res = PyArg_ParseTuple(args, "O", &input_var)
&& isVar(input_var);
break;
case 2:
parse_res = PyArg_ParseTuple(args, "Ol", &input_tensor, &dimensionType)
&& (isTensor(input_tensor) || isVar(input_tensor));
if (isVar(input_tensor)) {
input_var = input_tensor;
input_tensor = nullptr;
}
break;
#else
case 2:
parse_res = PyArg_ParseTuple(args, "Ol", &input_tensor, &dimensionType)
&& isTensor(input_tensor);
break;
#endif
case 3:
parse_res = PyArg_ParseTuple(args, "OOl", &shape, &dataType, &dimensionType)
&& isInts(shape);
@ -1361,11 +1449,35 @@ static int PyMNNTensor_init(PyMNNTensor *self, PyObject *args, PyObject *kwds) {
}
if (!parse_res) {
PyMNN_ERROR_LOG("Tensor init require args as belows:\n"
"\t1. (Tensor, DimensionType)\n"
"\t0. (Var)\n"
"\t1. (Tensor/Var, DimensionType)\n"
"\t2. ([int], DataType, DimensionType)\n"
"\t3. ([int], DataType, tuple/ndarray, DimensionType)\n");
return -1;
}
#ifdef PYMNN_EXPR_API
// 0. create Tensor by Var
if (input_var) {
auto var = toVar(input_var);
auto info = var->getInfo();
void* ptr = const_cast<void*>(var->readMap<void>());
Tensor::DimensionType type = Tensor::TENSORFLOW;
if (dimensionType < 0) {
if (info->order == NCHW) type = Tensor::CAFFE;
else if (info->order == NC4HW4) type = Tensor::CAFFE_C4;
} else {
type = static_cast<Tensor::DimensionType>(dimensionType);
}
Tensor *tensor = Tensor::create(info->dim, info->type, ptr, type);
if (!tensor) {
PyMNN_ERROR_LOG("PyMNNTensor_create: Tensor create failed");
return -1;
}
self->tensor = tensor;
self->owner = 2;
return 0;
}
#endif
// 1. create Tensor by Tensor
if (input_tensor) {
Tensor *tensor = new Tensor(toTensor(input_tensor), (Tensor::DimensionType)dimensionType, true);
@ -1809,8 +1921,12 @@ static PyObject* PyMNNCVImageProcess_convert(PyMNNCVImageProcess *self, PyObject
return NULL;
}
if (PyLong_Check(source)) {
ErrorCode ret = self->imageProcess->convert(reinterpret_cast<const uint8_t *>(PyLong_AsLong(source)),
if (isInt(source)) {
auto ptr = PyLong_AsVoidPtr(source);
if (ptr == NULL) {
Py_RETURN_NONE;
}
ErrorCode ret = self->imageProcess->convert(reinterpret_cast<const uint8_t *>(ptr),
iw, ih, stride,
((PyMNNTensor *)dest)->tensor);
return PyLong_FromLong(ret);
@ -1949,46 +2065,70 @@ static PyObject* PyMNNCVImageProcess_setPadding(PyMNNCVImageProcess *self, PyObj
/// MNN CVMatrix implementation
bool isMatrix(PyObject* obj) {
return PyObject_IsInstance(obj, (PyObject*)&PyMNNCVMatrixType);
return PyObject_IsInstance(obj, (PyObject*)PyType_FindTLSType(&PyMNNCVMatrixType));
}
CV::Matrix toMatrix(PyObject* obj) {
return *(((PyMNNCVMatrix*)obj)->matrix);
}
PyObject* toPyObj(CV::Matrix m) {
PyMNNCVMatrix *ret = (PyMNNCVMatrix *)PyObject_Call((PyObject*)&PyMNNCVMatrixType, PyTuple_New(0), NULL);
PyMNNCVMatrix *ret = (PyMNNCVMatrix *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNNCVMatrixType), PyTuple_New(0), NULL);
ret->matrix = new CV::Matrix();
*(ret->matrix) = m;
return (PyObject*)ret;
}
bool isSize(PyObject* obj) {
return (isInts(obj) && toInts(obj).size() == 2);
}
CV::Size toSize(PyObject* obj) {
auto vals = toInts(obj);
MNN_ASSERT(val.size() == 2);
return CV::Size(vals[0], vals[1]);
}
bool isPoint(PyObject* obj) {
return (isFloats(obj) && toFloats(obj).size() == 2);
return (isFloats(obj) && toFloats(obj).size() == 2) ||
(isInts(obj) && toInts(obj).size() == 2);
}
CV::Point toPoint(PyObject* obj) {
auto vals = toFloats(obj);
MNN_ASSERT(val.size() == 2);
CV::Point point;
point.set(vals[0], vals[1]);
if (isFloats(obj)) {
auto vals = toFloats(obj);
        MNN_ASSERT(vals.size() == 2);
point.set(vals[0], vals[1]);
} else if (isInts(obj)) {
auto vals = toInts(obj);
        MNN_ASSERT(vals.size() == 2);
point.set(vals[0], vals[1]);
}
return point;
}
bool isPoints(PyObject* obj) {
return (isFloats(obj) && toFloats(obj).size() % 2 == 0);
return (isFloats(obj) && toFloats(obj).size() % 2 == 0) ||
(isInts(obj) && toInts(obj).size() % 2 == 0) || isVar(obj);
}
std::vector<CV::Point> toPoints(PyObject* obj) {
auto vals = toFloats(obj);
MNN_ASSERT(val.size() % 2 == 0);
std::vector<CV::Point> points(vals.size() / 2);
for (int i = 0; i < points.size(); i++) {
points[i].set(vals[i*2], vals[i*2+1]);
if (isFloats(obj)) {
auto vals = toFloats(obj);
MNN_ASSERT(vals.size() % 2 == 0);
std::vector<CV::Point> points(vals.size() / 2);
for (int i = 0; i < points.size(); i++) {
points[i].set(vals[i*2], vals[i*2+1]);
}
return points;
}
return points;
if (isInts(obj)) {
auto vals = toInts(obj);
MNN_ASSERT(vals.size() % 2 == 0);
std::vector<CV::Point> points(vals.size() / 2);
for (int i = 0; i < points.size(); i++) {
points[i].set(vals[i*2], vals[i*2+1]);
}
return points;
}
if (isVar(obj)) {
auto vals = toVar(obj);
auto size = vals->getInfo()->size;
MNN_ASSERT(size % 2 == 0);
std::vector<CV::Point> points(size / 2);
auto ptr = vals->readMap<float>();
for (int i = 0; i < points.size(); i++) {
points[i].set(ptr[i*2], ptr[i*2+1]);
}
return points;
}
return {};
}
PyObject* toPyObj(std::vector<CV::Point> _points) {
std::vector<float> points(_points.size() * 2);
@ -2494,7 +2634,7 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) {
PyErr_SetString(PyExc_Exception, "initMNN.expr: PyType_Ready PyMNNVarType failed");
ERROR_RETURN
}
PyModule_AddObject(expr_module, "Var", (PyObject *)&PyMNNVarType);
PyModule_AddObject(expr_module, "Var", (PyObject *)PyType_FindTLSType(&PyMNNVarType));
// def enum
def_data_format(expr_module);
def_dtype(expr_module);
@ -2547,6 +2687,7 @@ PyMODINIT_FUNC MOD_INIT_FUNC(void) {
def_ThresholdTypes(cv_module);
def_RetrievalModes(cv_module);
def_ContourApproximationModes(cv_module);
def_LineTypes(cv_module);
// add methods of cv
constexpr int cv_method_num = sizeof(PyMNNCV_methods) / sizeof(PyMethodDef);
for (int i = 0; i < cv_method_num; i++) {
@ -2571,6 +2712,10 @@ void loadMNN() {
WeImport_AppendInittab(MOD_NAME, MOD_INIT_FUNC);
});
}
void* memoryToVar(const void* ptr, int h, int w, int c, int type) {
auto var = Express::_Const(ptr, {h, w, c}, NHWC, dtype2htype(static_cast<DType>(type)));
return reinterpret_cast<void*>(toPyObj(var));
}
static auto registerMNN = []() {
loadMNN();
return true;

View File

@ -17,4 +17,12 @@
#define PYMNN_PUBLIC
#endif // WIN32
extern "C" PYMNN_PUBLIC void loadMNN();
// memoryToVar's type define
#define TypeFloat 1
#define TypeDouble 2
#define TypeInt 3
#define TypeUint8 4
#define TypeInt8 6
#define TypeInt64 9
extern "C" PYMNN_PUBLIC void loadMNN();
extern "C" PYMNN_PUBLIC void* memoryToVar(void* ptr, int h, int w, int c, int type);

View File

@ -99,10 +99,22 @@ def_enum(ContourApproximationModes, CV::ContourApproximationModes,
CV::CHAIN_APPROX_TC89_L1, "CHAIN_APPROX_TC89_L1",
CV::CHAIN_APPROX_TC89_KCOS, "CHAIN_APPROX_TC89_KCOS"
)
def_enum(LineTypes, CV::LineTypes,
CV::FILLED, "FILLED",
CV::LINE_4, "LINE_4",
CV::LINE_8, "LINE_8",
CV::LINE_AA, "LINE_AA"
)
// helper functions
INTS default_size = {0, 0}, default_param = {};
bool isSize(PyObject* obj);
CV::Size toSize(PyObject* obj);
bool isSize(PyObject* obj) {
return (isInts(obj) && toInts(obj).size() == 2);
}
CV::Size toSize(PyObject* obj) {
auto vals = toInts(obj);
    MNN_ASSERT(vals.size() == 2);
return CV::Size(vals[0], vals[1]);
}
bool isPoint(PyObject* obj);
CV::Point toPoint(PyObject* obj);
bool isPoints(PyObject* obj);
@ -378,24 +390,28 @@ static PyObject* PyMNNCV_invertAffineTransform(PyObject *self, PyObject *args) {
}
PyMNN_ERROR("invertAffineTransform require args: (Matrix)");
}
std::vector<float> default_floats = {};
static PyObject* PyMNNCV_resize(PyObject *self, PyObject *args) {
PyObject *src, *dsize, *interpolation = toPyObj(CV::INTER_LINEAR);
PyObject *src, *dsize, *interpolation = toPyObj(CV::INTER_LINEAR),
*mean = toPyObj(default_floats), *norm = toPyObj(default_floats);
float fx = 0, fy = 0;
if (PyArg_ParseTuple(args, "OO|ffO", &src, &dsize, &fx, &fy, &interpolation) &&
isVar(src) && isSize(dsize) && isInterpolationFlags(interpolation)) {
return toPyObj(CV::resize(toVar(src), toSize(dsize), fx, fy, toEnum<CV::InterpolationFlags>(interpolation)));
int code = -1;
if (PyArg_ParseTuple(args, "OO|ffOiOO", &src, &dsize, &fx, &fy, &interpolation, &code, &mean, &norm) &&
isVar(src) && isSize(dsize) && isInterpolationFlags(interpolation) && isFloats(mean) && isFloats(norm)) {
return toPyObj(CV::resize(toVar(src), toSize(dsize), fx, fy, toEnum<CV::InterpolationFlags>(interpolation), code, toFloats(mean), toFloats(norm)));
}
PyMNN_ERROR("resize require args: (Var, [int], |float, float, InterpolationFlags)");
PyMNN_ERROR("resize require args: (Var, [int], |float, float, InterpolationFlags, int, [float], [float])");
}
static PyObject* PyMNNCV_warpAffine(PyObject *self, PyObject *args) {
PyObject *src, *M, *dsize, *flag = toPyObj(CV::INTER_LINEAR), *borderMode = toPyObj(CV::BORDER_CONSTANT);
int borderValue = 0;
if (PyArg_ParseTuple(args, "OOO|OOi", &src, &M, &dsize, &flag, &borderMode, &borderValue) &&
isVar(src) && isMatrix(M) && isSize(dsize) && isInterpolationFlags(flag) && isBorderTypes(borderMode)) {
PyObject *src, *M, *dsize, *flag = toPyObj(CV::INTER_LINEAR), *borderMode = toPyObj(CV::BORDER_CONSTANT),
*mean = toPyObj(default_floats), *norm = toPyObj(default_floats);
int borderValue = 0, code = -1;
if (PyArg_ParseTuple(args, "OOO|OOiiOO", &src, &M, &dsize, &flag, &borderMode, &borderValue, &code, &mean, &norm) &&
isVar(src) && isMatrix(M) && isSize(dsize) && isInterpolationFlags(flag) && isBorderTypes(borderMode) && isFloats(mean) && isFloats(norm)) {
return toPyObj(CV::warpAffine(toVar(src), toMatrix(M), toSize(dsize),
toEnum<CV::InterpolationFlags>(flag), toEnum<CV::BorderTypes>(borderMode), borderValue));
toEnum<CV::InterpolationFlags>(flag), toEnum<CV::BorderTypes>(borderMode), borderValue, code, toFloats(mean), toFloats(norm)));
}
PyMNN_ERROR("warpAffine require args: (Var, Matrix, [int], |InterpolationFlags, BorderTypes, int)");
PyMNN_ERROR("warpAffine require args: (Var, Matrix, [int], |InterpolationFlags, BorderTypes, int, int, [float], [float])");
}
static PyObject* PyMNNCV_warpPerspective(PyObject *self, PyObject *args) {
PyObject *src, *M, *dsize, *flag = toPyObj(CV::INTER_LINEAR), *borderMode = toPyObj(CV::BORDER_CONSTANT);
@ -433,7 +449,7 @@ static PyObject* PyMNNCV_findContours(PyObject *self, PyObject *args) {
auto contours = CV::findContours(toVar(image), toEnum<CV::RetrievalModes>(mode),
toEnum<CV::ContourApproximationModes>(method), toPoint(offset));
PyObject* obj = PyTuple_New(2);
PyTuple_SetItem(obj, 0, toPyObj<std::vector<CV::Point>, toPyObj>(contours));
PyTuple_SetItem(obj, 0, toPyObj<VARP, toPyObj>(contours));
PyTuple_SetItem(obj, 1, toPyObj("no hierarchy"));
return obj;
}
@ -442,24 +458,29 @@ static PyObject* PyMNNCV_findContours(PyObject *self, PyObject *args) {
static PyObject* PyMNNCV_contourArea(PyObject *self, PyObject *args) {
PyObject *points;
int oriented = 0;
if (PyArg_ParseTuple(args, "O|i", &points, &oriented) && isPoints(points)) {
float area = CV::contourArea(toPoints(points), oriented);
return toPyObj(area);
if (PyArg_ParseTuple(args, "O|i", &points, &oriented) && isVar(points)) {
float res = CV::contourArea(toVar(points), oriented);
return toPyObj(res);
}
PyMNN_ERROR("contourArea require args: ([float], |bool)");
PyMNN_ERROR("contourArea require args: (Var, |bool)");
}
static PyObject* PyMNNCV_convexHull(PyObject *self, PyObject *args) {
PyObject *points;
int clockwise = 0, returnPoints = 1;
if (PyArg_ParseTuple(args, "O|ii", &points, &clockwise, &returnPoints) && isPoints(points)) {
return toPyObj(CV::convexHull(toPoints(points), clockwise, returnPoints));
if (PyArg_ParseTuple(args, "O|ii", &points, &clockwise, &returnPoints) && isVar(points)) {
auto res = CV::convexHull(toVar(points), clockwise, returnPoints);
if (returnPoints) {
int npoints = res.size() / 2;
return toPyObj(Express::_Const(res.data(), { npoints, 1, 2 }, NHWC, halide_type_of<int>()));
}
return toPyObj(res);
}
PyMNN_ERROR("convexHull require args: ([float], |bool, bool)");
PyMNN_ERROR("convexHull require args: (Var, |bool, bool)");
}
static PyObject* PyMNNCV_minAreaRect(PyObject *self, PyObject *args) {
PyObject *points;
if (PyArg_ParseTuple(args, "O", &points) && isPoints(points)) {
auto rect = CV::minAreaRect(toPoints(points));
if (PyArg_ParseTuple(args, "O", &points) && isVar(points)) {
auto rect = CV::minAreaRect(toVar(points));
PyObject* center = PyTuple_New(2);
PyTuple_SetItem(center, 0, toPyObj(rect.center.x));
PyTuple_SetItem(center, 1, toPyObj(rect.center.y));
@ -472,16 +493,16 @@ static PyObject* PyMNNCV_minAreaRect(PyObject *self, PyObject *args) {
PyTuple_SetItem(obj, 2, toPyObj(rect.angle));
return obj;
}
PyMNN_ERROR("minAreaRect require args: ([float])");
PyMNN_ERROR("minAreaRect require args: (Var)");
}
static PyObject* PyMNNCV_boundingRect(PyObject *self, PyObject *args) {
PyObject *points;
if (PyArg_ParseTuple(args, "O", &points) && isPoints(points)) {
auto rect = CV::boundingRect(toPoints(points));
if (PyArg_ParseTuple(args, "O", &points) && isVar(points)) {
auto rect = CV::boundingRect(toVar(points));
std::vector<int> res { rect.x, rect.y, rect.width, rect.height };
return toPyObj(res);
}
PyMNN_ERROR("boundingRect require args: ([float])");
PyMNN_ERROR("boundingRect require args: (Var)");
}
static PyObject* PyMNNCV_connectedComponentsWithStats(PyObject *self, PyObject *args) {
PyObject *image;
@ -518,17 +539,106 @@ static PyObject* PyMNNCV_boxPoints(PyObject *self, PyObject *args) {
error_:
PyMNN_ERROR("boxPoints require args: [(float, (float, float), (float, float))])");
}
// draw
static bool isColor(PyObject* obj) {
return (isInts(obj) && (toInts(obj).size() == 3 || toInts(obj).size() == 4));
}
CV::Scalar toColor(PyObject* obj) {
auto vals = toInts(obj);
if (vals.size() == 3) {
return CV::Scalar(vals[0], vals[1], vals[2]);
}
if (vals.size() == 4) {
return CV::Scalar(vals[0], vals[1], vals[2], vals[3]);
}
return CV::Scalar(255, 255, 255);
}
static PyObject* PyMNNCV_line(PyObject *self, PyObject *args) {
PyObject *img, *pt1, *pt2, *color, *linetype = toPyObj(CV::LINE_8);
int thickness = 1, shift = 0;
if (PyArg_ParseTuple(args, "OOOO|iOi", &img, &pt1, &pt2, &color, &thickness, &linetype, &shift)
&& isVar(img) && isPoint(pt1) && isPoint(pt2) && isColor(color) && isLineTypes(linetype)) {
auto image = toVar(img);
CV::line(image, toPoint(pt1), toPoint(pt2), toColor(color),
thickness, toEnum<CV::LineTypes>(linetype), shift);
Py_RETURN_NONE;
}
PyMNN_ERROR("line require args: (Var, Point, Point, Color, |int, LineType, int)");
}
static PyObject* PyMNNCV_arrowedLine(PyObject *self, PyObject *args) {
PyObject *img, *pt1, *pt2, *color, *linetype = toPyObj(CV::LINE_8);
int thickness = 1, shift = 0;
float tipLength = 0.1;
if (PyArg_ParseTuple(args, "OOOO|iOif", &img, &pt1, &pt2, &color, &thickness, &linetype, &shift, &tipLength)
&& isVar(img) && isPoint(pt1) && isPoint(pt2) && isColor(color) && isLineTypes(linetype)) {
auto image = toVar(img);
CV::arrowedLine(image, toPoint(pt1), toPoint(pt2), toColor(color),
thickness, toEnum<CV::LineTypes>(linetype), shift, tipLength);
Py_RETURN_NONE;
}
PyMNN_ERROR("arrowedLine require args: (Var, Point, Point, Color, |int, LineType, int, float)");
}
static PyObject* PyMNNCV_circle(PyObject *self, PyObject *args) {
PyObject *img, *center, *color, *linetype = toPyObj(CV::LINE_8);
int radius, thickness = 1, shift = 0;
if (PyArg_ParseTuple(args, "OOiO|iOi", &img, &center, &radius, &color, &thickness, &linetype, &shift)
&& isVar(img) && isPoint(center) && isColor(color) && isLineTypes(linetype)) {
auto image = toVar(img);
CV::circle(image, toPoint(center), radius, toColor(color),
thickness, toEnum<CV::LineTypes>(linetype), shift);
Py_RETURN_NONE;
}
PyMNN_ERROR("circle require args: (Var, Point, int, Color, |int, LineType, int)");
}
static PyObject* PyMNNCV_rectangle(PyObject *self, PyObject *args) {
PyObject *img, *pt1, *pt2, *color, *linetype = toPyObj(CV::LINE_8);
int thickness = 1, shift = 0;
if (PyArg_ParseTuple(args, "OOOO|iOi", &img, &pt1, &pt2, &color, &thickness, &linetype, &shift)
&& isVar(img) && isPoint(pt1) && isPoint(pt2) && isColor(color) && isLineTypes(linetype)) {
auto image = toVar(img);
CV::rectangle(image, toPoint(pt1), toPoint(pt2), toColor(color),
thickness, toEnum<CV::LineTypes>(linetype), shift);
Py_RETURN_NONE;
}
PyMNN_ERROR("rectangle require args: (Var, Point, Point, Color, |int, LineType, int)");
}
static PyObject* PyMNNCV_drawContours(PyObject *self, PyObject *args) {
PyObject *img, *contours, *color, *linetype = toPyObj(CV::LINE_8);
int contourIdx, thickness = 1;
if (PyArg_ParseTuple(args, "OOiO|iO", &img, &contours, &contourIdx, &color, &thickness, &linetype)
&& isVar(img) && isVec<isPoints>(contours) && isColor(color) && isLineTypes(linetype)) {
auto image = toVar(img);
CV::drawContours(image, toVec<std::vector<CV::Point>, toPoints>(contours), contourIdx, toColor(color),
thickness, toEnum<CV::LineTypes>(linetype));
Py_RETURN_NONE;
}
PyMNN_ERROR("drawContours require args: (Var, [Points], int, Color, |int, LineType)");
}
static PyObject* PyMNNCV_fillPoly(PyObject *self, PyObject *args) {
PyObject *img, *contours, *color, *linetype = toPyObj(CV::LINE_8), *offset = toPyObj(std::vector<float>{0, 0});
int shift = 0;
if (PyArg_ParseTuple(args, "OOO|OiO", &img, &contours, &color, &linetype, &shift, &offset)
&& isVar(img) && isVec<isPoints>(contours) && isColor(color) && isLineTypes(linetype) && isPoint(offset)) {
auto image = toVar(img);
CV::fillPoly(image, toVec<std::vector<CV::Point>, toPoints>(contours), toColor(color),
toEnum<CV::LineTypes>(linetype), shift, toPoint(offset));
Py_RETURN_NONE;
}
PyMNN_ERROR("fillPoly require args: (Var, [Points], Color, |LineType, int, Point)");
}
static PyMethodDef PyMNNCV_methods[] = {
register_methods(CV,
#ifdef PYMNN_IMGCODECS
register_methods(CV,
// imgcodecs
haveImageReader, "haveImageReader",
haveImageWriter, "haveImageWriter",
imdecode, "imdecode",
imencode, "imencode",
imread, "imread",
imwrite, "imwrite",
imwrite, "imwrite"
)
#endif
register_methods(CV,
// color
cvtColor, "cvtColor.",
cvtColorTwoPlane, "cvtColorTwoPlane.",
@ -569,6 +679,13 @@ static PyMethodDef PyMNNCV_methods[] = {
minAreaRect, "minAreaRect",
boundingRect, "boundingRect",
connectedComponentsWithStats, "connectedComponentsWithStats",
boxPoints, "boxPoints"
boxPoints, "boxPoints",
// draw
line, "line",
arrowedLine, "arrowedLine",
circle, "circle",
rectangle, "rectangle",
drawContours, "drawContours",
fillPoly, "fillPoly"
)
};

View File

@ -63,6 +63,7 @@ def_enum(PrecisionMode, PrecisionMode,
typedef struct {
PyObject_HEAD
VARP* var;
int iter_index;
} PyMNNVar;
static PyObject* PyMNNVar_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
static void PyMNNVar_dealloc(PyMNNVar *self);
@ -137,6 +138,9 @@ static PyObject* PyMNNVar_negative(PyObject*);
static PyObject* PyMNNVar_absolute(PyObject*);
static Py_ssize_t PyMNNVar_length(PyObject*);
static PyObject* PyMNNVar_subscript(PyObject*, PyObject*);
static int PyMNNVar_ass_subscript(PyObject*, PyObject*, PyObject*);
static PyObject* PyMNNVar_iter(PyObject*);
static PyObject* PyMNNVar_iternext(PyObject*);
#if PY_MAJOR_VERSION >= 3
static PyNumberMethods PyMNNVar_as_number = {
PyMNNVar_add, /*nb_add*/
@ -220,9 +224,9 @@ static PyNumberMethods PyMNNVar_as_number = {
};
#endif
static PyMappingMethods PyMNNVar_as_mapping = {
PyMNNVar_length, /*mp_length*/
PyMNNVar_subscript, /*mp_subscript*/
0, /*mp_ass_subscript*/
PyMNNVar_length, /*mp_length*/
PyMNNVar_subscript, /*mp_subscript*/
PyMNNVar_ass_subscript, /*mp_ass_subscript*/
};
PyObject *PyMNNVar_richcompare(PyObject *self, PyObject *other, int op);
static PyTypeObject PyMNNVarType = {
@ -256,8 +260,8 @@ static PyTypeObject PyMNNVarType = {
0, /*tp_clear*/
&PyMNNVar_richcompare, /*tp_richcompare*/
0, /*tp_weaklistoffset*/
0, /*tp_iter*/
0, /*tp_iternext*/
&PyMNNVar_iter, /*tp_iter*/
&PyMNNVar_iternext, /*tp_iternext*/
PyMNNVar_methods, /*tp_methods*/
0, /*tp_members*/
PyMNNVar_getsetters, /*tp_getset*/
@ -272,7 +276,7 @@ static PyTypeObject PyMNNVarType = {
};
// helper functions
static PyMNNVar* getVar() {
PyMNNVar *var = (PyMNNVar *)PyObject_Call((PyObject*)&PyMNNVarType, PyTuple_New(0), NULL);
PyMNNVar *var = (PyMNNVar *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNNVarType), PyTuple_New(0), NULL);
var->var = new VARP;
return var;
}
@ -284,7 +288,7 @@ static PyObject* toPyObj(VARP var) {
static bool isVar(PyObject* var) {
return isInt(var) || isInts(var) ||
isFloat(var) || isFloats(var) ||
PyObject_IsInstance(var, (PyObject*)&PyMNNVarType);
Py_TYPE(var) == PyType_FindTLSType(&PyMNNVarType);
}
static bool isVars(PyObject* var) {
return isVec<isVar>(var);
@ -353,21 +357,30 @@ std::pair<VARP, VARP> toVarPair(PyObject* l, PyObject* r, bool fp = false) {
PyObject *PyMNNVar_richcompare(PyObject *l, PyObject *r, int op) {
auto lr = toVarPair(l, r);
auto vl = lr.first, vr = lr.second;
VARP res;
switch (op) {
case Py_LT:
return toPyObj(Express::_Less(vl, vr));
res = Express::_Less(vl, vr);
break;
case Py_LE:
return toPyObj(Express::_LessEqual(vl, vr));
res = Express::_LessEqual(vl, vr);
break;
case Py_EQ:
return toPyObj(Express::_Equal(vl, vr));
res = Express::_Equal(vl, vr);
break;
case Py_NE:
return toPyObj(Express::_NotEqual(vl, vr));
res = Express::_NotEqual(vl, vr);
break;
case Py_GT:
return toPyObj(Express::_Greater(vl, vr));
res = Express::_Greater(vl, vr);
break;
case Py_GE:
return toPyObj(Express::_GreaterEqual(vl, vr));
res = Express::_GreaterEqual(vl, vr);
break;
default:
Py_RETURN_NONE;
}
Py_RETURN_NONE;
return toPyObj(res);
}
static PyObject* PyMNNVar_add(PyObject* l, PyObject* r) {
auto lr = toVarPair(l, r);
@ -413,11 +426,10 @@ static Py_ssize_t PyMNNVar_length(PyObject* x) {
}
return size;
}
static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) {
std::vector<int> begin, end, strides;
int new_axis_mask = 0, shrink_axis_mask = 0,
begin_mask = 0, end_mask = 0,
ellipsis_mask = 0, index = 0;
static void dealSlice(PyObject* slice, std::vector<int>& begin, std::vector<int>& end, std::vector<int>& strides,
int& new_axis_mask, int& shrink_axis_mask, int& begin_mask, int& end_mask, int& ellipsis_mask) {
int index = 0;
auto dealItem = [&](PyObject* item) {
if (PySlice_Check(item)) {
Py_ssize_t startl = 0, stopl = 0, stepl = 1;
@ -437,7 +449,7 @@ static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) {
if ((step == 1 && start == 0) || (step == -1 && start == -1)) {
begin_mask |= (1 << index);
}
if ((step == 1 && stop == -1) || (step == -1 && stop == 0)) {
if ((step == 1 && stop == -1) || (step == -1 && stop == 0) || PY_SSIZE_T_MAX == stopl) {
end_mask |= (1 << index);
}
}
@ -471,16 +483,136 @@ static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) {
} else {
dealItem(slice);
}
}
static inline bool isIdx(PyObject* slice) {
return Py_TYPE(slice) == PyType_FindTLSType(&PyMNNVarType) || (PyList_Check(slice) && isInts(slice));
}
static bool isBoolIdx(VARP idx, int reqSize) {
auto size = idx->getInfo()->size;
bool isbool = (size == reqSize);
if (isbool) {
auto ptr = idx->readMap<int>();
for (int i = 0; i < size; i++) {
if (ptr[i] != 0 && ptr[i] != 1) {
return false;
}
}
}
return isbool;
}
static PyObject* PyMNNVar_subscript(PyObject* x, PyObject* slice) {
    // gather: 1. boolean (0/1) mask gather; 2. integer index gather
if (isIdx(slice)) {
auto val = toVar(x);
auto idx = toVar(slice);
if (val->getInfo()->size > 1 && isBoolIdx(idx, val->getInfo()->size)) {
// 0-1 gather -> idx gather
idx = Express::_Where(idx);
val = Express::_GatherND(val, idx);
val = Express::_Reshape(val, {-1});
return toPyObj(val);
}
auto r = Express::_Gather(val, idx);
r->readMap<void>();
return toPyObj(r);
}
std::vector<int> begin, end, strides;
int new_axis_mask = 0, shrink_axis_mask = 0, begin_mask = 0, end_mask = 0, ellipsis_mask = 0;
dealSlice(slice, begin, end, strides, new_axis_mask, shrink_axis_mask, begin_mask, end_mask, ellipsis_mask);
int size_ = static_cast<int>(begin.size());
auto begin_ = Express::_Const(begin.data(), {size_}, NHWC, halide_type_of<int>());
auto end_ = Express::_Const(end.data(), {size_}, NHWC, halide_type_of<int>());
auto strides_ = Express::_Const(strides.data(), {size_}, NHWC, halide_type_of<int>());
return toPyObj(Express::_StridedSlice(toVar(x), begin_, end_, strides_, begin_mask, end_mask,
ellipsis_mask, new_axis_mask, shrink_axis_mask));
auto res = Express::_StridedSlice(toVar(x), begin_, end_, strides_, begin_mask, end_mask,
ellipsis_mask, new_axis_mask, shrink_axis_mask);
auto info = res->getInfo();
if (!info) {
PyMNN_ERROR("subscript: unable to get variable info");
}
// to scalar
if (info->dim.empty()) {
auto dtype = info->type;
if (dtype == halide_type_of<float>()) {
return toPyObj(res->readMap<float>()[0]);
}
if (dtype == halide_type_of<int>()) {
return toPyObj(res->readMap<int>()[0]);
}
if (dtype == halide_type_of<uint8_t>()) {
return toPyObj(res->readMap<uint8_t>()[0]);
}
if (dtype == halide_type_of<double>()) {
return toPyObj((float)res->readMap<double>()[0]);
}
}
return toPyObj(res);
}
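A hedged sketch of what the new __getitem__ paths accept, mirroring the indexing tests added further down: integer/Var index gather, 0/1 boolean-mask gather, and unwrapping of 0-d results into Python scalars.

import MNN.numpy as mp

x = mp.array([[4., 6., 5.], [6., 3., 3.]])
x[0, 1]             # 0-d strided-slice result is unwrapped to a Python scalar
x[::-1]             # plain strided slice; negative step sets the begin/end masks
x[x > 2]            # 0/1 mask Var: _Where turns it into indices, then _GatherND
x[mp.array([1])]    # integer-index Var (or int list) goes through _Gather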
static int PyMNNVar_ass_subscript(PyObject* x, PyObject* slice, PyObject* y) {
if (!isVar(x) || !isVar(y)) {
PyMNN_ERROR_LOG("ass_subscript require args: (Var, int/Var, int/float/Var)");
return -1;
}
auto var = toVar(x);
auto val = toVar(y);
auto varInfo = var->getInfo();
if (isIdx(slice)) {
auto idx = toVar(slice);
if (isBoolIdx(idx, varInfo->size)) {
idx = Express::_Where(idx);
}
auto idxDim = idx->getInfo()->dim;
int scatterNum = idxDim[0], scatterDim = 1;
if (idxDim.size() < 2) {
idx = Express::_Unsqueeze(idx, {-1});
} else {
scatterDim = idxDim[1];
}
// val broadcast_to [scatterNum, (scatterDim < varDim.size() ? varDim[scatterDim:] : 1)]
auto varDim = varInfo->dim;
std::vector<int> valDim(1, scatterNum);
if (scatterDim >= varDim.size()) {
valDim.push_back(1);
} else {
for (int i = scatterDim; i < varDim.size(); i++) {
valDim.push_back(varDim[i]);
}
}
val = Express::_BroadcastTo(val, _Const(valDim.data(), {static_cast<int>(valDim.size())}, NCHW, halide_type_of<int32_t>()));
*(((PyMNNVar*)x)->var) = Express::_ScatterNd(idx, val, Express::_Shape(var), var);
return 0;
}
std::vector<int> begin, end, strides;
int new_axis_mask = 0, shrink_axis_mask = 0, begin_mask = 0, end_mask = 0, ellipsis_mask = 0;
dealSlice(slice, begin, end, strides, new_axis_mask, shrink_axis_mask, begin_mask, end_mask, ellipsis_mask);
int size_ = static_cast<int>(begin.size());
auto begin_ = Express::_Const(begin.data(), {size_}, NHWC, halide_type_of<int>());
auto end_ = Express::_Const(end.data(), {size_}, NHWC, halide_type_of<int>());
auto strides_ = Express::_Const(strides.data(), {size_}, NHWC, halide_type_of<int>());
*(((PyMNNVar*)x)->var) = Express::_StridedSliceWrite(var, begin_, end_, strides_, val, begin_mask, end_mask,
ellipsis_mask, new_axis_mask, shrink_axis_mask);
return 0;
}
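And a corresponding hedged sketch for the new __setitem__ path: index assignment is lowered to _ScatterNd after broadcasting the value, slice assignment to _StridedSliceWrite.

x = mp.zeros([3, 3])
x[mp.array([1, 2])] = 5.0        # index write: value broadcast, then _ScatterNd
x[0] = mp.array([7., 8., 9.])    # slice write: lowered to _StridedSliceWrite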
static PyObject* PyMNNVar_iter(PyObject *self) {
auto var = toVar(self);
if (var->getInfo()->dim.empty()) {
PyMNN_ERROR("iteration over a 0-d array");
}
Py_INCREF(self);
return self;
}
static PyObject* PyMNNVar_iternext(PyObject *self) {
auto idx = ((PyMNNVar*)self)->iter_index++;
auto var = toVar(self);
    auto count = var->getInfo()->dim[0];
    if (idx >= count) return NULL;
return toPyObj(Express::_Gather(var, Express::_Scalar<int>(idx)));
}
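With tp_iter/tp_iternext wired up, a Var is now iterable along its first dimension (hedged sketch):

x = mp.array([[1, 2], [3, 4]])
for row in x:        # each step gathers row iter_index until dim[0] is reached
    print(row)       # row is itself a Var of shape [2]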
// PyMNNVar basic functions impl
static PyObject* PyMNNVar_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
PyMNNVar* self = (PyMNNVar *)type->tp_alloc(type, 0);
self->iter_index = 0;
self->var = nullptr;
return (PyObject*)self;
}
@ -505,7 +637,7 @@ static PyObject* PyMNNVar_getshape(PyMNNVar *self, void *closure) {
if (self->var) {
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("getshape: unable to get variable info");
}
shape = toPyObj(info->dim);
}
@ -524,7 +656,7 @@ static PyObject* PyMNNVar_getdata_format(PyMNNVar *self, void *closure) {
if (self->var) {
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("getdata_format: unable to get variable info");
}
return toPyObj(info->order);
}
@ -534,7 +666,7 @@ static PyObject* PyMNNVar_getdtype(PyMNNVar *self, void *closure) {
if (self->var) {
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("getdtype: unable to get variable info");
}
return toPyObj(htype2dtype(info->type));
}
@ -544,7 +676,7 @@ static PyObject* PyMNNVar_getsize(PyMNNVar *self, void *closure) {
if (self->var) {
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("getsize: unable to get variable info");
}
return toPyObj(info->size);
}
@ -564,7 +696,7 @@ PyObject *ndim = NULL;
if (self->var) {
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("getndim: unable to get variable info");
}
ndim = toPyObj((int)info->dim.size());
}
@ -685,13 +817,16 @@ static PyObject* PyMNNVar_resize(PyMNNVar *self, PyObject *args) {
static PyObject* PyMNNVar_read(PyMNNVar *self, PyObject *args) {
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("read: unable to get variable info");
}
auto dtype = htype2dtype(info->type);
auto shape = info->dim;
int64_t total_length = info->size;
auto readptr = [self](DType dtype, INTS shape, int64_t total_length) {
void *dataPtr = (void *) (*(self->var))->readMap<void>();
if (nullptr == dataPtr) {
PyMNN_ERROR("call to readMap meet a error");
}
std::vector<npy_intp> npy_dims;
for(const auto dim : shape) {
npy_dims.push_back(dim);
@ -710,9 +845,6 @@ static PyObject* PyMNNVar_read(PyMNNVar *self, PyObject *args) {
default:
PyMNN_ERROR("does not support this dtype");
}
if (nullptr == dataPtr) {
PyMNN_ERROR("call to readMap meet a error");
}
};
auto data = readptr(dtype, shape, total_length);
(*(self->var))->unMap();
@ -722,13 +854,16 @@ static PyObject* PyMNNVar_read(PyMNNVar *self, PyObject *args) {
static PyObject* PyMNNVar_read_as_tuple(PyMNNVar *self, PyObject *args) {
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("read_as_tuple: unable to get variable info");
}
auto dtype = htype2dtype(info->type);
auto shape = info->dim;
size_t total_length = info->size;
auto readptr = [self](DType dtype, INTS shape, size_t total_length) {
void *dataPtr = (void *) (*(self->var))->readMap<void>();
if (nullptr == dataPtr) {
PyMNN_ERROR("call to readMap meet a error");
}
auto obj = PyTuple_New(total_length);
if(DType_FLOAT == dtype) {
auto data = (float*)dataPtr;
@ -766,7 +901,7 @@ static PyObject* PyMNNVar_write(PyMNNVar *self, PyObject *args) {
}
auto info = (*(self->var))->getInfo();
if(nullptr == info) {
PyMNN_ERROR("unable to get variable info");
PyMNN_ERROR("write: unable to get variable info");
}
auto dtype = htype2dtype(info->type);
int64_t total_length = info->size;
@ -1042,11 +1177,15 @@ static PyObject* PyMNNExpr_const(PyObject *self, PyObject *args, PyObject *kwarg
total_length *= shape[i];
}
}
auto data = toPtr(value, dtype, total_length);
auto ret = getVar();
if(data) {
*(ret->var) = _Const((const void*)data, shape, data_format, dtype2htype(dtype));
free(data);
if (total_length > 0) {
auto data = toPtr(value, dtype, total_length);
if(data) {
*(ret->var) = _Const((const void*)data, shape, data_format, dtype2htype(dtype));
free(data);
}
} else {
*(ret->var) = _Const(nullptr, shape, data_format, dtype2htype(dtype));
}
return (PyObject *)ret;
}
@ -1332,6 +1471,32 @@ static PyObject* PyMNNExpr_randomuniform(PyObject *self, PyObject *args) {
}
PyMNN_ERROR("randomuniform require args: (Var, dtype, |float, float, int, int)");
}
static PyObject* PyMNNExpr_sort(PyObject *self, PyObject *args) {
PyObject *x;
    int axis = -1, arg = 0, descend = 0;
if (PyArg_ParseTuple(args, "O|iii", &x, &axis, &arg, &descend) && isVar(x)) {
return toPyObj(Express::_Sort(toVar(x), axis, arg, descend));
}
PyMNN_ERROR("sort require args: (Var, |int, bool, bool)");
}
static PyObject* PyMNNExpr_raster(PyObject *self, PyObject *args) {
PyObject *var, *region, *shape;
if (PyArg_ParseTuple(args, "OOO", &var, &region, &shape) &&
isVars(var) && isInts(region) && isInts(shape)) {
return toPyObj(Express::_Raster(toVars(var), toInts(region), toInts(shape)));
}
PyMNN_ERROR("raster require args: ([Var], [int], [int])");
}
static PyObject* PyMNNExpr_nms(PyObject *self, PyObject *args) {
PyObject *boxes, *scores;
int max_detections;
float iou_threshold = -1.0, score_threshold = -1.0;
if (PyArg_ParseTuple(args, "OOi|ff", &boxes, &scores, &max_detections, &iou_threshold, &score_threshold) &&
isVar(boxes) && isVar(scores)) {
return toPyObj(Express::_Nms(toVar(boxes), toVar(scores), max_detections, iou_threshold, score_threshold));
}
PyMNN_ERROR("nms require args: (Var, Var, |float, float)");
}
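A hedged usage sketch for the three newly bound exprs; the sort and raster calls are copied from the unit tests added below, while the nms boxes/scores are made-up placeholders:

import MNN.expr as expr
import MNN.numpy as mp

expr.sort(mp.array([5, -1, 2, 0]))                            # ascending sort
x = mp.array([[1, 2], [3, 4]])
expr.raster([x], [0, 1, 1, 2, 0, 1, 2, 1, 1, 2, 2], [2, 2])   # raw Raster region (transpose)
boxes  = mp.array([[0., 0., 10., 10.], [1., 1., 11., 11.]])
scores = mp.array([0.9, 0.8])
expr.nms(boxes, scores, 1)                                    # keep at most one box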
static PyObject* PyMNNExpr_detection_post_process(PyObject *self, PyObject *args) {
PyObject *encode_boxes, *class_predictions, *anchors, *centersize_encoding;
int num_classes, max_detections, max_class_per_detection, detections_per_class;
@ -1508,6 +1673,9 @@ static PyMethodDef PyMNNExpr_methods[] = {
zeros_like, "build zeros_like expr",
unstack, "build unstack expr",
range, "build range expr",
sort, "build sort expr",
raster, "build raster expr",
nms, "build nms expr",
detection_post_process, "build detection_post_process expr"
)
};

View File

@ -1,4 +1,10 @@
#include "util.h"
#ifdef PYMNN_INTERNAL_SERVING
#include <MNN/AutoTime.hpp>
#include <MNN/MNNForwardType.h>
#include "internal/monitor_service.h"
#include "internal/verify_service.h"
#endif
// NN Module Start
def_class_start(_Module, Module)
@ -19,6 +25,37 @@ def_class_methods(_Module,
_add_parameter, "add parameter"
)
def_class_end(_Module, Module)
static PyObject* load_module(PyObject *inputs, PyObject *outputs, PyObject *backend, PyObject *memory_mode,
PyObject *power_mode, PyObject *precision_mode, const char* file_name, int dynamic,
int shape_mutable, int rearrange, int thread_num) {
BackendConfig backend_config;
backend_config.memory = toEnum<MemoryMode>(memory_mode);
backend_config.power = toEnum<PowerMode>(power_mode);
backend_config.precision = toEnum<PrecisionMode>(precision_mode);
Module::BackendInfo backend_info;
backend_info.type = toEnum<MNNForwardType>(backend);
backend_info.config = &backend_config;
Module::Config config;
config.dynamic = dynamic;
config.shapeMutable = shape_mutable;
config.rearrange = rearrange;
config.backend = &backend_info;
auto converted_file_name = convertBytesEncodeIfNeed(file_name);
auto m_ptr = Module::load(toStrings(inputs), toStrings(outputs), converted_file_name.data(), &config);
if (m_ptr == nullptr) {
std::string mnn_errno = "load_module_from_file failed ";
mnn_errno = mnn_errno + std::string(file_name);
PyErr_SetString(PyExc_Exception, mnn_errno.c_str());
}
return toPyObj(m_ptr);
}
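The helper above is now shared by both loaders. From Python the call that eventually reaches it looks roughly like this (hedged: the exact wrapper signature lives in the pymnn package, and the tensor names/model path are placeholders):

import MNN.nn as nn
import MNN.numpy as mp

net = nn.load_module_from_file(["input"], ["output"], "model.mnn")
out = net.forward(mp.zeros([1, 3, 224, 224]))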
static PyObject* PyMNN_Module_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
PyMNN_Module *self = (PyMNN_Module *)type->tp_alloc(type, 0);
self->ptr = Module::createEmpty({});
@ -50,10 +87,31 @@ static PyObject* PyMNN_Module_forward(PyMNN_Module *self, PyObject *args) {
Py_RETURN_NONE;
}
if (isVars(input)) {
#ifdef PYMNN_INTERNAL_SERVING
int status = 0;
Timer timer;
auto vars = self->ptr->onForward(toVars(input));
if (vars.empty()) {
PyMNN_ERROR("module onForward occur error.");
status = -1;
}
(void) MonitorService::GetInstance().EventTrack(self->ptr, timer, status, "PyMNN_Module_forward");
return toPyObj<VARP, toPyObj>(vars);
#else
return toPyObj<VARP, toPyObj>(self->ptr->onForward(toVars(input)));
#endif
}
if (isVar(input)) {
#ifdef PYMNN_INTERNAL_SERVING
int status = 0;
Timer timer;
auto var = self->ptr->forward(toVar(input));
(void) MonitorService::GetInstance().EventTrack(self->ptr, timer, status, "PyMNN_Module_forward");
return toPyObj(var);
#else
return toPyObj(self->ptr->forward(toVar(input)));
#endif
}
PyMNN_ERROR("PyMNN_Module_forward: args must be Var/[Var].");
}
@ -62,8 +120,22 @@ static PyObject* PyMNN_Module_onForward(PyMNN_Module *self, PyObject *args) {
if (!PyArg_ParseTuple(args, "O", &inputs)) {
Py_RETURN_NONE;
}
#ifdef PYMNN_INTERNAL_SERVING
int status = 0;
Timer timer;
auto vars = self->ptr->onForward(toVars(inputs));
if (vars.empty()) {
PyMNN_ERROR("module onForward occur error.");
status = -1;
}
(void) MonitorService::GetInstance().EventTrack(self->ptr, timer, status, "PyMNN_Module_onForward");
return toPyObj<VARP, toPyObj>(vars);
#else
return toPyObj<VARP, toPyObj>(self->ptr->onForward(toVars(inputs)));
#endif
}
static PyObject* PyMNN_Module_set_name(PyMNN_Module *self, PyObject *args) {
const char* name;
if (!PyArg_ParseTuple(args, "s", &name)) {
@ -125,6 +197,11 @@ static PyObject* PyMNNNN_load_module(PyObject *self, PyObject *args) {
return toPyObj(m);
}
static PyObject* PyMNNNN_load_module_from_file(PyObject *self, PyObject *args) {
#ifdef PYMNN_INTERNAL_SERVING
PyErr_SetString(PyExc_Exception,
"PyMNNNN_load_module_from_file: unsupported interface, should use load_module_from_file_with_token.");
return NULL;
#endif
PyObject *inputs, *outputs, *backend, *memory_mode, *power_mode, *precision_mode;
const char* file_name;
int dynamic, shape_mutable, rearrange;
@ -135,30 +212,54 @@ static PyObject* PyMNNNN_load_module_from_file(PyObject *self, PyObject *args) {
printf("PyArg_ParseTuple Error\n");
return NULL;
}
BackendConfig backend_config;
backend_config.memory = toEnum<MemoryMode>(memory_mode);
backend_config.power = toEnum<PowerMode>(power_mode);
backend_config.precision = toEnum<PrecisionMode>(precision_mode);
Module::BackendInfo backend_info;
backend_info.type = toEnum<MNNForwardType>(backend);
backend_info.config = &backend_config;
Module::Config config;
config.dynamic = dynamic;
config.shapeMutable = shape_mutable;
config.rearrange = rearrange;
config.backend = &backend_info;
auto converted_file_name = convertBytesEncodeIfNeed(file_name);
auto m_ptr = Module::load(toStrings(inputs), toStrings(outputs), converted_file_name.data(), &config);
if (m_ptr == nullptr) {
std::string mnn_errno = "load_module_from_file failed ";
mnn_errno = mnn_errno + std::string(file_name);
PyErr_SetString(PyExc_Exception, mnn_errno.c_str());
}
return toPyObj(m_ptr);
return load_module(inputs, outputs, backend, memory_mode, power_mode, precision_mode, file_name, dynamic,
shape_mutable, rearrange, thread_num);
}
#ifdef PYMNN_INTERNAL_SERVING
static PyObject* PyMNNNN_load_module_from_file_with_token(PyObject *self, PyObject *args) {
PyObject *inputs, *outputs;
const char* file_name;
PyObject *backend = toPyObj(MNN_FORWARD_CPU);
PyObject *memory_mode = toPyObj(MemoryMode::Memory_Normal);
    PyObject *power_mode = toPyObj(PowerMode::Power_Normal);
    PyObject *precision_mode = toPyObj(PrecisionMode::Precision_Normal);
int dynamic = 0;
int shape_mutable = 0;
int rearrange = 0;
char *token = NULL;
char *scene = NULL;
char *app_key = NULL;
int thread_num = 1;
if (!PyArg_ParseTuple(args, "OOssss|iiiOOOOi", &inputs, &outputs, &file_name, &token, &scene, &app_key, &dynamic,
&shape_mutable, &rearrange, &backend, &memory_mode, &power_mode, &precision_mode,
&thread_num)) {
printf("PyArg_ParseTuple Error\n");
return NULL;
}
if (!token || !scene || !app_key) {
PyErr_SetString(PyExc_Exception,
"PyMNNNN_load_module_from_file_with_token: input invalid, token, scene or app_key is null.");
return NULL;
}
MonitorService::GetInstance().Start();
VerifyService::GetInstance().Start();
bool ret = VerifyService::GetInstance().VerifyToken(std::string(token), std::string(scene), std::string(app_key));
if (!ret) {
PyErr_SetString(PyExc_Exception,
"PyMNNNN_load_module_from_file_with_token: check token failed, return null module.");
return NULL;
}
return load_module(inputs, outputs, backend, memory_mode, power_mode, precision_mode, file_name, dynamic,
shape_mutable, rearrange, thread_num);
}
#endif
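For PYMNN_INTERNAL_SERVING builds the token-gated variant above is the only supported loader; a hedged sketch of the expected call, with placeholder credentials:

m = nn.load_module_from_file_with_token(["input"], ["output"], "model.mnn",
                                        "token", "scene", "app_key")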
#ifdef PYMNN_TRAIN_API
static PyObject* PyMNNNN_conv(PyObject *self, PyObject *args) {
INTS default_1 = {1, 1}, default_0 = {0, 0};
@ -221,10 +322,18 @@ static PyObject* PyMNNNN_dropout(PyObject *self, PyObject *args) {
}
#endif
static PyMethodDef PyMNNNN_methods[] = {
#ifdef PYMNN_INTERNAL_SERVING
register_methods(NN,
load_module, "load_module([Var], [Var], bool)",
load_module_from_file_with_token, "load_module_from_file_with_token([string], [string], filename, bool, ...)",
load_module_from_file, "load_module_from_file([string], [string], filename, bool, ...)"
)
#else
register_methods(NN,
load_module, "load_module([Var], [Var], bool)",
load_module_from_file, "load_module_from_file([string], [string], filename, bool, ...)"
)
#endif
#ifdef PYMNN_TRAIN_API
register_methods(NN,
conv, "conv Module",
@ -234,4 +343,4 @@ static PyMethodDef PyMNNNN_methods[] = {
)
#endif
};
// NN Module End
// NN Module End

View File

@ -225,13 +225,16 @@ inline int getnpysize(int npy_type) {
return 4;
case NPY_DOUBLE:
return 8;
case NPY_INT:
return 4;
case NPY_INT64:
return 8;
case NPY_UINT8:
return 1;
default:
        // NPY_INT (np.int) and NPY_INT32 (np.int32) may map to different enum values on some platforms.
        // Use `if` instead of extra `case` labels: where NPY_INT equals NPY_INT32, duplicate case values would not compile.
if (npy_type == NPY_INT || npy_type == NPY_INT32) {
return 4;
}
PyMNN_ERROR_LOG("does not support this npy_type");
return 0;
}
@ -249,7 +252,7 @@ inline int getitemsize(int dtype, int npy_type) {
}
return 8;
case DType_INT32:
if(npy_type != NPY_INT) {
if(npy_type != NPY_INT && npy_type != NPY_INT32) {
PyMNN_ERROR_LOG("numpy type does not match");
}
return 4;
@ -383,7 +386,7 @@ static bool isVec(PyObject* obj) {
return Func(PyList_GetItem(obj, 0));
} else return true;
}
return false;
return Func(obj);
}
static inline bool isInts(PyObject* obj) {
return isInt(obj) || isVec<isInt>(obj);
@ -438,6 +441,7 @@ static vector<T> toVec(PyObject* obj) {
}
return values;
}
values.push_back(Func(obj));
return values;
}
static inline std::vector<int> toInts(PyObject* obj) {
@ -586,188 +590,185 @@ static void* toPtr(PyObject *obj, DType dtype, int64_t& total_length, void* data
// just support COND = 0 or 1
#define arg_if(COND, THEN, ELSE) arg_concat(arg_if_, COND)(THEN, ELSE)
#define expand_item_0(...)
#define expand_item_1(macro, context, key, value, ITEMS...) \
#define expand_item_1(macro, context, key, value, ...) \
macro(context, key, value)
#define expand_item_2(macro, context, key, value, ITEMS...) \
#define expand_item_2(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_1(macro, context, ITEMS)
#define expand_item_3(macro, context, key, value, ITEMS...) \
expand_item_1(macro, context, __VA_ARGS__)
#define expand_item_3(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_2(macro, context, ITEMS)
#define expand_item_4(macro, context, key, value, ITEMS...) \
expand_item_2(macro, context, __VA_ARGS__)
#define expand_item_4(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_3(macro, context, ITEMS)
#define expand_item_5(macro, context, key, value, ITEMS...) \
expand_item_3(macro, context, __VA_ARGS__)
#define expand_item_5(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_4(macro, context, ITEMS)
#define expand_item_6(macro, context, key, value, ITEMS...) \
expand_item_4(macro, context, __VA_ARGS__)
#define expand_item_6(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_5(macro, context, ITEMS)
#define expand_item_7(macro, context, key, value, ITEMS...) \
expand_item_5(macro, context, __VA_ARGS__)
#define expand_item_7(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_6(macro, context, ITEMS)
#define expand_item_8(macro, context, key, value, ITEMS...) \
expand_item_6(macro, context, __VA_ARGS__)
#define expand_item_8(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_7(macro, context, ITEMS)
#define expand_item_9(macro, context, key, value, ITEMS...) \
expand_item_7(macro, context, __VA_ARGS__)
#define expand_item_9(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_8(macro, context, ITEMS)
#define expand_item_10(macro, context, key, value, ITEMS...) \
expand_item_8(macro, context, __VA_ARGS__)
#define expand_item_10(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_9(macro, context, ITEMS)
#define expand_item_11(macro, context, key, value, ITEMS...) \
expand_item_9(macro, context, __VA_ARGS__)
#define expand_item_11(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_10(macro, context, ITEMS)
#define expand_item_12(macro, context, key, value, ITEMS...) \
expand_item_10(macro, context, __VA_ARGS__)
#define expand_item_12(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_11(macro, context, ITEMS)
#define expand_item_13(macro, context, key, value, ITEMS...) \
expand_item_11(macro, context, __VA_ARGS__)
#define expand_item_13(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_12(macro, context, ITEMS)
#define expand_item_14(macro, context, key, value, ITEMS...) \
expand_item_12(macro, context, __VA_ARGS__)
#define expand_item_14(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_13(macro, context, ITEMS)
#define expand_item_15(macro, context, key, value, ITEMS...) \
expand_item_13(macro, context, __VA_ARGS__)
#define expand_item_15(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_14(macro, context, ITEMS)
#define expand_item_16(macro, context, key, value, ITEMS...) \
expand_item_14(macro, context, __VA_ARGS__)
#define expand_item_16(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_15(macro, context, ITEMS)
#define expand_item_17(macro, context, key, value, ITEMS...) \
expand_item_15(macro, context, __VA_ARGS__)
#define expand_item_17(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_16(macro, context, ITEMS)
#define expand_item_18(macro, context, key, value, ITEMS...) \
expand_item_16(macro, context, __VA_ARGS__)
#define expand_item_18(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_17(macro, context, ITEMS)
#define expand_item_19(macro, context, key, value, ITEMS...) \
expand_item_17(macro, context, __VA_ARGS__)
#define expand_item_19(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_18(macro, context, ITEMS)
#define expand_item_20(macro, context, key, value, ITEMS...) \
expand_item_18(macro, context, __VA_ARGS__)
#define expand_item_20(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_19(macro, context, ITEMS)
#define expand_item_21(macro, context, key, value, ITEMS...) \
expand_item_19(macro, context, __VA_ARGS__)
#define expand_item_21(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_20(macro, context, ITEMS)
#define expand_item_22(macro, context, key, value, ITEMS...) \
expand_item_20(macro, context, __VA_ARGS__)
#define expand_item_22(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_21(macro, context, ITEMS)
#define expand_item_23(macro, context, key, value, ITEMS...) \
expand_item_21(macro, context, __VA_ARGS__)
#define expand_item_23(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_22(macro, context, ITEMS)
#define expand_item_24(macro, context, key, value, ITEMS...) \
expand_item_22(macro, context, __VA_ARGS__)
#define expand_item_24(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_23(macro, context, ITEMS)
#define expand_item_24(macro, context, key, value, ITEMS...) \
expand_item_23(macro, context, __VA_ARGS__)
#define expand_item_25(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_23(macro, context, ITEMS)
#define expand_item_25(macro, context, key, value, ITEMS...) \
expand_item_24(macro, context, __VA_ARGS__)
#define expand_item_26(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_24(macro, context, ITEMS)
#define expand_item_26(macro, context, key, value, ITEMS...) \
expand_item_25(macro, context, __VA_ARGS__)
#define expand_item_27(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_25(macro, context, ITEMS)
#define expand_item_27(macro, context, key, value, ITEMS...) \
expand_item_26(macro, context, __VA_ARGS__)
#define expand_item_28(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_26(macro, context, ITEMS)
#define expand_item_28(macro, context, key, value, ITEMS...) \
expand_item_27(macro, context, __VA_ARGS__)
#define expand_item_29(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_27(macro, context, ITEMS)
#define expand_item_29(macro, context, key, value, ITEMS...) \
expand_item_28(macro, context, __VA_ARGS__)
#define expand_item_30(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_28(macro, context, ITEMS)
#define expand_item_30(macro, context, key, value, ITEMS...) \
expand_item_29(macro, context, __VA_ARGS__)
#define expand_item_31(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_29(macro, context, ITEMS)
#define expand_item_31(macro, context, key, value, ITEMS...) \
expand_item_30(macro, context, __VA_ARGS__)
#define expand_item_32(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_30(macro, context, ITEMS)
#define expand_item_32(macro, context, key, value, ITEMS...) \
expand_item_31(macro, context, __VA_ARGS__)
#define expand_item_33(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_31(macro, context, ITEMS)
#define expand_item_33(macro, context, key, value, ITEMS...) \
expand_item_32(macro, context, __VA_ARGS__)
#define expand_item_34(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_32(macro, context, ITEMS)
#define expand_item_34(macro, context, key, value, ITEMS...) \
expand_item_33(macro, context, __VA_ARGS__)
#define expand_item_35(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_33(macro, context, ITEMS)
#define expand_item_35(macro, context, key, value, ITEMS...) \
expand_item_34(macro, context, __VA_ARGS__)
#define expand_item_36(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_34(macro, context, ITEMS)
#define expand_item_36(macro, context, key, value, ITEMS...) \
expand_item_35(macro, context, __VA_ARGS__)
#define expand_item_37(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_35(macro, context, ITEMS)
#define expand_item_37(macro, context, key, value, ITEMS...) \
expand_item_36(macro, context, __VA_ARGS__)
#define expand_item_38(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_36(macro, context, ITEMS)
#define expand_item_38(macro, context, key, value, ITEMS...) \
expand_item_37(macro, context, __VA_ARGS__)
#define expand_item_39(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_37(macro, context, ITEMS)
#define expand_item_39(macro, context, key, value, ITEMS...) \
expand_item_38(macro, context, __VA_ARGS__)
#define expand_item_40(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_38(macro, context, ITEMS)
#define expand_item_40(macro, context, key, value, ITEMS...) \
expand_item_39(macro, context, __VA_ARGS__)
#define expand_item_41(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_39(macro, context, ITEMS)
#define expand_item_41(macro, context, key, value, ITEMS...) \
expand_item_40(macro, context, __VA_ARGS__)
#define expand_item_42(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_40(macro, context, ITEMS)
#define expand_item_42(macro, context, key, value, ITEMS...) \
expand_item_41(macro, context, __VA_ARGS__)
#define expand_item_43(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_41(macro, context, ITEMS)
#define expand_item_43(macro, context, key, value, ITEMS...) \
expand_item_42(macro, context, __VA_ARGS__)
#define expand_item_44(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_42(macro, context, ITEMS)
#define expand_item_44(macro, context, key, value, ITEMS...) \
expand_item_43(macro, context, __VA_ARGS__)
#define expand_item_45(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_43(macro, context, ITEMS)
#define expand_item_45(macro, context, key, value, ITEMS...) \
expand_item_44(macro, context, __VA_ARGS__)
#define expand_item_46(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_44(macro, context, ITEMS)
#define expand_item_46(macro, context, key, value, ITEMS...) \
expand_item_45(macro, context, __VA_ARGS__)
#define expand_item_47(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_45(macro, context, ITEMS)
#define expand_item_47(macro, context, key, value, ITEMS...) \
expand_item_46(macro, context, __VA_ARGS__)
#define expand_item_48(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_46(macro, context, ITEMS)
#define expand_item_48(macro, context, key, value, ITEMS...) \
expand_item_47(macro, context, __VA_ARGS__)
#define expand_item_49(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_47(macro, context, ITEMS)
#define expand_item_49(macro, context, key, value, ITEMS...) \
expand_item_48(macro, context, __VA_ARGS__)
#define expand_item_50(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_48(macro, context, ITEMS)
#define expand_item_50(macro, context, key, value, ITEMS...) \
expand_item_49(macro, context, __VA_ARGS__)
#define expand_item_51(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_49(macro, context, ITEMS)
#define expand_item_51(macro, context, key, value, ITEMS...) \
expand_item_50(macro, context, __VA_ARGS__)
#define expand_item_52(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_50(macro, context, ITEMS)
#define expand_item_52(macro, context, key, value, ITEMS...) \
expand_item_51(macro, context, __VA_ARGS__)
#define expand_item_53(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_51(macro, context, ITEMS)
#define expand_item_53(macro, context, key, value, ITEMS...) \
expand_item_52(macro, context, __VA_ARGS__)
#define expand_item_54(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_52(macro, context, ITEMS)
#define expand_item_54(macro, context, key, value, ITEMS...) \
expand_item_53(macro, context, __VA_ARGS__)
#define expand_item_55(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_53(macro, context, ITEMS)
#define expand_item_55(macro, context, key, value, ITEMS...) \
expand_item_54(macro, context, __VA_ARGS__)
#define expand_item_56(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_54(macro, context, ITEMS)
#define expand_item_56(macro, context, key, value, ITEMS...) \
expand_item_55(macro, context, __VA_ARGS__)
#define expand_item_57(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_55(macro, context, ITEMS)
#define expand_item_57(macro, context, key, value, ITEMS...) \
expand_item_56(macro, context, __VA_ARGS__)
#define expand_item_58(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_56(macro, context, ITEMS)
#define expand_item_58(macro, context, key, value, ITEMS...) \
expand_item_57(macro, context, __VA_ARGS__)
#define expand_item_59(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_57(macro, context, ITEMS)
#define expand_item_59(macro, context, key, value, ITEMS...) \
expand_item_58(macro, context, __VA_ARGS__)
#define expand_item_60(macro, context, key, value, ...) \
macro(context, key, value) \
expand_item_58(macro, context, ITEMS)
#define expand_item_60(macro, context, key, value, ITEMS...) \
macro(context, key, value) \
expand_item_59(macro, context, ITEMS)
expand_item_59(macro, context, __VA_ARGS__)
#define expand_items(macro, context, ...) \
arg_concat(expand_item_, arg_half_size(__VA_ARGS__))(macro, context, __VA_ARGS__)
//------------------------ macro_utils end -------------------------
@ -790,18 +791,6 @@ static PyObject* PyEnum_new(struct _typeobject *type, PyObject *args, PyObject *
Py_hash_t PyEnum_hash(PyObject* x) {
return static_cast<Py_hash_t>(((PyMNNEnum*)x)->value);
}
PyObject *PyEnum_richcompare(PyObject *self, PyObject *other, int op) {
int l = ((PyMNNEnum*)self)->value, r = ((PyMNNEnum*)other)->value;
switch (op) {
case Py_LT: return toPyObj(l < r);
case Py_LE: return toPyObj(l <= r);
case Py_EQ: return toPyObj(l == r);
case Py_NE: return toPyObj(l != r);
case Py_GT: return toPyObj(l > r);
case Py_GE: return toPyObj(l >= r);
}
Py_RETURN_NONE;
}
static PyObject* toPyEnum(PyObject* type, int val) {
auto args = PyTuple_New(1);
PyTuple_SetItem((PyObject*)args, 0, PyLong_FromLong((long)val));
@ -825,11 +814,11 @@ static T toEnum(PyObject* e) {
PyObject_SetAttrString(scope, value, toPyObj(key)); \
PyDict_SetItemString(dict, value, toPyObj(key));
#define def_enum_repr(NAME, ITEMS...) \
#define def_enum_repr(NAME, ...) \
static PyObject* PyEnum_##NAME##_repr(PyObject *self) { \
std::string str = #NAME "."; \
std::map<int, const char*> items = { \
expand_items(declare_map_item, _, ITEMS) \
expand_items(declare_map_item, _, __VA_ARGS__) \
}; \
int key = ((PyMNNEnum*)self)->value; \
auto iter = items.find(key); \
@ -839,22 +828,23 @@ static PyObject* PyEnum_##NAME##_repr(PyObject *self) { \
#define def_enum_to(NAME, TYPE) \
static PyObject* toPyObj(TYPE value) { \
return toPyEnum((PyObject*)&PyEnum_##NAME, static_cast<int>(value)); \
return toPyEnum((PyObject*)PyType_FindTLSType(&PyEnum_##NAME), static_cast<int>(value)); \
}
#define def_enum_register(NAME, ITEMS...) \
#define def_enum_register(NAME, ...) \
static void def_##NAME(PyObject *scope) { \
if (PyType_Ready(&PyEnum_##NAME) < 0) { \
if (PyType_Ready(PyType_FindTLSType(&PyEnum_##NAME)) < 0) { \
PyErr_SetString(PyExc_Exception, "init " #NAME ": PyType_Ready failed"); \
} \
PyObject* self = (PyObject *)&PyEnum_##NAME; \
PyObject* self = (PyObject *)PyType_FindTLSType(&PyEnum_##NAME); \
PyObject* dict = PyEnum_##NAME.tp_dict; \
PyModule_AddObject(scope, #NAME, self); \
expand_items(register_item, NAME, ITEMS) \
expand_items(register_item, NAME, __VA_ARGS__) \
}
#define def_enum(NAME, TYPE, ITEMS...) \
def_enum_repr(NAME, ITEMS) \
#define def_enum(NAME, TYPE, ...) \
def_enum_repr(NAME, __VA_ARGS__) \
PyObject *PyEnum_##NAME##richcompare(PyObject *self, PyObject *other, int op); \
static PyTypeObject PyEnum_##NAME = { \
PyVarObject_HEAD_INIT(NULL, 0) \
#NAME, /*tp_name*/\
@ -879,7 +869,7 @@ static PyTypeObject PyEnum_##NAME = { \
"PyMNNEnum", /*tp_doc*/\
0, /*tp_traverse*/\
0, /*tp_clear*/\
&PyEnum_richcompare, /*tp_richcompare*/\
&PyEnum_##NAME##richcompare, /*tp_richcompare*/\
0, /*tp_weaklistoffset*/\
0, /*tp_iter*/\
0, /*tp_iternext*/\
@ -895,9 +885,22 @@ static PyTypeObject PyEnum_##NAME = { \
0, /*tp_alloc*/\
PyEnum_new /*tp_new*/\
};\
static inline bool is##NAME(PyObject* obj) { return PyObject_IsInstance(obj, (PyObject*)&PyEnum_##NAME); } \
static inline bool is##NAME(PyObject* obj) { return Py_TYPE(obj) == PyType_FindTLSType(&PyEnum_##NAME); } \
PyObject *PyEnum_##NAME##richcompare(PyObject *self, PyObject *other, int op) { \
if (!is##NAME(other)) Py_RETURN_FALSE; \
int l = ((PyMNNEnum*)self)->value, r = ((PyMNNEnum*)other)->value; \
switch (op) { \
case Py_LT: return toPyObj(l < r); \
case Py_LE: return toPyObj(l <= r); \
case Py_EQ: return toPyObj(l == r); \
case Py_NE: return toPyObj(l != r); \
case Py_GT: return toPyObj(l > r); \
case Py_GE: return toPyObj(l >= r); \
} \
Py_RETURN_FALSE; \
} \
def_enum_to(NAME, TYPE) \
def_enum_register(NAME, ITEMS)
def_enum_register(NAME, __VA_ARGS__)
// ------------------------ enum end --------------------------
// ------------------------ func start ------------------------
#define def_methods(MODULE, NAME) \
@ -996,10 +999,10 @@ static PyObject* PyMNN##SCOPE##_##NAME(PyObject *self, PyObject *args) { \
#define def_class_register(NAME) \
static void def_##NAME(PyObject *scope) { \
if (PyType_Ready(&PyMNN##NAME##Type) < 0) { \
if (PyType_Ready(PyType_FindTLSType(&PyMNN##NAME##Type)) < 0) { \
PyErr_SetString(PyExc_Exception, "init" #NAME ": PyType_Ready PyMNN" #NAME "Type failed"); \
} \
PyObject* self = (PyObject *)&PyMNN##NAME##Type; \
PyObject* self = (PyObject *)PyType_FindTLSType(&PyMNN##NAME##Type); \
PyModule_AddObject(scope, #NAME, self); \
}
@ -1071,7 +1074,7 @@ static PyTypeObject PyMNN##NAME##Type = { \
};\
def_class_register(NAME) \
static PyMNN##NAME* get##NAME() { \
return (PyMNN##NAME *)PyObject_Call((PyObject*)&PyMNN##NAME##Type, PyTuple_New(0), NULL); \
return (PyMNN##NAME *)PyObject_Call((PyObject*)PyType_FindTLSType(&PyMNN##NAME##Type), PyTuple_New(0), NULL); \
} \
static PyObject* toPyObj(TYPE* x) { \
auto ret = get##NAME(); \

View File

@ -1,3 +1,4 @@
# -*- coding: UTF-8 -*-
import os
import sys
import MNN
@ -10,7 +11,11 @@ def parseConfig(root_dir):
configName = os.path.join(root_dir, 'config.txt')
if not os.path.exists(configName):
return False
config = open(configName, 'rt')
try:
config = open(configName, 'rt', encoding='utf-8')
except:
import io
config = io.open(configName, 'rt', encoding='utf-8')
res = {}
res['model_name'] = os.path.join(root_dir, 'temp.bin')
for line in config.readlines():

View File

@ -465,6 +465,14 @@ class UnitTest(unittest.TestCase):
self.assertEqualVar(expr.range(start, limit, delta), np.arange(0.0, 2.0, 0.3))
def test_depth_to_space(self):
self.assertEqualVar(expr.depth_to_space(self.x, 2), torch.pixel_shuffle(self._x, 2))
def test_sort(self):
x = mp.array([5, -1, 2, 0])
x_ = np.array([5, -1, 2, 0])
self.assertEqualVar(expr.sort(x), np.sort(x_))
def test_raster(self):
x = mp.array([[1, 2], [3, 4]])
x_ = np.array([[1, 2], [3, 4]])
self.assertEqualVar(expr.raster([x], [0, 1, 1, 2, 0, 1, 2, 1, 1, 2, 2], [2, 2]), x_.transpose())
def test_detection_post_process(self):
pass
# test cv
@ -643,6 +651,40 @@ class UnitTest(unittest.TestCase):
x = cv.threshold(self.imgf, 50, 20, cv.THRESH_BINARY)
y = cv2.threshold(self.imgf_, 50, 20, cv2.THRESH_BINARY)[1]
self.assertEqualImg(x, y)
# draw
def test_Draw(self):
x = self.img.copy()
y = self.img_.copy()
# 1. arrowedLine
cv.arrowedLine(x, [10, 10], [40, 40], [255, 0, 0])
cv2.arrowedLine(y, [10, 10], [40, 40], [255, 0, 0])
# 2. line
cv.line(x, [20, 30], [50, 60], [0, 0, 255])
cv2.line(y, [20, 30], [50, 60], [0, 0, 255])
# 3. circle
cv.circle(x, [70, 70], 30, [0, 255, 0])
cv2.circle(y, [70, 70], 30, [0, 255, 0])
# 4. rectangle
cv.rectangle(x, [80, 80], [120, 120], [0, 0, 255])
cv2.rectangle(y, [80, 80], [120, 120], [0, 0, 255])
# get contours
y_ = cv2.cvtColor(y, cv2.COLOR_BGR2GRAY)
y_ = cv2.threshold(y_, 127, 255, cv2.THRESH_BINARY)[1]
c_, _ = cv2.findContours(y_, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
c = []
for a in c_:
ps = []
for b in a:
ps.append(int(b[0,0]))
ps.append(int(b[0,1]))
c.append(ps)
# 5. fillPoly
cv.fillPoly(x, c, [255, 0, 0])
cv2.fillPoly(y, c_, [255, 0, 0])
# 6. drawContours
cv.drawContours(x, c, -1, [0, 0, 255])
cv2.drawContours(y, c_, -1, [0, 0, 255])
self.assertEqualImg(x, y)
# structural
def test_Structural(self):
x = mp.array([[0,0,0,0,0,0,0,0,0,0,0,0,0],
@ -661,17 +703,20 @@ class UnitTest(unittest.TestCase):
contours_, _ = cv2.findContours(x_, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
contour = contours[0]
contour_ = contours_[0]
self.assertEqualPoints(contour, contour_)
self.assertEqualVar(contour, contour_)
self.assertEqual(cv.contourArea(contour), cv2.contourArea(contour_))
hull = cv.convexHull(contour)
hull_ = cv2.convexHull(contour_)
self.assertEqualPoints(hull, hull_)
if version_info.major < 3: hull_ = np.concatenate([hull_[-1::, :], hull_[:-1,:]])
self.assertEqualVar(hull, hull_)
rect = cv.minAreaRect(contour)
rect_ = cv2.minAreaRect(contour_)
self.assertEqual(rect, rect_)
points = cv.boxPoints(rect),
if version_info.major >= 3:
self.assertEqual(rect, rect_)
points = cv.boxPoints(rect)
points_ = cv2.boxPoints(rect_)
self.assertEqualPoints(points, points_)
if version_info.major >= 3:
self.assertEqualVar(points, points_)
self.assertEqual(tuple(cv.boundingRect(contour)), cv2.boundingRect(contour_))
ret, labels, statsv, centroids = cv.connectedComponentsWithStats(x)
ret_, labels_, statsv_, centroids_ = cv2.connectedComponentsWithStats(x_)
@ -689,6 +734,16 @@ class UnitTest(unittest.TestCase):
x = cv.hconcat([self.img, self.img])
y = cv2.hconcat([self.img_, self.img_])
self.assertEqualImg(x, y)
def test_rotate(self):
x = cv.rotate(self.img, cv.ROTATE_90_CLOCKWISE)
y = cv2.rotate(self.img_, cv2.ROTATE_90_CLOCKWISE)
self.assertEqualImg(x, y)
x = cv.rotate(self.img, cv.ROTATE_180)
y = cv2.rotate(self.img_, cv2.ROTATE_180)
self.assertEqualImg(x, y)
x = cv.rotate(self.img, cv.ROTATE_90_COUNTERCLOCKWISE)
y = cv2.rotate(self.img_, cv2.ROTATE_90_COUNTERCLOCKWISE)
self.assertEqualImg(x, y)
# numpy
def test_from_shape_or_value(self):
x = mp.zeros([2, 2])
@ -724,6 +779,9 @@ class UnitTest(unittest.TestCase):
self.assertEqualVar(mp.linspace(2.0, 3.0, num=5, endpoint=False), np.linspace(2.0, 3.0, num=5, endpoint=False))
self.assertEqualVar(mp.logspace(2.0, 3.0, num=4, endpoint=False), np.logspace(2.0, 3.0, num=4, endpoint=False))
self.assertEqualVar(mp.geomspace(1, 1000, num=4, endpoint=False), np.geomspace(1, 1000, num=4, endpoint=False))
x = mp.arange(-5, 5., 0.1)
y = np.arange(-5, 5., 0.1)
self.assertEqualVars(mp.meshgrid(x, x), np.meshgrid(y, y))
def test_changing_array_shape(self):
x = mp.zeros((3, 2))
x_ = np.zeros((3, 2))
@ -916,6 +974,11 @@ class UnitTest(unittest.TestCase):
self.assertEqualShape(mp.random.randn(2,3).shape, np.random.randn(2,3).shape)
self.assertEqualShape(mp.random.rand(3,2).shape, np.random.rand(3,2).shape)
self.assertEqualShape(mp.random.randint(0, 2, [2,3]).shape, np.random.randint(0, 2, [2,3]).shape)
def test_sorting(self):
x = mp.array([[1,0,3], [0,6,5]])
x_ = np.array([[1,0,3], [0,6,5]])
self.assertEqualVar(mp.sort(x), np.sort(x_))
self.assertEqualVar(mp.argsort(x), np.argsort(x_))
def test_searching_counting(self):
x = mp.array([[1,0,3], [0,6,5]])
x_ = np.array([[1,0,3], [0,6,5]])
@ -980,10 +1043,12 @@ class UnitTest(unittest.TestCase):
self.assertAlmostEqual(x.var(), x_.var())
self.assertEqualVar(x.var(0), x_.var(0))
self.assertEqual(len(x), len(x_))
self.assertEqual(x[0,1].read_as_tuple()[0], x_[0,1])
self.assertEqual(x[0,1], x_[0,1])
self.assertEqualVar(x[0], x_[0])
self.assertEqualVar(x[:], x_[:])
self.assertEqualVar(x[:1], x_[:1])
self.assertEqualVar(x[::-1], x_[::-1])
self.assertEqualVar(x[x > 2], x_[x_ > 2])
self.assertEqualVar(x[mp.array([1])], x_[np.array([1])])
if __name__ == '__main__':
unittest.main()

View File

@ -376,13 +376,15 @@ struct ImageProcessParamT : public flatbuffers::NativeTable {
int8_t paddingValue;
std::vector<int32_t> shape;
DataType outputType;
bool draw;
ImageProcessParamT()
: filterType(FilterType_NEAREST),
sourceFormat(ImageFormatType_RGBA),
destFormat(ImageFormatType_RGBA),
wrap(WrapType_CLAMP_TO_EDGE),
paddingValue(0),
outputType(DataType_DT_INVALID) {
outputType(DataType_DT_INVALID),
draw(false) {
}
};
@ -421,6 +423,9 @@ struct ImageProcessParam FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
DataType outputType() const {
return static_cast<DataType>(GetField<int32_t>(22, 0));
}
bool draw() const {
return GetField<uint8_t>(24, 0) != 0;
}
bool Verify(flatbuffers::Verifier &verifier) const {
return VerifyTableStart(verifier) &&
VerifyField<int8_t>(verifier, 4) &&
@ -437,6 +442,7 @@ struct ImageProcessParam FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
VerifyOffset(verifier, 20) &&
verifier.VerifyVector(shape()) &&
VerifyField<int32_t>(verifier, 22) &&
VerifyField<uint8_t>(verifier, 24) &&
verifier.EndTable();
}
ImageProcessParamT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@ -477,6 +483,9 @@ struct ImageProcessParamBuilder {
void add_outputType(DataType outputType) {
fbb_.AddElement<int32_t>(22, static_cast<int32_t>(outputType), 0);
}
void add_draw(bool draw) {
fbb_.AddElement<uint8_t>(24, static_cast<uint8_t>(draw), 0);
}
explicit ImageProcessParamBuilder(flatbuffers::FlatBufferBuilder &_fbb)
: fbb_(_fbb) {
start_ = fbb_.StartTable();
@ -500,7 +509,8 @@ inline flatbuffers::Offset<ImageProcessParam> CreateImageProcessParam(
flatbuffers::Offset<flatbuffers::Vector<float>> transform = 0,
int8_t paddingValue = 0,
flatbuffers::Offset<flatbuffers::Vector<int32_t>> shape = 0,
DataType outputType = DataType_DT_INVALID) {
DataType outputType = DataType_DT_INVALID,
bool draw = false) {
ImageProcessParamBuilder builder_(_fbb);
builder_.add_outputType(outputType);
builder_.add_shape(shape);
@ -509,6 +519,7 @@ inline flatbuffers::Offset<ImageProcessParam> CreateImageProcessParam(
builder_.add_mean(mean);
builder_.add_destFormat(destFormat);
builder_.add_sourceFormat(sourceFormat);
builder_.add_draw(draw);
builder_.add_paddingValue(paddingValue);
builder_.add_wrap(wrap);
builder_.add_filterType(filterType);
@ -597,6 +608,7 @@ inline void ImageProcessParam::UnPackTo(ImageProcessParamT *_o, const flatbuffer
{ auto _e = paddingValue(); _o->paddingValue = _e; };
{ auto _e = shape(); if (_e) { _o->shape.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->shape[_i] = _e->Get(_i); } } };
{ auto _e = outputType(); _o->outputType = _e; };
{ auto _e = draw(); _o->draw = _e; };
}
inline flatbuffers::Offset<ImageProcessParam> ImageProcessParam::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ImageProcessParamT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@ -617,6 +629,7 @@ inline flatbuffers::Offset<ImageProcessParam> CreateImageProcessParam(flatbuffer
auto _paddingValue = _o->paddingValue;
auto _shape = _o->shape.size() ? _fbb.CreateVector(_o->shape) : 0;
auto _outputType = _o->outputType;
auto _draw = _o->draw;
return MNN::CreateImageProcessParam(
_fbb,
_filterType,
@ -628,7 +641,8 @@ inline flatbuffers::Offset<ImageProcessParam> CreateImageProcessParam(flatbuffer
_transform,
_paddingValue,
_shape,
_outputType);
_outputType,
_draw);
}
inline const flatbuffers::TypeTable *SampleModeTypeTable() {
@ -803,7 +817,8 @@ inline const flatbuffers::TypeTable *ImageProcessParamTypeTable() {
{ flatbuffers::ET_FLOAT, 1, -1 },
{ flatbuffers::ET_CHAR, 0, -1 },
{ flatbuffers::ET_INT, 1, -1 },
{ flatbuffers::ET_INT, 0, 3 }
{ flatbuffers::ET_INT, 0, 3 },
{ flatbuffers::ET_BOOL, 0, -1 }
};
static const flatbuffers::TypeFunction type_refs[] = {
FilterTypeTypeTable,
@ -821,10 +836,11 @@ inline const flatbuffers::TypeTable *ImageProcessParamTypeTable() {
"transform",
"paddingValue",
"shape",
"outputType"
"outputType",
"draw"
};
static const flatbuffers::TypeTable tt = {
flatbuffers::ST_TABLE, 10, type_codes, type_refs, nullptr, names
flatbuffers::ST_TABLE, 11, type_codes, type_refs, nullptr, names
};
return &tt;
}

View File

@ -62,4 +62,5 @@ table ImageProcessParam {
paddingValue:byte = 0;
shape:[int]; // shape: [N, C, H, W]
outputType:DataType;
draw:bool = false;
}

View File

@ -170,7 +170,7 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p
mPrecisionMode = precision;
mCoreFunctions = MNNGetCoreFunctions();
mInt8CoreFunctions = MNNGetInt8CoreFunctions();
mCache = new CPUResizeCache(this);
mCache = new CPUResizeCache;
}
CPUBackend::~CPUBackend() {

View File

@ -87,6 +87,19 @@ BLITTER CPUImageProcess::choose(ImageFormatType source, ImageFormatType dest) {
return nullptr;
}
BLITTER CPUImageProcess::choose(int channelByteSize) {
switch (channelByteSize) {
case 4:
return MNNC4blitH;
case 3:
return MNNC3blitH;
case 1:
return MNNC1blitH;
default:
return nullptr;
}
}
SAMPLER CPUImageProcess::choose(ImageFormatType format, FilterType type, bool identity) {
if (identity) {
switch (format) {
@ -271,10 +284,21 @@ static std::pair<int, int> _computeClip(CV::Point* points, int iw, int ih, const
}
ErrorCode CPUImageProcess::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto input = inputs[0], output = outputs[0];
ih = input->height();
iw = input->width();
ic = input->channel();
auto input = inputs[0];
if (input->dimensions() == 3) {
ih = input->length(0);
iw = input->length(1);
ic = input->length(2);
} else {
ih = input->height();
iw = input->width();
ic = input->channel();
}
if (draw) {
blitter = choose(ic * inputs[0]->getType().bytes());
return NO_ERROR;
}
auto output = outputs[0];
oh = output->height();
ow = output->width();
oc = output->channel();
@ -321,15 +345,37 @@ ErrorCode CPUImageProcess::onResize(const std::vector<Tensor *> &inputs, const s
ErrorCode CPUImageProcess::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto source = inputs[0]->host<uint8_t>();
auto dest = outputs[0]->host<void>();
void* dest = nullptr;
CV::Point points[2];
int tileCount = UP_DIV(ow, CACHE_SIZE);
auto destBytes = dtype.bytes();
for (int dy = 0; dy < oh; ++dy) {
int tileCount = UP_DIV(ow, CACHE_SIZE);
const int* regions = nullptr;
if (draw) {
        // draw writes in place: the input image is also the destination
        dest = source;
        oh = inputs[1]->length(0);
        ow = iw;
        oc = ic;
        destBytes = inputs[0]->getType().bytes();
        // each region row is blitted in a single tile
        tileCount = 1;
        // the "source" of the blit is the fill color
        samplerDest = inputs[2]->host<uint8_t>();
        // pointer to the (y, xLeft, xRight) region triples
        regions = inputs[1]->host<int>();
} else {
dest = outputs[0]->host<void>();
}
for (int i = 0; i < oh; ++i) {
int dy = draw ? regions[3 * i] : i;
auto dstY = (uint8_t*)dest + dy * destBytes * ow * oc;
for (int tIndex = 0; tIndex < tileCount; ++tIndex) {
int xStart = tIndex * CACHE_SIZE;
int count = std::min(CACHE_SIZE, ow - xStart);
if (draw) {
xStart = regions[3 * i + 1];
count = regions[3 * i + 2] - xStart + 1;
}
auto dstStart = dstY + destBytes * oc * xStart;
if (!blitFloat) {
@ -340,7 +386,7 @@ ErrorCode CPUImageProcess::onExecute(const std::vector<Tensor *> &inputs, const
}
// Sample
{
if (!draw) {
// Compute position
points[0].fX = xStart;
points[0].fY = dy;

View File

@ -23,6 +23,10 @@ typedef void (*SAMPLER)(const unsigned char* source, unsigned char* dest, CV::Po
class CPUImageProcess : public Execution {
public:
CPUImageProcess(CV::ImageProcess::Config config, const CoreFunctions* coreFunctions) : Execution(nullptr), coreFunctions(coreFunctions) {
if (config.draw) {
draw = true;
return;
}
filterType = (FilterType)config.filterType;
wrap = (WrapType)config.wrap;
sourceFormat = (ImageFormatType)config.sourceFormat;
@ -40,6 +44,11 @@ public:
paddingValue = val;
}
CPUImageProcess(Backend *bn, const ImageProcessParam* process) : Execution(bn) {
coreFunctions = static_cast<CPUBackend*>(backend())->functions();
draw = process->draw();
if (draw) {
return;
}
filterType = process->filterType();
wrap = process->wrap();
sourceFormat = process->sourceFormat();
@ -53,12 +62,12 @@ public:
transform.set(i, process->transform()->Get(i));
}
transform.invert(&transformInvert);
coreFunctions = static_cast<CPUBackend*>(backend())->functions();
}
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
BLITTER choose(ImageFormatType source, ImageFormatType dest);
BLITTER choose(int channelByteSize);
BLIT_FLOAT choose(ImageFormatType format, int dstBpp = 0);
SAMPLER choose(ImageFormatType format, FilterType type, bool identity);
private:
@ -78,6 +87,7 @@ private:
std::unique_ptr<uint8_t[]> samplerBuffer, blitBuffer;
uint8_t* samplerDest = nullptr, *blitDest = nullptr;
const CoreFunctions* coreFunctions = nullptr;
bool draw = false;
};
}; // namespace MNN

View File

@ -117,6 +117,9 @@ ErrorCode CPUNonMaxSuppressionV2::onExecute(const std::vector<Tensor*>& inputs,
const auto scores = inputs[1]->host<float>();
NonMaxSuppressionSingleClasssImpl(inputs[0], scores, maxDetections, iouThreshold, scoreThreshold, &selected);
std::copy_n(selected.begin(), selected.size(), outputs[0]->host<int32_t>());
for (int i = selected.size(); i < outputs[0]->elementSize(); i++) {
outputs[0]->host<int32_t>()[i] = -1;
}
return NO_ERROR;
}
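With the padding above, unused slots of the NMS output are set to -1 instead of being left as-is, so a caller can stop at the first negative index (hedged sketch, reusing the boxes/scores placeholders from the earlier expr example):

kept = [i for i in expr.nms(boxes, scores, 10).read_as_tuple() if i >= 0]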

View File

@ -6,11 +6,11 @@
#include "MNN_generated.h"
namespace MNN {
class CPUBackend;
class CPUResizeCache {
// FIXME: Move outside
class MNN_PUBLIC CPUResizeCache {
public:
CPUResizeCache(const CPUBackend* backend) {
mBackend = backend;
CPUResizeCache() {
// Do nothing
}
~ CPUResizeCache() {
// Do nothing
@ -21,7 +21,6 @@ public:
void reset();
private:
std::map<std::pair<const Tensor*, MNN_DATA_FORMAT>, std::shared_ptr<Tensor>> mFormatCache;
const CPUBackend* mBackend;
};
}

View File

@ -45,7 +45,7 @@ void ScatterNdImpl(const Tensor* indices, const Tensor* updates, const Tensor* s
}
if (valid) {
for (int k = 0; k < accNumber; ++k) {
outputPtr[pos + k] += updatesPtr[i * accNumber + k];
outputPtr[pos + k] = updatesPtr[i * accNumber + k];
}
}
}
@ -59,7 +59,12 @@ ErrorCode CPUScatterNd::onExecute(const std::vector<Tensor*>& inputs, const std:
const int outputSize = output->size();
auto outputRawPtr = output->host<int8_t>();
memset(outputRawPtr, 0, outputSize);
if (inputs.size() < 4) {
memset(outputRawPtr, 0, outputSize);
} else {
auto inputRawPtr = inputs[3]->host<int8_t>();
memcpy(outputRawPtr, inputRawPtr, outputSize);
}
auto updatesDataType = updates->getType();
if (updatesDataType == halide_type_of<int32_t>()) {

View File

@ -1065,3 +1065,21 @@ void MNNSamplerNV12Nearest(const unsigned char* source, unsigned char* dest, MNN
auto countC2 = ((count + 1) / 2);
_swapUV(destUV, destUV, countC2);
}
// Draw helpers: repeat a single 3-/4-/1-byte pixel value `count` times along a row.
void MNNC3blitH(const unsigned char* source, unsigned char* dest, size_t count) {
for (int i = 0; i < count; i++) {
memcpy(dest + 3 * i, source, 3);
}
}
void MNNC4blitH(const unsigned char* source, unsigned char* dest, size_t count) {
for (int i = 0; i < count; i++) {
memcpy(dest + 4 * i, source, 4);
}
}
void MNNC1blitH(const unsigned char* source, unsigned char* dest, size_t count) {
for (int i = 0; i < count; i++) {
memcpy(dest + i, source, 1);
}
}

View File

@ -132,4 +132,8 @@ void MNNSamplerNV12Copy(const unsigned char* source, unsigned char* dest, MNN::C
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
void MNNSamplerNV12Nearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
// draw blit
void MNNC1blitH(const unsigned char* source, unsigned char* dest, size_t count);
void MNNC3blitH(const unsigned char* source, unsigned char* dest, size_t count);
void MNNC4blitH(const unsigned char* source, unsigned char* dest, size_t count);
#endif /* ImageProcessFunction_hpp */

View File

@ -1,29 +1,72 @@
# Process asm files on Windows, then substitute *.S.obj for *.S as source files of add_library
# If the MNN_ASSEMBLER env var is not set, *.S files are ignored, which may reduce performance
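# Usage sketch (hedged): point MNN_ASSEMBLER at a GNU-compatible assembler before configuring,
# e.g. set MNN_ASSEMBLER=C:/msys64/usr/bin/as.exe (hypothetical path); otherwise the AT&T-syntax
# *.S kernels are skipped under MSVC, which may reduce performance.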
set(EXTRA_OBJS "")
IF(MSVC AND (DEFINED ENV{MNN_ASSEMBLER}) AND "${CMAKE_SIZEOF_VOID_P}" STREQUAL "8")
set(WIN_USE_ASM ON)
ENDIF()
message(STATUS "WIN_USE_ASM: ${WIN_USE_ASM}")
function (process_asm TARGET_NAME FILE_SRCS)
if(NOT MSVC)
return()
endif()
set(FILE_DESTS "")
foreach(SRC ${${FILE_SRCS}})
get_filename_component(SRC_EXT ${SRC} EXT)
if(NOT ${SRC_EXT} STREQUAL ".S")
list(APPEND FILE_DESTS ${SRC})
continue()
elseif(NOT WIN_USE_ASM)
continue()
endif()
string(REPLACE ${CMAKE_CURRENT_SOURCE_DIR} "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${TARGET_NAME}.dir" DEST ${SRC})
add_custom_command(
OUTPUT ${DEST}.obj
# *.S -> *.S.i: preprocess (define/ifdef macros) with cl.exe
COMMAND "${CMAKE_C_COMPILER}" /DWIN32 /experimental:preprocessor /P /Fi"${DEST}.i" "${SRC}"
# *.S.i -> *.S.obj: use the GNU assembler, which supports AT&T syntax
COMMAND "$ENV{MNN_ASSEMBLER}" -o "${DEST}.obj" "${DEST}.i"
)
list(APPEND EXTRA_OBJS ${DEST}.obj)
endforeach()
set(${FILE_SRCS} ${FILE_DESTS} PARENT_SCOPE)
set(EXTRA_OBJS ${EXTRA_OBJS} PARENT_SCOPE)
endfunction()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)|(i686)")
message(STATUS "${CMAKE_SYSTEM_PROCESSOR}: Open SSE")
target_compile_options(MNNCPU PRIVATE -DMNN_USE_SSE)
option(MNN_AVX512_VNNI "Enable AVX512 VNNI" ON)
FILE(GLOB MNN_X8664_SRC ${CMAKE_CURRENT_LIST_DIR}/*)
if (MSVC)
FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*.cpp)
FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*.cpp)
else()
FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*)
FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*)
message(STATUS "MNN_AVX512:${MNN_AVX512}")
if (MNN_AVX512)
FILE(GLOB MNN_AVX512_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/*)
SET(MNNAVX512_VNNI_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/GemmInt8_VNNI.cpp)
LIST(REMOVE_ITEM MNN_AVX512_SRC ${MNNAVX512_VNNI_SRC})
add_library(MNNAVX512 OBJECT ${MNN_AVX512_SRC})
target_compile_options(MNNAVX512 PRIVATE -DMNN_USE_SSE -DMNN_X86_USE_ASM -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma)
if (MNN_AVX512_VNNI)
target_compile_options(MNNAVX512 PRIVATE -DMNN_AVX512_VNNI)
add_library(MNNAVX512_VNNI OBJECT ${MNNAVX512_VNNI_SRC})
target_compile_options(MNNAVX512_VNNI PRIVATE -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512vnni -DMNN_AVX512_VNNI)
FILE(GLOB MNN_AVX_SRC ${CMAKE_CURRENT_LIST_DIR}/avx/*)
FILE(GLOB MNN_AVXFMA_SRC ${CMAKE_CURRENT_LIST_DIR}/avxfma/*)
message(STATUS "MNN_AVX512:${MNN_AVX512}")
if (MNN_AVX512 AND ((NOT MSVC) OR WIN_USE_ASM))
FILE(GLOB MNN_AVX512_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/*)
SET(MNNAVX512_VNNI_SRC ${CMAKE_CURRENT_LIST_DIR}/avx512/GemmInt8_VNNI.cpp)
LIST(REMOVE_ITEM MNN_AVX512_SRC ${MNNAVX512_VNNI_SRC})
process_asm(MNNAVX512 MNN_AVX512_SRC)
add_library(MNNAVX512 OBJECT ${MNN_AVX512_SRC})
target_compile_options(MNNAVX512 PRIVATE -DMNN_USE_SSE -DMNN_X86_USE_ASM)
if (MSVC)
target_compile_options(MNNAVX512 PRIVATE /arch:AVX512)
else()
target_compile_options(MNNAVX512 PRIVATE -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma)
endif()
if (MNN_AVX512_VNNI)
target_compile_options(MNNAVX512 PRIVATE -DMNN_AVX512_VNNI)
add_library(MNNAVX512_VNNI OBJECT ${MNNAVX512_VNNI_SRC})
target_compile_options(MNNAVX512_VNNI PRIVATE -DMNN_AVX512_VNNI)
if (MSVC)
target_compile_options(MNNAVX512_VNNI PRIVATE /arch:AVX512)
else()
target_compile_options(MNNAVX512_VNNI PRIVATE -m64 -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -mavx512vnni)
endif()
endif()
endif()
FILE(GLOB MNN_SSE_SRC ${CMAKE_CURRENT_LIST_DIR}/sse/*)
process_asm(MNNAVX MNN_AVX_SRC)
process_asm(MNNAVXFMA MNN_AVXFMA_SRC)
process_asm(MNNSSE MNN_SSE_SRC)
add_library(MNNX8664 OBJECT ${MNN_X8664_SRC})
add_library(MNNAVX OBJECT ${MNN_AVX_SRC})
add_library(MNNAVXFMA OBJECT ${MNN_AVXFMA_SRC})
@ -34,7 +77,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)
target_compile_options(MNNAVXFMA PRIVATE -DMNN_USE_SSE)
if(MSVC)
target_compile_options(MNNAVX PRIVATE /arch:AVX)
target_compile_options(MNNAVXFMA PRIVATE /arch:AVX)
target_compile_options(MNNAVXFMA PRIVATE /arch:AVX2)
else()
target_compile_options(MNNSSE PRIVATE -msse4.1)
target_compile_options(MNNAVX PRIVATE -mavx2 -DMNN_X86_USE_ASM)
@ -47,7 +90,12 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)
endif()
endif()
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNX8664> $<TARGET_OBJECTS:MNNAVXFMA> $<TARGET_OBJECTS:MNNAVX> $<TARGET_OBJECTS:MNNSSE>)
if (MNN_AVX512)
if (MSVC AND WIN_USE_ASM)
target_compile_options(MNNAVX PRIVATE -DMNN_X86_USE_ASM)
target_compile_options(MNNAVXFMA PRIVATE -DMNN_X86_USE_ASM)
list(APPEND MNN_OBJECTS_TO_LINK ${EXTRA_OBJS})
endif()
if (MNN_AVX512 AND ((NOT MSVC) OR WIN_USE_ASM))
target_compile_options(MNNCPU PRIVATE -DMNN_AVX512)
target_compile_options(MNNX8664 PRIVATE -DMNN_AVX512)
if (MNN_AVX512_VNNI)

View File

@ -24,12 +24,13 @@ asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_UnitMain
// SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter
// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
movq 48(%rsp), %r10
#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + callq + shadow_space
movq (push_registers_bytes)(%rsp), %r10
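// Only the pushed %rbp and the return address separate %rsp from the caller's 32-byte shadow space: (1 + 1) * 8 + 32 = 48, so the fifth argument (post) is read from 48(%rsp).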
pushq %rdi
pushq %rsi
pushq %r12
@ -41,6 +42,17 @@ movq %r9, %rcx
movq %r10, %r9
pushq %r14
pushq %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
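// The Win64 ABI treats xmm6-xmm15 as callee-saved; they are spilled here (16 bytes each, spaced 128 bytes apart inside the 1280-byte area) and restored before returning.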
#else
pushq %r12
pushq %r13
@ -304,6 +316,17 @@ addq $64, %rsp
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13

View File

@ -24,12 +24,13 @@ asm_function _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_1
// SystemV Auto: rdi: dst, rsi:src, rdx:weight, rcx:strides, r8: post
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter
// Microsoft x64 Auto: rcx:dst, rdx:src, r8:weight, r9:strides
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
movq 48(%rsp), %r10
#define push_registers_bytes ((1 + 1) * 8 + 32) // pushq + callq + shadow_space
movq (push_registers_bytes)(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
@ -41,6 +42,17 @@ movq %r9, %rcx
movq %r10, %r9
pushq %r14
pushq %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
@ -190,6 +202,17 @@ addq $64, %rsp
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13

View File

@ -25,15 +25,39 @@ asm_function _AVX_MNNPackedSparseMatMulEpx1EFMA_ASM
// SystemV Auto: rdi: packedParas, rsi: bias, rdx: parameter, rcx: postParameters
// Microsoft x64 Auto: rcx:packedParas, rdx:bias, r8:parameter, r9:postParameters
// all callee save regs:
// %rbx, %rbp, %r12~%r15
// unused para regs: %r8, %r9
// can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %rax
pushq %rbx
pushq %r8
@ -42,7 +66,7 @@ pushq %r12
pushq %r13
pushq %r14
pushq %r15
#endif
movq (%rdi), %rax // %rax C
movq 8(%rdi), %rbx // %rbx A
@ -215,6 +239,27 @@ LoopE24H1:
jmp LoopE24H1
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rsi
popq %rdi
#else
popq %r15
popq %r14
popq %r13
@ -223,6 +268,8 @@ popq %r9
popq %r8
popq %rbx
popq %rax
#endif
popq %rbp
retq

View File

@ -30,10 +30,33 @@ asm_function _AVX_MNNPackedSparseMatMulEpx4EFMA_ASM
// %rbx, %rbp, %r12~%r15
// unused para regs: %r8, %r9
// can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %rax
pushq %rbx
pushq %r8
@ -42,6 +65,7 @@ pushq %r12
pushq %r13
pushq %r14
pushq %r15
#endif
movq (%rdi), %rax // %rax C
movq 8(%rdi), %rbx // %rbx A
@ -216,6 +240,26 @@ LoopE24H4:
jmp LoopE24H4
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rsi
popq %rdi
#else
popq %r15
popq %r14
popq %r13
@ -224,6 +268,8 @@ popq %r9
popq %r8
popq %rbx
popq %rax
#endif
popq %rbp
retq

View File

@ -26,23 +26,29 @@ constexpr int AVX512F32 = 16;
_mm_store_ps(dest + AVX512F32 * packCUnit * ablock + 4 * packCUnit * aSegment + packCUnit * 3, m128_3); \
}
#define STORE_VECTOR_AS_COLUMN(dest, ablock, packCUnit, vacc) \
dest[AVX512F32 * packCUnit * ablock + 0] = vacc[0]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit] = vacc[1]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 2] = vacc[2]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 3] = vacc[3]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 4] = vacc[4]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 5] = vacc[5]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 6] = vacc[6]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 7] = vacc[7]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 8] = vacc[8]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 9] = vacc[9]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 10] = vacc[10]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 11] = vacc[11]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 12] = vacc[12]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 13] = vacc[13]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 14] = vacc[14]; \
dest[AVX512F32 * packCUnit * ablock + packCUnit * 15] = vacc[15];
inline void STORE_VECTOR_AS_COLUMN(float* dest, size_t ablock, size_t packCUnit, __m512 vacc) {
union {
__m512 v;
float f[16];
} vacc_u;
vacc_u.v = vacc;
dest[AVX512F32 * packCUnit * ablock + 0] = vacc_u.f[0];
dest[AVX512F32 * packCUnit * ablock + packCUnit] = vacc_u.f[1];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 2] = vacc_u.f[2];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 3] = vacc_u.f[3];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 4] = vacc_u.f[4];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 5] = vacc_u.f[5];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 6] = vacc_u.f[6];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 7] = vacc_u.f[7];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 8] = vacc_u.f[8];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 9] = vacc_u.f[9];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 10] = vacc_u.f[10];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 11] = vacc_u.f[11];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 12] = vacc_u.f[12];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 13] = vacc_u.f[13];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 14] = vacc_u.f[14];
dest[AVX512F32 * packCUnit * ablock + packCUnit * 15] = vacc_u.f[15];
}
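// Presumably converted from a macro to an inline function using a union because MSVC does not support subscripting __m512 values (a GCC/Clang extension).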
#define TRANSPOSE4x8_STORE(dest, ablock, aSegment, packCUnit, v0, v3, v6, v9, v12, v15, v18, v21) { \
auto m0 = _mm512_extractf32x4_ps(v0, aSegment); \
@ -125,14 +131,20 @@ constexpr int AVX512F32 = 16;
_mm256_storeu_ps(dest + packCUnit * 7, t7); \
}
#define STORE_M256_VECTOR_AS_COLUMN(dest, packCUnit, vacc) \
dest[0] = vacc[0]; \
dest[packCUnit] = vacc[1]; \
dest[packCUnit * 2] = vacc[2]; \
dest[packCUnit * 3] = vacc[3]; \
dest[packCUnit * 4] = vacc[4]; \
dest[packCUnit * 5] = vacc[5]; \
dest[packCUnit * 6] = vacc[6]; \
dest[packCUnit * 7] = vacc[7];
inline void STORE_M256_VECTOR_AS_COLUMN(float* dest, size_t packCUnit, __m256 vacc) {
union {
__m256 v;
float f[8];
} vacc_u;
vacc_u.v = vacc;
dest[0] = vacc_u.f[0];
dest[packCUnit] = vacc_u.f[1];
dest[packCUnit * 2] = vacc_u.f[2];
dest[packCUnit * 3] = vacc_u.f[3];
dest[packCUnit * 4] = vacc_u.f[4];
dest[packCUnit * 5] = vacc_u.f[5];
dest[packCUnit * 6] = vacc_u.f[6];
dest[packCUnit * 7] = vacc_u.f[7];
}
#endif
#endif

View File

@ -228,9 +228,14 @@ void _AVX512_MNNPackedSparseMatMulEpx1(float* C, const float* A, const float* B,
vacc0 = _mm256_min_ps(vacc0, _mm512_extractf32x8_ps(vmax, 0));
vacc0 = _mm256_max_ps(vacc0, _mm512_extractf32x8_ps(vmin, 0));
union {
__m256 v;
float f[8];
} vacc0_u;
vacc0_u.v = vacc0;
// how to store faster: st4 / transpose
for (auto iStore = 0; iStore < (taileSize & 0x07); iStore++) {
c[packCUnit * iStore] = vacc0[iStore];
c[packCUnit * iStore] = vacc0_u.f[iStore];
}
}
// ie += taileSize;

View File

@ -647,10 +647,15 @@ void _AVX512_MNNPackedSparseMatMulEpx4(float* C, const float* A, const float* B,
vacc0 = _mm_min_ps(vacc0, _mm512_extractf32x4_ps(vmax, 0));
vacc0 = _mm_max_ps(vacc0, _mm512_extractf32x4_ps(vmin, 0));
c[0] = vacc0[0];
c[packCUnit] = vacc0[1];
c[packCUnit * 2] = vacc0[2];
c[+packCUnit * 3] = vacc0[3];
union {
__m128 v;
float f[4];
} vacc0_u;
vacc0_u.v = vacc0;
c[0] = vacc0_u.f[0];
c[packCUnit] = vacc0_u.f[1];
c[packCUnit * 2] = vacc0_u.f[2];
c[+packCUnit * 3] = vacc0_u.f[3];
}
ie += 4;
a += 4;
@ -735,8 +740,13 @@ void _AVX512_MNNPackedSparseMatMulEpx4(float* C, const float* A, const float* B,
vacc0 = _mm_min_ps(vacc0, _mm512_extractf32x4_ps(vmax, 0));
vacc0 = _mm_max_ps(vacc0, _mm512_extractf32x4_ps(vmin, 0));
c[0] = vacc0[0];
c[packCUnit] = vacc0[1];
union {
__m128 v;
float f[4];
} vacc0_u;
vacc0_u.v = vacc0;
c[0] = vacc0_u.f[0];
c[packCUnit] = vacc0_u.f[1];
}
ie += 2;
a += 2;

View File

@ -789,10 +789,15 @@ void _AVX512_MNNPackedSparseMatMulEpx8(float* C, const float* A, const float* B,
vacc0 = _mm_min_ps(vacc0, vmax);
vacc0 = _mm_max_ps(vacc0, vmin);
c[0] = vacc0[0];
c[packCUnit] = vacc0[1];
c[packCUnit * 2] = vacc0[2];
c[packCUnit * 3] = vacc0[3];
union {
__m128 v;
float f[4];
} vacc0_u;
vacc0_u.v = vacc0;
c[0] = vacc0_u.f[0];
c[packCUnit] = vacc0_u.f[1];
c[packCUnit * 2] = vacc0_u.f[2];
c[packCUnit * 3] = vacc0_u.f[3];
}
ie += 4;
a += 4;
@ -877,8 +882,13 @@ void _AVX512_MNNPackedSparseMatMulEpx8(float* C, const float* A, const float* B,
vacc0 = _mm_min_ps(vacc0, vmax);
vacc0 = _mm_max_ps(vacc0, vmin);
c[0] = vacc0[0];
c[packCUnit] = vacc0[1];
union {
__m128 v;
float f[4];
} vacc0_u;
vacc0_u.v = vacc0;
c[0] = vacc0_u.f[0];
c[packCUnit] = vacc0_u.f[1];
}
ie += 2;
a += 2;

View File

@ -11,16 +11,15 @@
.align 4
asm_function _AVX512_MNNGemmFloatUnit16x8
//void _AVX512_MNNGemmFloatUnit16x8(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4)
//void _AVX512_MNNGemmFloatUnit16x8(float* C, const float* A, const float* B, const size_t* parameter)
// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: hC4
// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter
pushq %rbp
movq %rsp, %rbp
pushq %rbx
#ifdef WIN32
movq 48(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
@ -30,12 +29,21 @@ movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
pushq %r14
movq %r8, %r9
#endif
movq 40(%rcx), %r10 // bExtraStride
@ -266,6 +274,17 @@ LoopDz:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r14
popq %r13
popq %r12

View File

@ -11,16 +11,15 @@
.align 4
asm_function _AVX512_MNNGemmFloatUnit32x8
//void _AVX512_MNNGemmFloatUnit32x8(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4)
//void _AVX512_MNNGemmFloatUnit32x8(float* C, const float* A, const float* B, const size_t* parameter)
// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: hC4
// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter
pushq %rbp
movq %rsp, %rbp
pushq %rbx
#ifdef WIN32
movq 48(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
@ -30,12 +29,21 @@ movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
pushq %r14
movq %r8, %r9
#endif
movq 40(%rcx), %r10 // bExtraStride
@ -301,6 +309,17 @@ LoopDz:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r14
popq %r13
popq %r12

View File

@ -11,16 +11,15 @@
.align 4
asm_function _AVX512_MNNGemmFloatUnit48x8
//void _AVX512_MNNGemmFloatUnit48x8(float* C, const float* A, const float* B, const size_t* parameter, size_t hC4)
//void _AVX512_MNNGemmFloatUnit48x8(float* C, const float* A, const float* B, const size_t* parameter)
// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: hC4
// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter
pushq %rbp
movq %rsp, %rbp
pushq %rbx
#ifdef WIN32
movq 48(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
@ -29,11 +28,20 @@ movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
movq %r8, %r9
#endif
movq 40(%rcx), %r10 // bExtraStride
@ -336,10 +344,22 @@ LoopDz:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r13
popq %r12
popq %rsi
popq %rdi
popq %rbx
popq %rbp
#else
popq %r13

View File

@ -14,9 +14,22 @@ asm_function _AVX512_MNNGemmFloatUnit48x8Fused
//void _AVX512_MNNGemmFloatUnit48x8Fused(float* C, const float* A, const float* B, const size_t* parameter, const float* p, const float* bias)
// SystemV Auto: rdi: C, rsi:A, rdx:B, rcx:parameter, r8: postParameters, r9:bias
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:parameter
// stack: postParameters, bias
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
#define push_registers_bytes ((3 + 1) * 8 + 32) // pushq + callq + shadow_space
movq (push_registers_bytes)(%rsp), %r8 // postParameters
movq (push_registers_bytes + 8)(%rsp), %r9 // bias
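// Three pushes (%rbp, %rdi, %rsi) plus the return address and the 32-byte shadow space give (3 + 1) * 8 + 32 = 64, so postParameters sits at 64(%rsp) and bias at 72(%rsp).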
pushq %rbx
pushq %r12
pushq %r13
@ -24,6 +37,26 @@ pushq %r14
pushq %r15
movq %r8, %r14
movq %r9, %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
movq %r8, %r14
movq %r9, %r15
#endif
movq 40(%rcx), %r10 // bExtraStride
movq 24(%rcx), %r8 // cStride
@ -402,12 +435,33 @@ LoopDz:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rbp
popq %rsi
popq %rdi
#else
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
#endif
popq %rbp
retq

View File

@ -12,8 +12,6 @@
#define AVX512F32 16
#define push_registers_bytes ((9 + 1) * 8) // pushq + callq
// caution: asm version is a sub-loop of _AVX512_MNNPackedSparseMatMulEpx4()
// void _AVX512_MNNPackedSparseMatMulEpx4(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter,
// const float* postParameters, const float* bias, unsigned int* NNZMap,
@ -22,8 +20,29 @@ asm_function _AVX512_MNNPackedSparseMatMulEpx4_ASM
// SystemV Auto: rdi: C, rsi: A, rdx:B, rcx: eSize, r8: parameter, r9: postparameter,
// stack: bias, unsigned int* NNZMap, int* dataOffsetMap
// Microsoft x64 Auto: rcx:C, rdx:A, r8:B, r9:eSize
// stack: parameter, postParameters, bias, unsigned int* NNZMap, int* dataOffsetMap
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
#define push_registers_bytes_ ((8 + 1) * 8 + 32) // pushq + callq + shadow_space
movq (push_registers_bytes_)(%rsp), %r8 // parameter
movq (push_registers_bytes_ + 8)(%rsp), %r9 // postparameter
#define push_registers_bytes (push_registers_bytes_ + 2 * 8) // pushq + callq + shadow_space + extra
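// Bumping push_registers_bytes by the two stack arguments already consumed (parameter, postParameters) lets the shared loads of bias / NNZMap / dataOffsetMap below reuse the SystemV offsets unchanged.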
#else
pushq %rax
pushq %rbx
pushq %r8
@ -32,7 +51,8 @@ pushq %r12
pushq %r13
pushq %r14
pushq %r15
#define push_registers_bytes ((9 + 1) * 8) // pushq + callq
#endif
movq (%r8), %r10 // eP * sizeof
shrq $(sizeof_value_lg2), %r10
@ -65,8 +85,8 @@ vbroadcastss 8(%r9), %zmm10
vbroadcastss 12(%r9), %zmm11
movq %r10, %r14
shrq $sparse_blockoc_log, %r14
shlq $sparse_blockoc_log, %r14 // h even divid sparse_blockoc
shrq $(sparse_blockoc_log), %r14
shlq $(sparse_blockoc_log), %r14 // round h down to a multiple of sparse_blockoc
movq (push_registers_bytes)(%rsp), %rdx // bias
movq (push_registers_bytes + 8)(%rsp), %rdi // unsigned int* NNZMap,
@ -79,6 +99,20 @@ movq (push_registers_bytes + 16)(%rsp), %rsi // int* dataOffsetMap
// movq %r8, %rdi
// movq %r9, %rsi
#ifdef WIN32
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#endif
movslq (%rsi), %r15
leaq (%rax, %r15, 4), %rax // a = a + diff;
addq $4, %rsi // dataOffsetMap++
@ -90,7 +124,7 @@ je loop_e48h4_end
loop_e48h4:
movq %r8, %r9
movq %r8, %r12
shrq $packC_unit_log, %r9
shrq $(packC_unit_log), %r9
andq $15, %r12 // ih % packC_unit
leaq (%rcx, %r12, sizeof_value), %r12
imulq %r11, %r9 // (ih >> packC_unit_log) * cStride
@ -246,7 +280,7 @@ loop_e48h4:
subq $4, %rsi // dataOffsetMap--
movslq (%rsi), %r15
addq $sparse_blockoc, %r8
addq $(sparse_blockoc), %r8
addq $4, %rdi
negq %r15
leaq (%rax, %r15, sizeof_value), %rax // a = a - diff;
@ -284,7 +318,7 @@ je loop_end
loop_e48h1:
movq %r8, %r9
movq %r8, %r12
shrq $packC_unit_log, %r9
shrq $(packC_unit_log), %r9
andq $15, %r12 // ih % packC_unit
leaq (%rcx, %r12, sizeof_value), %r12
imulq %r11, %r9 // (ih >> packC_unit_log) * cStride
@ -433,15 +467,37 @@ loop_e48h1_end:
loop_end:
popq %r15
popq %r14
popq %r13
popq %r12
popq %r9
popq %r8
popq %rbx
popq %rax
popq %rbp
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rsi
popq %rdi
#else
popq %r15
popq %r14
popq %r13
popq %r12
popq %r9
popq %r8
popq %rbx
popq %rax
#endif
popq %rbp
retq

View File

@ -21,7 +21,6 @@ pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
movq 48(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
@ -31,7 +30,17 @@ movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
@ -179,6 +188,17 @@ Loop:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r14
popq %r13
popq %r12

View File

@ -19,7 +19,8 @@ pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
movq 48(%rsp), %r10
#define push_registers_bytes ((1 + 1) * 8 + 32)
movq (push_registers_bytes)(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
@ -29,6 +30,17 @@ movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
@ -216,6 +228,17 @@ LoopDz:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r13
popq %r12
popq %rsi

View File

@ -19,7 +19,8 @@ pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
movq 48(%rsp), %r10
#define push_registers_bytes ((1 + 1) * 8 + 32)
movq (push_registers_bytes)(%rsp), %r10
pushq %rdi
pushq %rsi
pushq %r12
@ -29,6 +30,17 @@ movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
movq %r10, %r9
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
@ -191,6 +203,17 @@ LoopDz:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r13
popq %r12
popq %rsi

View File

@ -18,12 +18,41 @@ asm_function _AVX_MNNGemmFloatUnitMainFMA_Fused
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
#define push_registers_bytes ((3 + 1) * 8 + 32) // pushq + callq + shadow_space
movq (push_registers_bytes)(%rsp), %r8
movq (push_registers_bytes + 8)(%rsp), %r9
pushq %r12
pushq %r13
pushq %r14
pushq %r15
movq %r8, %r14
movq %r9, %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %r12
pushq %r13
pushq %r14
pushq %r15
movq %r8, %r14
movq %r9, %r15
#endif
movq 40(%rcx), %r10 // bExtraStride
movq 24(%rcx), %r8 // cStride
@ -232,10 +261,30 @@ LoopDz:
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rsi
popq %rdi
#else
popq %r15
popq %r14
popq %r13
popq %r12
#endif
popq %rbp
retq

View File

@ -25,15 +25,39 @@ asm_function _AVX_MNNPackedSparseMatMulEpx1NFMA_ASM
// SystemV Auto: rdi: packedParas, rsi: bias, rdx: parameter, rcx: postParameters
// Microsoft x64 Auto: rcx:packedParas, rdx:bias, r8:parameter, r9:postParameters
// all callee save regs:
// %rbx, %rbp, %r12~%r15
// unused para regs: %r8, %r9
// can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %rax
pushq %rbx
pushq %r8
@ -42,6 +66,7 @@ pushq %r12
pushq %r13
pushq %r14
pushq %r15
#endif
movq (%rdi), %rax // %rax C
@ -203,6 +228,26 @@ LoopE24H1:
jmp LoopE24H1
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rsi
popq %rdi
#else
popq %r15
popq %r14
popq %r13
@ -211,6 +256,8 @@ popq %r9
popq %r8
popq %rbx
popq %rax
#endif
popq %rbp
retq

View File

@ -25,15 +25,39 @@ asm_function _AVX_MNNPackedSparseMatMulEpx4NFMA_ASM
// SystemV Auto: rdi: packedParas, rsi: bias, rdx: parameter, rcx: postParameters
// Microsoft x64 Auto: rcx:packedParas, rdx:bias, r8:parameter, r9:postParameters
// all callee save regs:
// %rbx, %rbp, %r12~%r15
// unused para regs: %r8, %r9
// can use regs: %r8~%r15, %rdi, %rsi, %rdx, %rcx, %rbx, %rax
pushq %rbp
movq %rsp, %rbp
#ifdef WIN32
pushq %rdi
pushq %rsi
movq %rcx, %rdi
movq %rdx, %rsi
movq %r8, %rdx
movq %r9, %rcx
pushq %rbx
pushq %r12
pushq %r13
pushq %r14
pushq %r15
leaq (-1280)(%rsp), %rsp
vmovdqu %xmm6, (128*0)(%rsp)
vmovdqu %xmm7, (128*1)(%rsp)
vmovdqu %xmm8, (128*2)(%rsp)
vmovdqu %xmm9, (128*3)(%rsp)
vmovdqu %xmm10, (128*4)(%rsp)
vmovdqu %xmm11, (128*5)(%rsp)
vmovdqu %xmm12, (128*6)(%rsp)
vmovdqu %xmm13, (128*7)(%rsp)
vmovdqu %xmm14, (128*8)(%rsp)
vmovdqu %xmm15, (128*9)(%rsp)
#else
pushq %rax
pushq %rbx
pushq %r8
@ -42,6 +66,7 @@ pushq %r12
pushq %r13
pushq %r14
pushq %r15
#endif
movq (%rdi), %rax // %rax C
movq 8(%rdi), %rbx // %rbx A
@ -195,6 +220,26 @@ LoopE24H4:
jmp LoopE24H4
End:
#ifdef WIN32
vmovdqu (128*0)(%rsp), %xmm6
vmovdqu (128*1)(%rsp), %xmm7
vmovdqu (128*2)(%rsp), %xmm8
vmovdqu (128*3)(%rsp), %xmm9
vmovdqu (128*4)(%rsp), %xmm10
vmovdqu (128*5)(%rsp), %xmm11
vmovdqu (128*6)(%rsp), %xmm12
vmovdqu (128*7)(%rsp), %xmm13
vmovdqu (128*8)(%rsp), %xmm14
vmovdqu (128*9)(%rsp), %xmm15
leaq (1280)(%rsp), %rsp
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbx
popq %rsi
popq %rdi
#else
popq %r15
popq %r14
popq %r13
@ -203,6 +248,8 @@ popq %r9
popq %r8
popq %rbx
popq %rax
#endif
popq %rbp
retq

View File

@ -56,15 +56,15 @@ message(STATUS "message ${CUDA_NVCC_FLAGS} !!!!!!!!!!!")
if(WIN32)
cuda_add_library(MNN_CUDA STATIC Register.cpp ${MNN_CUDA_SRC})
string(REPLACE "cublas.lib" "cudnn.lib" CUDNN_LIBRARIES ${CUDA_CUBLAS_LIBRARIES})
set(MNN_CUDA_LIBS MNN_CUDA ${CUDNN_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_LIBRARIES} PARENT_SCOPE)
set(MNN_CUDA_LIBS MNN_CUDA ${CUDA_LIBRARIES} PARENT_SCOPE)
else()
cuda_add_library(MNN_Cuda_Main SHARED ${MNN_CUDA_SRC})
set(MNN_CUDA_LIBS MNN_Cuda_Main cudnn cublas PARENT_SCOPE)
set(MNN_CUDA_LIBS MNN_Cuda_Main PARENT_SCOPE)
add_library(MNN_CUDA OBJECT Register.cpp)
endif()
include_directories(
${CMAKE_CURRENT_LIST_DIR}/
${CUDA_INCLUDE_DIRS}
${CMAKE_SOURCE_DIR}/include/
)

View File

@ -14,6 +14,11 @@
#include "core/Macro.h"
#include "shape/SizeComputer.hpp"
#include "core/TensorUtils.hpp"
#include "execution/Raster.cuh"
#include "execution/Transpose.cuh"
#include "execution/MNNCUDADefine.hpp"
// #define MNN_CUDA_COPY_DEBUG
namespace MNN {
namespace CUDA {
@ -30,22 +35,18 @@ public:
// Do nothing
}
virtual ~ CUDARuntimeAllocator() = default;
virtual std::pair<void*, int> onAlloc(int size, int align) override {
virtual std::pair<void*, size_t> onAlloc(size_t size, size_t align) override {
return std::make_pair(mRuntime->alloc(size), 0);
}
virtual void onRelease(std::pair<void*, int> ptr) override {
virtual void onRelease(std::pair<void*, size_t> ptr) override {
mRuntime->free(ptr.first);
}
private:
CUDARuntime* mRuntime;
};
CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, BackendConfig::PowerMode power) {
// Shader precision
if (precision == BackendConfig::Precision_Low) {
mCUDARuntime.reset(new CUDARuntime(true, -1));
} else {
mCUDARuntime.reset(new CUDARuntime(false, -1));
}
// TODO: Search CUDA Device info and use best one
mCUDARuntime.reset(new CUDARuntime(-1));
if (mCUDARuntime.get()) {
if (mCUDARuntime->isCreateError() == true) {
mIsCreateError = true;
@ -54,6 +55,7 @@ CUDARuntimeWrapper::CUDARuntimeWrapper(BackendConfig::PrecisionMode precision, B
std::shared_ptr<BufferAllocator::Allocator> allocator(new CUDARuntimeAllocator(mCUDARuntime.get()));
mBufferPool.reset(new BufferAllocator(allocator));
}
mDefaultPrecision = precision;
}
CUDARuntimeWrapper::~CUDARuntimeWrapper() {
// Do nothing
@ -64,7 +66,12 @@ float CUDARuntimeWrapper::onGetMemoryInMB() {
}
Backend* CUDARuntimeWrapper::onCreate(const BackendConfig* config) const {
return new CUDABackend(mBufferPool, mCUDARuntime);
auto mode = mDefaultPrecision;
if (nullptr != config) {
mode = config->precision;
}
bool useFp16 = mode == BackendConfig::Precision_Low;
return new CUDABackend(mBufferPool, mCUDARuntime, useFp16);
}
void CUDARuntimeWrapper::onGabageCollect(int level) {
@ -72,11 +79,12 @@ void CUDARuntimeWrapper::onGabageCollect(int level) {
}
CUDABackend::CUDABackend(std::shared_ptr<BufferAllocator> st,
std::shared_ptr<CUDARuntime> rt)
std::shared_ptr<CUDARuntime> rt, bool useFp16AsFp32)
: Backend(MNN_FORWARD_CUDA) {
mBufferPool.reset(new BufferAllocator(BufferAllocator::Allocator::createRecurse(st.get())));
mStaticBufferPool = st;
mCUDARuntime = rt;
mUseFp16AsFp32 = useFp16AsFp32;
}
CUDABackend::~CUDABackend() {
@ -89,6 +97,9 @@ CUDARuntime* CUDABackend::getCUDARuntime() {
MNN_ASSERT(nullptr != mCUDARuntime.get());
return mCUDARuntime.get();
}
bool CUDABackend::useFp16() const {
return mUseFp16AsFp32;
}
class CUDAMemObj : public Backend::MemObj {
public:
@ -103,12 +114,27 @@ private:
BufferAllocator* mAllocator;
std::pair<void*, int> mPoint;
};
int CUDABackend::getBytes(const Tensor* tensor) const {
auto bytes = tensor->getType().bytes();
if (mUseFp16AsFp32) {
if (halide_type_float == tensor->getType().code) {
bytes = 2;
}
}
return bytes;
}
CPUResizeCache* CUDABackend::getCache() {
return &mCache;
}
Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType storageType) {
#ifdef LOG_VERBOSE
MNN_PRINT("Start CUDABackend::onAcquireBuffer !\n");
#endif
BufferAllocator* allocator = nullptr;
int mallocSize = realSize(nativeTensor) * nativeTensor->getType().bytes();
auto bytes = getBytes(nativeTensor);
size_t mallocSize = realSize(nativeTensor) * bytes;
std::pair<void*, int> buffer;
if (storageType == DYNAMIC_SEPERATE) {
buffer = mBufferPool->alloc(mallocSize, true);
@ -132,13 +158,23 @@ Backend::MemObj* CUDABackend::onAcquire(const Tensor* nativeTensor, StorageType
}
bool CUDABackend::onClearBuffer() {
mCache.reset();
mBufferPool->release(true);
return true;
}
size_t CUDABackend::realSize(const Tensor* tensor) {
auto dim = TensorUtils::getDescribe(tensor)->dimensionFormat;
int pack = 1;
if (dim == MNN_DATA_FORMAT_NC4HW4) {
pack = PACK_NUMBER;
}
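// For NC4HW4 tensors the channel axis (dimension 1) is padded up to a multiple of PACK_NUMBER so the packed device buffer is fully allocated.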
size_t res = 1;
for (int i = 0; i < tensor->dimensions(); ++i) {
res *= tensor->length(i);
size_t l = tensor->length(i);
if (1 == i ) {
l = UP_DIV(l, pack) * pack;
}
res *= l;
}
return res;
}
@ -186,47 +222,332 @@ void CUDABackend::onExecuteBegin() const {
void CUDABackend::onExecuteEnd() const {
}
static void _computeStride(MNN_DATA_FORMAT srcDimensionFormat, int* srcStride, int batch, int plane, int channel, int srcPack) {
if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
srcStride[0] = plane * srcPack;
srcStride[1] = plane * batch * PACK_NUMBER;
srcStride[2] = srcPack;
} else if (srcDimensionFormat == MNN_DATA_FORMAT_NCHW) {
srcStride[0] = channel * plane;
srcStride[1] = plane * PACK_NUMBER;
srcStride[2] = 1;
} else {
srcStride[0] = channel * plane;
srcStride[1] = PACK_NUMBER;
srcStride[2] = channel;
}
}
static void _computeBCA(int& batch, int& plane, int& channel, MNN_DATA_FORMAT srcDimensionFormat, const Tensor* srcTensor) {
if (srcDimensionFormat != MNN_DATA_FORMAT_NHWC) {
batch = srcTensor->length(0);
channel = srcTensor->length(1);
plane = 1;
for (int i=2; i<srcTensor->dimensions(); ++i) {
plane *= srcTensor->length(i);
}
} else {
batch = srcTensor->length(0);
channel = srcTensor->length(srcTensor->dimensions()-1);
plane = 1;
for (int i=1; i<srcTensor->dimensions()-1; ++i) {
plane *= srcTensor->length(i);
}
}
}
static PackInfo _computePackInfo(MNN_DATA_FORMAT srcDimensionFormat, int batch, int plane, int channel) {
PackInfo pack;
pack.inside = plane;
pack.axis = channel;
pack.unit = PACK_NUMBER;
pack.outside = batch;
if (srcDimensionFormat == MNN_DATA_FORMAT_NHWC) {
pack.axisStride = 1;
pack.insideStride = channel;
} else {
pack.axisStride = plane;
pack.insideStride = 1;
}
return pack;
}
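// Hedged reading of PackInfo: outside/axis/inside map to batch/channel/plane, unit is the device pack size, and axisStride/insideStride give the source strides for NHWC- vs NCHW-style layouts.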
void CUDABackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
auto srcDimensionFormat = TensorUtils::getDescribe(srcTensor)->dimensionFormat;
auto srcDevice = srcTensor->deviceId() != 0;
auto dstDimensionFormat = TensorUtils::getDescribe(dstTensor)->dimensionFormat;
auto srcDevice = srcTensor->deviceId() != 0;
auto dstDevice = dstTensor->deviceId() != 0;
if (srcDevice && srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
srcDimensionFormat = MNN_DATA_FORMAT_NCHW;
MNN_ASSERT(srcDevice || dstDevice);
uint8_t* srcPtr = nullptr;
std::pair<void*, int> tempSrcStorage;
auto bytes = getBytes(srcTensor);
auto type = srcTensor->getType();
#ifdef MNN_CUDA_COPY_DEBUG
MNN_PRINT("CUDA Bn copy: %d -> %d, format %d -> %d, dims: [", srcDevice, dstDevice, srcDimensionFormat, dstDimensionFormat);
for (int i=0; i<srcTensor->dimensions(); ++i) {
MNN_PRINT("%d ", srcTensor->length(i));
}
if (dstDevice && dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
dstDimensionFormat = MNN_DATA_FORMAT_NCHW;
MNN_PRINT("]\n");
#endif
bool directCopy = (srcDimensionFormat == dstDimensionFormat && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) || srcTensor->dimensions() <= 1;
if (mUseFp16AsFp32) {
if ((!srcDevice) || (!dstDevice)) {
if (type.code == halide_type_float) {
directCopy = false;
}
}
}
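// Direct memcpy is only taken when no layout conversion is needed; with fp16 storage a host<->device float copy also needs a conversion, so it falls through to the packing path below.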
auto needSize = realSize(srcTensor) * srcTensor->getType().bytes();
std::shared_ptr<Tensor> srcTempTensor;
std::shared_ptr<Tensor> dstTempTensor;
if (srcTensor->deviceId() != 0 && dstTensor->deviceId() != 0) {
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), needSize,
MNNMemcpyDeviceToDevice, true);
if (directCopy) {
auto gpuSize = realSize(srcTensor) * getBytes(srcTensor);
if (srcDevice && dstDevice) {
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), gpuSize,
MNNMemcpyDeviceToDevice, true);
} else if (srcDevice && (!dstDevice)) {
mCUDARuntime->memcpy((void*)(dstTensor->host<void>()), (void*)(srcTensor->deviceId()), gpuSize,
MNNMemcpyDeviceToHost, true);
} else if ((!srcDevice) && (dstDevice)) {
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->host<void>()), gpuSize,
MNNMemcpyHostToDevice, true);
}
return;
}
if (!srcDevice) {
auto cpuSize = srcTensor->size();
tempSrcStorage = mStaticBufferPool->alloc(cpuSize);
srcPtr = (uint8_t*)tempSrcStorage.first + tempSrcStorage.second;
mCUDARuntime->memcpy(srcPtr, srcTensor->host<void>(), cpuSize, MNNMemcpyHostToDevice,
true);
} else {
srcPtr = (uint8_t*)srcTensor->deviceId();
}
uint8_t* dstPtr = nullptr;
std::pair<void*, int> tempDstStorage;
if (!dstDevice) {
auto cpuSize = dstTensor->size();
tempDstStorage = mStaticBufferPool->alloc(cpuSize);
dstPtr = (uint8_t*)tempDstStorage.first + tempDstStorage.second;
} else {
dstPtr = (uint8_t*)dstTensor->deviceId();
}
if (srcTensor->deviceId() != 0 && dstTensor->deviceId() == 0) {
if(srcDimensionFormat != dstDimensionFormat) {
dstTempTensor.reset(new Tensor(srcTensor, srcTensor->getDimensionType(), true));
mCUDARuntime->memcpy(dstTempTensor->host<void>(), (void*)(srcTensor->deviceId()), needSize, MNNMemcpyDeviceToHost,
true);
MNNCPUCopyBuffer(dstTempTensor.get(), dstTensor);
// Format convert
FuseRegion reg;
int* size = reg.size;
int* srcStride = reg.srcStride;
int* dstStride = reg.dstStride;
int offset[PACK_NUMBER * 8];
int offsetNumber = 0;
auto offsetGpuStorage = mStaticBufferPool->alloc(PACK_NUMBER * 8 * sizeof(int));
auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second;
auto regionStorage = mStaticBufferPool->alloc(sizeof(FuseRegion));
auto regionGpu = (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second);
do {
if (srcTensor->deviceId() != 0 && dstTensor->deviceId() != 0) {
if (srcTensor->dimensions() <= 1 || srcDimensionFormat == dstDimensionFormat) {
auto gpuSize = realSize(srcTensor) * getBytes(srcTensor);
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), gpuSize,
MNNMemcpyDeviceToDevice, true);
} else {
int batch, plane, channel;
_computeBCA(batch, plane, channel, srcDimensionFormat, srcTensor);
PackInfo pack;
auto func = PackBuffer;
if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
pack = _computePackInfo(srcDimensionFormat, batch, plane, channel);
func = PackBuffer;
} else if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
pack = _computePackInfo(dstDimensionFormat, batch, plane, channel);
func = UnpackBuffer;
} else {
FUNC_PRINT(1);
}
func((void*)(dstTensor->deviceId()), (void*)(srcTensor->deviceId()), &pack, getBytes(srcTensor), mCUDARuntime.get());
}
break;
}
auto convertFunction = FuseRasterBlitFloatToFloat;
if (mUseFp16AsFp32) {
if (!srcDevice) {
convertFunction = FuseRasterBlitFloatToHalf;
} else {
convertFunction = FuseRasterBlitHalfToFloat;
}
}
if (srcTensor->dimensions() <= 1) {
size[2] = srcTensor->elementSize();
srcStride[2] = 1;
dstStride[2] = 1;
offset[0] = 1;
offset[1] = 1;
offset[2] = size[2];
offset[3] = 0;
offset[4] = 1;
offset[5] = 1;
offset[6] = size[2];
offset[7] = 0;
offsetNumber = 1;
} else {
mCUDARuntime->memcpy(dstTensor->host<void>(), (void*)(srcTensor->deviceId()), needSize, MNNMemcpyDeviceToHost,
true);
// Compute batch, plane, channel
int batch, plane, channel;
_computeBCA(batch, plane, channel, srcDimensionFormat, srcTensor);
if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4 && srcDimensionFormat != MNN_DATA_FORMAT_NC4HW4 && dstDevice) {
PackInfo pack = _computePackInfo(srcDimensionFormat, batch, plane, channel);
if (mUseFp16AsFp32) {
if (type.code == halide_type_float) {
if (dstDevice) {
PackFP32ToFP16(dstPtr, srcPtr, &pack, mCUDARuntime.get());
break;
} else {
PackFP16ToFP32(dstPtr, srcPtr, &pack, mCUDARuntime.get());
break;
}
}
} else {
PackBuffer(dstPtr, srcPtr, &pack, bytes, mCUDARuntime.get());
}
break;
}
if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4 && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4 && srcDevice) {
PackInfo pack = _computePackInfo(dstDimensionFormat, batch, plane, channel);
if (mUseFp16AsFp32) {
if (type.code == halide_type_float) {
if (dstDevice) {
UnpackFP32ToFP16(dstPtr, srcPtr, &pack, mCUDARuntime.get());
break;
} else {
UnpackFP16ToFP32(dstPtr, srcPtr, &pack, mCUDARuntime.get());
break;
}
}
} else {
UnpackBuffer(dstPtr, srcPtr, &pack, bytes, mCUDARuntime.get());
}
break;
}
//MNN_PRINT("host/device: %d -> %d, format %d -> %d, b, p, c: %d - %d - %d\n", srcDevice, dstDevice, srcDimensionFormat, dstDimensionFormat, batch, plane, channel);
// Set region
if (srcDimensionFormat != MNN_DATA_FORMAT_NC4HW4 && dstDimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
size[0] = batch;
size[1] = channel;
size[2] = plane;
offsetNumber = 1;
offset[0] = batch;
offset[1] = channel;
offset[2] = plane;
offset[3] = 0;
offset[4] = batch;
offset[5] = channel;
offset[6] = plane;
offset[7] = 0;
if (srcDimensionFormat == MNN_DATA_FORMAT_NHWC) {
srcStride[0] = channel * plane;
srcStride[1] = 1;
srcStride[2] = channel;
} else {
srcStride[0] = channel * plane;
srcStride[1] = plane;
srcStride[2] = 1;
}
if (dstDimensionFormat == MNN_DATA_FORMAT_NHWC) {
dstStride[0] = channel * plane;
dstStride[1] = 1;
dstStride[2] = channel;
} else {
dstStride[0] = channel * plane;
dstStride[1] = plane;
dstStride[2] = 1;
}
} else {
offsetNumber = PACK_NUMBER;
size[0] = batch;
size[1] = UP_DIV(channel, PACK_NUMBER);
size[2] = plane;
int srcPack = 1;
int dstPack = 1;
int srcChannelLimit = channel;
if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
if (srcDevice) {
srcPack = PACK_NUMBER;
srcChannelLimit = UP_DIV(channel, PACK_NUMBER) * PACK_NUMBER;
} else {
srcPack = 4;
srcChannelLimit = UP_DIV(channel, 4) * 4;
}
}
int dstChannelLimit = channel;
if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
if (dstDevice) {
dstPack = PACK_NUMBER;
dstChannelLimit = UP_DIV(channel, PACK_NUMBER) * PACK_NUMBER;
} else {
dstPack = 4;
dstChannelLimit = UP_DIV(channel, 4) * 4;
}
}
// Compute Stride
_computeStride(srcDimensionFormat, srcStride, batch, plane, channel, srcPack);
_computeStride(dstDimensionFormat, dstStride, batch, plane, channel, dstPack);
// Compute Offset
for (int i=0; i<offsetNumber; ++i) {
auto offsetPtr = offset + i * 8;
int channelStart = i;
offsetPtr[0] = batch;
offsetPtr[1] = (srcChannelLimit + PACK_NUMBER - channelStart - 1) / PACK_NUMBER;
offsetPtr[2] = plane;
if (srcDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
int sp = i / srcPack;
int sm = i % srcPack;
offsetPtr[3] = sm + sp * srcPack * plane * batch;
} else {
offsetPtr[3] = channelStart * srcStride[1] / PACK_NUMBER;
}
offsetPtr[4] = batch;
offsetPtr[5] = (dstChannelLimit + PACK_NUMBER - channelStart - 1) / PACK_NUMBER;
offsetPtr[6] = plane;
if (dstDimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
int sp = i / dstPack;
int sm = i % dstPack;
offsetPtr[7] = sm + sp * dstPack * plane * batch;
} else {
offsetPtr[7] = channelStart * dstStride[1] / PACK_NUMBER;
}
}
}
}
reg.fuseNumber = offsetNumber;
mCUDARuntime->memcpy(regionGpu, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
mCUDARuntime->memcpy(offsetGpu, offset, offsetNumber * 8 * sizeof(int), MNNMemcpyHostToDevice, true);
#ifdef MNN_CUDA_COPY_DEBUG
MNN_PRINT("Reg.size: %d - %d - %d\n", reg.size[0], reg.size[1], reg.size[2]);
MNN_PRINT("Reg.srcStride: %d - %d - %d\n", reg.srcStride[0], reg.srcStride[1], reg.srcStride[2]);
MNN_PRINT("Reg.dstStride: %d - %d - %d\n", reg.dstStride[0], reg.dstStride[1], reg.dstStride[2]);
MNN_PRINT("FuseNum: %d\n", reg.fuseNumber);
for (int i=0; i<reg.fuseNumber; ++i) {
auto off = offset + 8 * i;
MNN_PRINT("Src: %d, %d, %d, %d; dst:%d, %d, %d, %d\n", off[0], off[1], off[2], off[3], off[4], off[5], off[6], off[7]);
}
#endif
if (mUseFp16AsFp32) {
if (type.code == halide_type_float) {
convertFunction(dstPtr, srcPtr, regionGpu, offsetGpu, mCUDARuntime.get());
break;
}
}
FuseRasterBlitCommon(dstPtr, srcPtr, regionGpu, offsetGpu, mCUDARuntime.get(), type.bytes());
} while(false);
mStaticBufferPool->free(offsetGpuStorage);
mStaticBufferPool->free(regionStorage);
if (!srcDevice) {
mStaticBufferPool->free(tempSrcStorage);
}
if (srcTensor->deviceId() == 0 && dstTensor->deviceId() != 0) {
if (srcDimensionFormat != dstDimensionFormat) {
srcTempTensor.reset(new Tensor(dstTensor, dstTensor->getDimensionType(), true));
MNNCPUCopyBuffer(srcTensor, srcTempTensor.get());
srcTensor = srcTempTensor.get();
}
mCUDARuntime->memcpy((void*)(dstTensor->deviceId()), srcTensor->host<void>(), needSize, MNNMemcpyHostToDevice,
if (!dstDevice) {
auto cpuSize = dstTensor->size();
mCUDARuntime->memcpy(dstTensor->host<void>(), dstPtr, cpuSize, MNNMemcpyDeviceToHost,
true);
mStaticBufferPool->free(tempDstStorage);
}
return;
}

View File

@ -17,6 +17,7 @@
#include "core/Macro.h"
#include "core/ConvolutionCommon.hpp"
#include "core/BufferAllocator.hpp"
#include "backend/cpu/CPUResizeCache.hpp"
namespace MNN {
namespace CUDA {
class MNN_PUBLIC CUDARuntimeWrapper : public Runtime {
@ -37,11 +38,12 @@ private:
std::shared_ptr<BufferAllocator> mBufferPool;
std::shared_ptr<CUDARuntime> mCUDARuntime;
bool mIsCreateError{false};
BackendConfig::PrecisionMode mDefaultPrecision;
};
class CUDABackend : public Backend {
public:
CUDABackend(std::shared_ptr<BufferAllocator> st, std::shared_ptr<CUDARuntime> rt);
CUDABackend(std::shared_ptr<BufferAllocator> st, std::shared_ptr<CUDARuntime> rt, bool useFp16AsFp32);
~CUDABackend();
CUDARuntime *getCUDARuntime();
@ -74,11 +76,15 @@ public:
return mStaticBufferPool.get();
}
static size_t realSize(const Tensor *tensor);
int getBytes(const Tensor* tensor) const;
CPUResizeCache* getCache();
bool useFp16() const;
private:
std::shared_ptr<BufferAllocator> mBufferPool;
std::shared_ptr<BufferAllocator> mStaticBufferPool;
std::shared_ptr<CUDARuntime> mCUDARuntime;
CPUResizeCache mCache;
bool mUseFp16AsFp32 = false;
};
template <class T>

View File

@ -15,17 +15,11 @@
#include <utility>
#include <vector>
#include "core/Macro.h"
// #define MNN_CUDA_USE_BLAS
//#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp>
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)
// #define LOG_VERBOSE
#define CUDNN_VERSION_STR STR(CUDNN_MAJOR) "." STR(CUDNN_MINOR) "." STR(CUDNN_PATCHLEVEL)
#pragma message "compile with cuda " STR(CUDART_VERSION) " "
#pragma message "compile with cuDNN " CUDNN_VERSION_STR " "
static_assert(!(CUDNN_MAJOR == 5 && CUDNN_MINOR == 1), "cuDNN 5.1.x series has bugs. Use 5.0.x instead.");
#undef STR
#undef STR_HELPER
@ -36,7 +30,7 @@ bool CUDARuntime::isCreateError() const {
return mIsCreateError;
}
CUDARuntime::CUDARuntime(bool permitFloat16, int device_id) {
CUDARuntime::CUDARuntime(int device_id) {
#ifdef LOG_VERBOSE
MNN_PRINT("start CUDARuntime !\n");
#endif
@ -49,42 +43,39 @@ CUDARuntime::CUDARuntime(bool permitFloat16, int device_id) {
mDeviceId = id;
cuda_check(cudaGetDeviceProperties(&mProp, id));
MNN_ASSERT(mProp.maxThreadsPerBlock > 0);
#ifdef MNN_CUDA_USE_BLAS
cublas_check(cublasCreate(&mCublasHandle));
// Set stream for cuDNN and cublas handles.
// Note that all cublas scalars (alpha, beta) and scalar results such as dot
// output resides at device side.
cublas_check(cublasSetPointerMode(mCublasHandle, CUBLAS_POINTER_MODE_HOST));
cudnn_check(cudnnCreate(&mCudnnHandle));
#endif
}
CUDARuntime::~CUDARuntime() {
#ifdef LOG_VERBOSE
MNN_PRINT("start ~CUDARuntime !\n");
#endif
#ifdef MNN_CUDA_USE_BLAS
cublas_check(cublasDestroy(mCublasHandle));
cudnn_check(cudnnDestroy(mCudnnHandle));
#endif
#ifdef LOG_VERBOSE
MNN_PRINT("end ~CUDARuntime !\n");
#endif
}
int CUDARuntime::blocks_num(const int total_threads) {
int maxNum = mProp.maxThreadsPerBlock;
if(total_threads / 32 > maxNum) {
mThreadPerBlock = maxNum;
} else if(total_threads / 16 > maxNum) {
mThreadPerBlock = maxNum / 2;
} else if(total_threads / 8 > maxNum) {
mThreadPerBlock = maxNum / 4;
} else if(total_threads / 4 > maxNum) {
mThreadPerBlock = maxNum / 8;
} else {
mThreadPerBlock = 128;
}
size_t CUDARuntime::blocks_num(const size_t total_threads) {
// size_t maxNum = mProp.maxThreadsPerBlock;
// if(total_threads / 32 > maxNum) {
// mThreadPerBlock = maxNum;
// } else if(total_threads / 16 > maxNum) {
// mThreadPerBlock = maxNum / 2;
// } else if(total_threads / 8 > maxNum) {
// mThreadPerBlock = maxNum / 4;
// } else if(total_threads / 4 > maxNum) {
// mThreadPerBlock = maxNum / 8;
// } else {
// mThreadPerBlock = 128;
// }
mThreadPerBlock = 128;
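// Ceiling division: enough blocks of mThreadPerBlock threads to cover total_threads.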
return (total_threads + mThreadPerBlock - 1) / mThreadPerBlock;
}
@ -148,13 +139,4 @@ void CUDARuntime::memcpy(void *dst, const void *src, size_t size_in_bytes, MNNMe
void CUDARuntime::memset(void *dst, int value, size_t size_in_bytes) {
cuda_check(cudaMemset(dst, value, size_in_bytes));
}
cublasHandle_t CUDARuntime::cublas_handle() {
return mCublasHandle;
}
cudnnHandle_t CUDARuntime::cudnn_handle() {
return mCudnnHandle;
}
} // namespace MNN

View File

@ -16,19 +16,14 @@
#include <string>
#include <vector>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cudnn.h>
#include <cusolverDn.h>
#include <sstream>
#include <string>
#include <vector>
#include "Type_generated.h"
#include "core/Macro.h"
#if CUDA_VERSION >= 10010
#include <cublasLt.h>
#endif
typedef enum {
CUDA_FLOAT32 = 0,
@ -49,40 +44,30 @@ typedef enum {
} \
} while (0)
#define cublas_check(_x) \
do { \
cublasStatus_t _err = (_x); \
if (_err != CUBLAS_STATUS_SUCCESS) { \
MNN_CHECK(_err, #_x); \
} \
} while (0)
#define cudnn_check(_x) \
do { \
cudnnStatus_t _err = (_x); \
if (_err != CUDNN_STATUS_SUCCESS) { \
MNN_CHECK(_err, #_x); \
} \
} while (0)
#define cusolver_check(_x) \
do { \
cusolverStatus_t _err = (_x); \
if (_err != CUSOLVER_STATUS_SUCCESS) { \
MNN_CHECK(_err, #_x); \
} \
} while (0)
#define after_kernel_launch() \
do { \
cuda_check(cudaGetLastError()); \
} while (0)
#ifdef DEBUG
#define checkKernelErrors\
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
printf("File:%s Line %d: failed: %s\n", __FILE__, __LINE__,\
cudaGetErrorString(__err)); \
abort(); \
} \
} while (0)
#else
#define checkKernelErrors
#endif
namespace MNN {
class CUDARuntime {
public:
CUDARuntime(bool permitFloat16, int device_id);
CUDARuntime(int device_id);
~CUDARuntime();
CUDARuntime(const CUDARuntime &) = delete;
CUDARuntime &operator=(const CUDARuntime &) = delete;
@ -105,16 +90,14 @@ public:
void memcpy(void *dst, const void *src, size_t size_in_bytes, MNNMemcpyKind_t kind, bool sync = false);
void memset(void *dst, int value, size_t size_in_bytes);
cublasHandle_t cublas_handle();
cudnnHandle_t cudnn_handle();
int threads_num() {
size_t threads_num() {
return mThreadPerBlock;
}
int major_sm() const {
return mProp.major;
}
int blocks_num(const int total_threads);
size_t blocks_num(const size_t total_threads);
const cudaDeviceProp& prop() const {
return mProp;
}
@ -123,15 +106,12 @@ private:
cudaDeviceProp mProp;
int mDeviceId;
cublasHandle_t mCublasHandle;
cudnnHandle_t mCudnnHandle;
bool mIsSupportedFP16 = false;
bool mSupportDotInt8 = false;
bool mSupportDotAccInt8 = false;
float mFlops = 4.0f;
bool mIsCreateError{false};
int mThreadPerBlock = 128;
size_t mThreadPerBlock = 128;
};
} // namespace MNN

View File

@ -1,119 +0,0 @@
#include "BatchMatMulExecution.hpp"
namespace MNN {
namespace CUDA {
template <typename T>
__global__ void add_bias(T *input, T *output, const T* bias, int batch, int e, int h) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch * e * h; index += blockDim.x * gridDim.x) {
int i = index % (e*h);
int b = index / (e*h);
int y = i % h;
output[index] = input[index] + bias[b * h + y];
}
return;
}
BatchMatMulExecution::BatchMatMulExecution(bool transposeA, bool transposeB, Backend *backend) : Execution(backend) {
mTransposeA = transposeA;
mTransposeB = transposeB;
}
BatchMatMulExecution::~ BatchMatMulExecution() {
// do nothing
}
ErrorCode BatchMatMulExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto C = outputs[0];
auto dimensions = C->dimensions();
int batch = 1;
for (int i = 0; i < dimensions - 2; ++i) {
batch *= C->length(i);
}
auto e = C->length(dimensions-2);
auto h = C->length(dimensions-1);
if(inputs.size() > 2) {
mTempOutput.reset(Tensor::createDevice<float>({batch*h*e}));
auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
ErrorCode BatchMatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto blasHandle = runtime->cublas_handle();
const Tensor* A = inputs[0];
const Tensor* B = inputs[1];
auto dimensions = A->dimensions();
int batch = 1;
for (int i = 0; i < dimensions - 2; ++i) {
batch *= A->length(i);
}
auto w0 = inputs[0]->length(dimensions-1);
auto h0 = inputs[0]->length(dimensions-2);
auto C = outputs[0];
auto e = C->length(dimensions-2);
auto h = C->length(dimensions-1);
auto l = w0;
if (mTransposeA) {
l = h0;
}
auto APtr = (const float*)A->deviceId();
auto BPtr = (const float*)B->deviceId();
auto CDestPtr = (float*)C->deviceId();
float alpha = 1.0f;
float beta = 0.0f;
auto tranB = CUBLAS_OP_N;
auto ldB = h;
if (mTransposeB) {
ldB = l;
tranB = CUBLAS_OP_T;
}
auto tranA = CUBLAS_OP_N;
auto ldA = l;
if (mTransposeA) {
ldA = e;
tranA = CUBLAS_OP_T;
}
// [b, e, l] x [b, l, h] -> [b, e, h]
if(inputs.size() == 2) {
auto status = cublasSgemmStridedBatched(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, l*h, APtr, ldA, e*l, &beta, CDestPtr, h, e*h, batch);
cublas_check(status);
//cudaThreadSynchronize();
} else {
auto CPtr = (float*)mTempOutput->deviceId();
auto status = cublasSgemmStridedBatched(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, l*h, APtr, ldA, e*l, &beta, CPtr, h, e*h, batch);
cublas_check(status);
//cudaThreadSynchronize();
//add bias: [b, e, h] + [b, h] -> [b, e, h]
int block_num = runtime->blocks_num(batch*e*h);
int threads_num = runtime->threads_num();
add_bias<<<block_num, threads_num>>>((float*)CPtr, (float*)CDestPtr, (const float*)inputs[2]->deviceId(), batch, e, h);
}
return NO_ERROR;
}
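// cuBLAS is column-major, so the calls above compute the row-major product
// C[e, h] = A[e, l] * B[l, h] by evaluating C^T = B^T * A^T: the operands are passed
// in (B, A) order with m = h, n = e, k = l, and each matrix's batch stride equals its
// element count (l*h, e*l, e*h). A sketch of the same mapping for the non-transposed
// case (assumes a valid cublasHandle_t and contiguous row-major device pointers):
//   float alpha = 1.0f, beta = 0.0f;
//   cublasSgemmStridedBatched(handle,
//       CUBLAS_OP_N, CUBLAS_OP_N,    // no extra transpose when A, B are row-major contiguous
//       h, e, l,                     // m, n, k of the column-major view
//       &alpha,
//       B, h, (long long)l * h,      // leading dimension h, stride per batch l*h
//       A, l, (long long)e * l,      // leading dimension l, stride per batch e*l
//       &beta,
//       C, h, (long long)e * h,      // leading dimension h, stride per batch e*h
//       batch);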
class BatchMatMulCreator : public CUDABackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
auto param = op->main_as_BatchMatMulParam();
return new BatchMatMulExecution(param->adjX(), param->adjY(), backend);
}
};
static CUDACreatorRegister<BatchMatMulCreator> __init(OpType_BatchMatMul);
}
}

View File

@ -1,23 +0,0 @@
#ifndef BatchMatMulExecution_hpp
#define BatchMatMulExecution_hpp
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
namespace MNN {
namespace CUDA {
class BatchMatMulExecution : public Execution {
public:
BatchMatMulExecution(bool transposeA, bool transposeB, Backend *backend);
virtual ~BatchMatMulExecution();
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
std::shared_ptr<Tensor> mTempOutput;
bool mTransposeA;
bool mTransposeB;
};
} // namespace CUDA
} // namespace MNN
#endif

View File

@ -50,11 +50,16 @@ ErrorCode BinaryExecution::onExecute(const std::vector<Tensor *> &inputs, const
int stride0[3] = {0, 0, s0};
int stride1[3] = {0, 0, s1};
int stride2[3] = {0, 0, 1};
auto type = outputs[0]->getType();
if (type.code == halide_type_float) {
// Use Half or float
type.bits = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]) * 8;
}
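// halide_type_t carries a (code, bits) pair; keeping code as float and overwriting
// bits with the backend's per-tensor byte size lets BinaryBlit select the half or
// float kernel from a single type value, e.g. (illustrative values only):
//   halide_type_t t = outputs[0]->getType();   // {halide_type_float, 32}
//   t.bits = backendBytes * 8;                 // 2 bytes -> 16 bits -> half path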
auto computeFunction = [&](Tensor* input0T, Tensor* input1T, Tensor* outputT) {
auto input0 = (uint8_t*)input0T->deviceId();
auto input1 = (uint8_t*)input1T->deviceId();
auto output = (uint8_t*)outputT->deviceId();
BinaryBlit(output, input0, input1, size, stride0, stride1, stride2, outputT->getType(), runtime, mType);
BinaryBlit(output, input0, input1, size, stride0, stride1, stride2, type, runtime, mType);
};
computeFunction(inputs[0], inputs[1], outputs[0]);
for (int i=2; i<inputs.size(); ++i) {

View File

@ -1,61 +1,324 @@
#include "ConvDepthWiseExecution.hpp"
#include "core/ConvolutionCommon.hpp"
#include "Raster.cuh"
#include <float.h>
#include "MNNCUDADefine.hpp"
#include "MNNCUDAFunction.cuh"
namespace MNN {
namespace CUDA {
struct constBuffer {
int pad[2];
int kernelSize[2];
int stride[2];
int dilate[2];
int inputSize[2];
int outputSize[2];
int channel;
int subChannel;
int total;
int activationType;
} uConstant;
#define PACK_NUMBER_C2 (PACK_NUMBER/2)
ConvDepthWiseExecution::ConvDepthWiseExecution(const Op* op, Backend* bn) : Execution(bn) {
#define MNN_CUDA_HALF2_MAX(a, b) \
do { \
(a).x = __hgt((a).x, (b).x) ? (a).x : (b).x; \
(a).y = __hgt((a).y, (b).y) ? (a).y : (b).y; \
} while (0)
#define MNN_CUDA_HALF2_MIN(a, b) \
do { \
(a).x = __hlt((a).x, (b).x) ? (a).x : (b).x; \
(a).y = __hlt((a).y, (b).y) ? (a).y : (b).y; \
} while (0)
__global__ void CONV_DW_HALF(const half2* input, const half2* kernel, const half2* bias, half2 *output, const constBuffer* uConstant) {
half2 maxV = half2(uConstant->maxValue, uConstant->maxValue);
half2 minV = half2(uConstant->minValue, uConstant->minValue);
int iw = uConstant->inputSize[0];
int ih = uConstant->inputSize[1];
int c = uConstant->channel;
int ow = uConstant->outputSize[0];
int oh = uConstant->outputSize[1];
int kw = uConstant->kernelSize[0];
int kh = uConstant->kernelSize[1];
int dw = uConstant->dilate[0];
int dh = uConstant->dilate[1];
int sw = uConstant->stride[0];
int sh = uConstant->stride[1];
int pw = uConstant->pad[0];
int ph = uConstant->pad[1];
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) {
int i = index / PACK_NUMBER_C2;
int zR = index % PACK_NUMBER_C2;
int oz = i / (ow * oh);
int tmp = i % (ow * oh);
int oy = tmp / ow;
int ox = tmp % ow;
int kz = oz / uConstant->batch;
int ix = ox * sw - pw;
int iy = oy * sh - ph;
half2 color = bias[kz * PACK_NUMBER_C2 + zR];
int fxSta = max(0, (UP_DIV(-ix, dw)));
int fySta = max(0, (UP_DIV(-iy, dh)));
int fxEnd = min(kw, UP_DIV(iw - ix, dw));
int fyEnd = min(kh, UP_DIV(ih - iy, dh));
int fx, fy, fz;
for (fy=fySta; fy<fyEnd; ++fy) {
int sy = fy*dh + iy;
for (fx=fxSta; fx<fxEnd; ++fx) {
int sx = fx*dw + ix;
half2 inp = input[0
+ sx * PACK_NUMBER_C2
+ sy * iw * PACK_NUMBER_C2
+ oz * iw * ih * PACK_NUMBER_C2
+ zR
];
half2 ker = kernel[0
+ fx * PACK_NUMBER_C2
+ fy * kw * PACK_NUMBER_C2
+ kz * kw * kh * PACK_NUMBER_C2
+ zR
];
color = __hfma2(inp, ker, color);
}
}
MNN_CUDA_HALF2_MAX(color, minV);
MNN_CUDA_HALF2_MIN(color, maxV);
output[0
+ zR
+ ox * PACK_NUMBER_C2
+ oy * ow * PACK_NUMBER_C2
+ oz * ow * oh * PACK_NUMBER_C2
] = color;
}
}
__global__ void CONV_DW(const float* input, const half* kernel, const half* bias, float *output, const constBuffer* uConstant) {
float maxV = uConstant->maxValue;
float minV = uConstant->minValue;
int iw = uConstant->inputSize[0];
int ih = uConstant->inputSize[1];
int c = uConstant->channel;
int ow = uConstant->outputSize[0];
int oh = uConstant->outputSize[1];
int kw = uConstant->kernelSize[0];
int kh = uConstant->kernelSize[1];
int dw = uConstant->dilate[0];
int dh = uConstant->dilate[1];
int sw = uConstant->stride[0];
int sh = uConstant->stride[1];
int pw = uConstant->pad[0];
int ph = uConstant->pad[1];
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) {
int i = index / PACK_NUMBER;
int zR = index % PACK_NUMBER;
int oz = i / (ow * oh);
int tmp = i % (ow * oh);
int oy = tmp / ow;
int ox = tmp % ow;
int kz = oz / uConstant->batch;
int ix = ox * sw - pw;
int iy = oy * sh - ph;
float color = bias[kz * PACK_NUMBER + zR];
int fxSta = max(0, (UP_DIV(-ix, dw)));
int fySta = max(0, (UP_DIV(-iy, dh)));
int fxEnd = min(kw, UP_DIV(iw - ix, dw));
int fyEnd = min(kh, UP_DIV(ih - iy, dh));
int fx, fy, fz;
for (fy=fySta; fy<fyEnd; ++fy) {
int sy = fy*dh + iy;
for (fx=fxSta; fx<fxEnd; ++fx) {
int sx = fx*dw + ix;
float inp = input[0
+ sx * PACK_NUMBER
+ sy * iw * PACK_NUMBER
+ oz * iw * ih * PACK_NUMBER
+ zR
];
float ker = kernel[0
+ fx * PACK_NUMBER
+ fy * kw * PACK_NUMBER
+ kz * kw * kh * PACK_NUMBER
+ zR
];
color = color + inp * ker;
}
}
color = max(color, minV);
color = min(color, maxV);
output[0
+ zR
+ ox * PACK_NUMBER
+ oy * ow * PACK_NUMBER
+ oz * ow * oh * PACK_NUMBER
] = color;
}
}
__global__ void CONV_DW_OPT(const float* input, const half* kernel, const half* bias, float *output, const constBuffer* uConstant,
DivModFast d_owh,
DivModFast d_ow,
DivModFast d_ob
) {
float maxV = uConstant->maxValue;
float minV = uConstant->minValue;
int iw = uConstant->inputSize[0];
int ih = uConstant->inputSize[1];
int kw = uConstant->kernelSize[0];
int kh = uConstant->kernelSize[1];
int sw = uConstant->stride[0];
int sh = uConstant->stride[1];
int pw = uConstant->pad[0];
int ph = uConstant->pad[1];
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < uConstant->total; index += blockDim.x * gridDim.x) {
int i = index >> 4;
int zR = index & 15;
int oz, tmp, oy, ox, kz, unuse;
d_owh.divmod(i, oz, tmp);
d_ow.divmod(tmp, oy, ox);
d_ob.divmod(oz, kz, unuse);
int ix = ox * sw - pw;
int iy = oy * sh - ph;
float color = bias[(kz << 4) + zR];
int fxSta = max(0, -ix);
int fySta = max(0, -iy);
int fxEnd = min(kw, iw - ix);
int fyEnd = min(kh, ih - iy);
int fx, fy, fz;
for (fy=fySta; fy<fyEnd; ++fy) {
int sy = fy + iy;
for (fx=fxSta; fx<fxEnd; ++fx) {
int sx = fx + ix;
float inp = input[0
+ ((sx + iw * (sy + oz * ih)) << 4)
+ zR
];
float ker = kernel[0
+ ((fx + kw * (fy + kz * kh)) << 4)
+ zR
];
color = color + inp * ker;
}
}
color = max(color, minV);
color = min(color, maxV);
output[index] = color;
}
return;
}
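// CONV_DW_OPT hard-codes the PACK_NUMBER == 16 case ("index >> 4" / "index & 15") and
// uses DivModFast from MNNCUDAFunction.cuh to turn the remaining divisions by
// runtime constants into multiply-and-shift sequences. A plain-arithmetic sketch of
// the same decomposition (hypothetical helper; assumes divmod(v, q, r) yields
// quotient q and remainder r, matching the plain-division form in CONV_DW above):
static inline void decomposeIndexPlain(int index, int ow, int oh, int batch,
                                       int& zR, int& ox, int& oy, int& kz) {
    int i   = index / 16;       // packed position
    zR      = index % 16;       // lane inside the 16-channel pack
    int oz  = i / (ow * oh);    // d_owh.divmod(i, oz, tmp)
    int tmp = i % (ow * oh);
    oy      = tmp / ow;         // d_ow.divmod(tmp, oy, ox)
    ox      = tmp % ow;
    kz      = oz / batch;       // d_ob.divmod(oz, kz, unuse)
}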
static std::shared_ptr<ConvDepthWiseExecution::Resource> _makeResource(const Op* op, Backend* bn) {
std::shared_ptr<ConvDepthWiseExecution::Resource> res(new ConvDepthWiseExecution::Resource);
auto pool = static_cast<CUDABackend*>(bn)->getStaticBufferPool();
auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime();
auto conv = op->main_as_Convolution2D();
auto convCommon = conv->common();
int kernelX = convCommon->kernelX();
int kernelY = convCommon->kernelY();
int depth = convCommon->outputCount();
int depthC = UP_DIV(depth, PACK_NUMBER);
res->weightTensor.reset(Tensor::createDevice<float>({kernelX * kernelY * depthC * PACK_NUMBER}));
bool success = bn->onAcquireBuffer(res->weightTensor.get(), Backend::STATIC);
if (!success) {
return nullptr;
}
res->mFilter = (void *)res->weightTensor.get()->buffer().device;
FuseRegion reg;
int offset[8 * PACK_NUMBER];
auto regionStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion));
auto offsetGpuStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(offset));
auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second;
//weight host->device
const float* filterDataPtr = nullptr;
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize);
auto tempWeightStorage = pool->alloc(weightSize * sizeof(float));
auto tempWeight = (uint8_t*)tempWeightStorage.first + tempWeightStorage.second;
cuda_check(cudaMemcpy(tempWeight, filterDataPtr, weightSize*sizeof(float), cudaMemcpyHostToDevice));
reg.size[0] = 1;
reg.size[1] = depthC;
reg.size[2] = kernelX * kernelY;
reg.srcStride[0] = 0;
reg.srcStride[1] = PACK_NUMBER * kernelX * kernelY;
reg.srcStride[2] = 1;
reg.dstStride[0] = 0;
reg.dstStride[1] = kernelX * kernelY * PACK_NUMBER;
reg.dstStride[2] = PACK_NUMBER;
reg.fuseNumber = PACK_NUMBER;
for (int v=0; v<PACK_NUMBER; ++v) {
auto off = offset + 8 * v;
// Src
off[0] = 1;
off[1] = (depth + PACK_NUMBER - v - 1) / PACK_NUMBER;
off[2] = reg.size[2];
off[3] = v * kernelX * kernelY;
// Dst
off[4] = 1;
off[5] = depthC;
off[6] = reg.size[2];
off[7] = v;
}
runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
runtime->memcpy(offsetGpu, offset, 8 * PACK_NUMBER * sizeof(int), MNNMemcpyHostToDevice, true);
FuseRasterBlitFloatToHalf((uint8_t*)res->mFilter, (uint8_t*)tempWeight, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
pool->free(tempWeightStorage);
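// The FuseRegion/offset setup above rearranges the raw depthwise weights (stored as
// [depth, kernelY * kernelX] floats) into the packed half layout
// [depthC, kernelY * kernelX, PACK_NUMBER] that CONV_DW expects: each of the
// PACK_NUMBER fused offset entries copies one channel lane, so lane v reads source
// channels v, v + PACK_NUMBER, ... and writes them at inner stride PACK_NUMBER,
// with FuseRasterBlitFloatToHalf doing the float -> half conversion on the fly.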
res->biasTensor.reset(Tensor::createDevice<float>({depthC * PACK_NUMBER}));
success = bn->onAcquireBuffer(res->biasTensor.get(), Backend::STATIC);
res->mBias = (void *)res->biasTensor.get()->buffer().device;
if (!success) {
return nullptr;
}
if(conv->bias() != nullptr) {
auto tempBiasStorage = pool->alloc(depth * sizeof(float));
auto tempBias = (uint8_t*)tempBiasStorage.first + tempBiasStorage.second;
cuda_check(cudaMemcpy(tempBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
reg.size[0] = 1;
reg.size[1] = 1;
reg.size[2] = depthC * PACK_NUMBER;
reg.srcStride[0] = 0;
reg.srcStride[1] = 0;
reg.srcStride[2] = 1;
reg.dstStride[0] = 0;
reg.dstStride[1] = 0;
reg.dstStride[2] = 1;
offset[0] = 1;
offset[1] = 1;
offset[2] = conv->bias()->size();
offset[3] = 0;
offset[4] = 1;
offset[5] = 1;
offset[6] = reg.size[2];
offset[7] = 0;
reg.fuseNumber = 1;
runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
runtime->memcpy(offsetGpu, offset, 8 * sizeof(int), MNNMemcpyHostToDevice, true);
FuseRasterBlitFloatToHalf((uint8_t*)res->mBias, (uint8_t*)tempBias, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
pool->free(tempBiasStorage);
}
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(regionStorage);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(offsetGpuStorage);
return res;
}
ConvDepthWiseExecution::ConvDepthWiseExecution(const Op* op, Backend* bn, std::shared_ptr<Resource> resource) : Execution(bn) {
mOp = op;
mResource = resource;
auto pool = static_cast<CUDABackend*>(bn)->getStaticBufferPool();
mConstBuffer = pool->alloc(sizeof(constBuffer));
auto conv = mOp->main_as_Convolution2D();
//weight host->device
if(nullptr != conv->weight()) {
int weightSize = conv->weight()->size();
weightTensor.reset(Tensor::createDevice<float>({weightSize}));
backend()->onAcquireBuffer(weightTensor.get(), Backend::STATIC);
mFilter = (void *)weightTensor.get()->buffer().device;
cuda_check(cudaMemcpy(mFilter, conv->weight()->data(), conv->weight()->size()*sizeof(float), cudaMemcpyHostToDevice));
mBias = nullptr;
if(conv->bias()->size() != 0) {
int biasSize = conv->bias()->size();
biasTensor.reset(Tensor::createDevice<float>({biasSize}));
backend()->onAcquireBuffer(biasTensor.get(), Backend::STATIC);
mBias = (void *)biasTensor.get()->buffer().device;
cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
use_bias_ = true;
}
}
}
ConvDepthWiseExecution::~ ConvDepthWiseExecution() {
auto pool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
pool->free(mConstBuffer);
if (nullptr != weightTensor) {
backend()->onReleaseBuffer(weightTensor.get(), Backend::STATIC);
}
if(use_bias_ && nullptr != biasTensor) {
backend()->onReleaseBuffer(biasTensor.get(), Backend::STATIC);
}
}
ErrorCode ConvDepthWiseExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto pad = ConvolutionCommon::convolutionPad(inputs[0], outputs[0], mOp->main_as_Convolution2D()->common());
auto conv = mOp->main_as_Convolution2D();
auto convCommon = mOp->main_as_Convolution2D()->common();
constBuffer parameters;
int channel = inputs[0]->channel();
int channelDiv = UP_DIV(channel, PACK_NUMBER);
parameters.pad[0] = pad.first;
parameters.pad[1] = pad.second;
parameters.kernelSize[0] = convCommon->kernelX();
@ -66,233 +329,82 @@ ErrorCode ConvDepthWiseExecution::onResize(const std::vector<Tensor *> &inputs,
parameters.dilate[1] = convCommon->dilateY();
parameters.inputSize[0] = inputs[0]->width();
parameters.inputSize[1] = inputs[0]->height();
parameters.channel = inputs[0]->batch() * inputs[0]->channel();
parameters.channel = inputs[0]->batch() * channelDiv;
parameters.outputSize[0] = outputs[0]->width();
parameters.outputSize[1] = outputs[0]->height();
parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0];
parameters.subChannel = inputs[0]->channel();
parameters.activationType = convCommon->relu() ? 1 : (convCommon->relu6() ? 2 : 0);
if (static_cast<CUDABackend*>(backend())->useFp16()) {
parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0] * PACK_NUMBER_C2;
} else {
parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0] * PACK_NUMBER;
parameters.minValue = -FLT_MAX;
parameters.maxValue = FLT_MAX;
}
parameters.batch = inputs[0]->batch();
if (convCommon->relu()) {
parameters.minValue = 0.0f;
}
if (convCommon->relu6()) {
parameters.minValue = 0.0f;
parameters.maxValue = 6.0f;
}
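// The activation is folded into the clamp range read by the kernels: no activation
// keeps the defaults (+/-FLT_MAX for fp32, +/-65504 for fp16, the largest finite
// half), relu clamps to [0, +max], and relu6 clamps to [0, 6].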
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
runtime->memcpy((uint8_t*)mConstBuffer.first + mConstBuffer.second, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice);
mTotalCount = parameters.total;
//printf("%d-%d-%d-%d, %d-%d-%d-%d-%d\n", parameters.kernelSize[0], parameters.kernelSize[1], parameters.stride[0], parameters.stride[1], parameters.inputSize[0], parameters.inputSize[1], channel, parameters.outputSize[0], parameters.outputSize[1]);
return NO_ERROR;
}
__global__ void CONV_DW(const float* input, const float* kernel, const float* bias, float *output, const constBuffer* uConstant) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < uConstant->total; i += blockDim.x * gridDim.x) {
{
int iw = uConstant->inputSize[0];
int ih = uConstant->inputSize[1];
int c = uConstant->channel;
int ow = uConstant->outputSize[0];
int oh = uConstant->outputSize[1];
int kw = uConstant->kernelSize[0];
int kh = uConstant->kernelSize[1];
int dw = uConstant->dilate[0];
int dh = uConstant->dilate[1];
int sw = uConstant->stride[0];
int sh = uConstant->stride[1];
int pw = uConstant->pad[0];
int ph = uConstant->pad[1];
int acttype = uConstant->activationType;
int oz = i / (ow * oh);
int tmp = i % (ow * oh);
int oy = tmp / ow;
int ox = tmp % ow;
int kz = oz % uConstant->subChannel;
int ix = ox * sw - pw;
int iy = oy * sh - ph;
float color = 0.0;
if (bias != nullptr) {
color = bias[kz];
}
int fx, fy, fz;
for (fy=0; fy<kh; ++fy) {
int sy = fy*dh + iy;
if (sy >= ih || sy < 0) {
continue;
}
for (fx=0; fx<kw; ++fx) {
int sx = fx*dw + ix;
if (sx >= iw || sx < 0) {
continue;
}
float inputValue = input[0
+ sx
+ sy * iw
+ oz * iw * ih
];
float k = kernel[0
+ fx
+ fy * kw
+ kz * kw * kh
];
color += k*inputValue;
}
}
color = (acttype==1) ? max(0.0, color) : (acttype==2 ? (min(max(0.0, color), 6.0)) : color);
output[0
+ ox
+ oy * ow
+ oz * ow * oh
] = color;
}
}
return;
}
ErrorCode ConvDepthWiseExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto& prop = runtime->prop();
int threads_num = prop.maxThreadsPerBlock;
int limitThreads = UP_DIV(mTotalCount, prop.multiProcessorCount);
int threads_num = ALIMIN(prop.maxThreadsPerBlock, limitThreads);
int block_num = prop.multiProcessorCount;
auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second;
if (inputs.size() == 1) {
CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)mFilter,
(const float*)mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr));
} else if (inputs.size() == 3) {
CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
(const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
} else {
MNN_ASSERT(inputs.size() == 2);
CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
if (static_cast<CUDABackend*>(backend())->useFp16()) {
if (inputs.size() == 1) {
CONV_DW_HALF<<<block_num, threads_num>>>((const half2*)inputs[0]->deviceId(), (const half2*)mResource->mFilter,
(const half2*)mResource->mBias, (half2*)outputs[0]->deviceId(), (const constBuffer*)(constPtr));
}
return NO_ERROR;
}
return NO_ERROR;
}
__global__ void DECONV_DW(const float* input, const float* kernel, const float* bias, float *output, const constBuffer* uConstant) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < uConstant->total; i += blockDim.x * gridDim.x) {
{
int iw = uConstant->inputSize[0];
int ih = uConstant->inputSize[1];
int c = uConstant->channel;
int ow = uConstant->outputSize[0];
int oh = uConstant->outputSize[1];
int kw = uConstant->kernelSize[0];
int kh = uConstant->kernelSize[1];
int dw = uConstant->dilate[0];
int dh = uConstant->dilate[1];
int sw = uConstant->stride[0];
int sh = uConstant->stride[1];
int pw = uConstant->pad[0];
int ph = uConstant->pad[1];
int oz = i / (ow * oh);
int tmp = i % (ow * oh);
int oy = tmp / ow;
int ox = tmp % ow;
int kz = oz % uConstant->subChannel;
if (inputs.size() == 1) {
// block_num = runtime->blocks_num(mTotalCount);
// threads_num = runtime->threads_num();
if(parameters.dilate[0] == 1 && parameters.dilate[1] == 1) {
const int area = parameters.outputSize[0] * parameters.outputSize[1];
DivModFast d_owh(area);
DivModFast d_ow(parameters.outputSize[0]);
DivModFast d_ob(outputs[0]->batch());
int ix = ox + pw;
int iy = oy + ph;
float color = 0.0;
if (bias != nullptr) {
color = bias[kz];
}
int fx, fy, fz;
for (fy=0; fy<kh; ++fy) {
int sy = iy - fy*dh;
int y = sy / sh;
if (sy % sh == 0 && y >= 0 && y < ih) {
for (int fx=0; fx<kw; ++fx) {
int sx = ix - fx*dw;
int x = sx / sw;
if (sx % sw == 0 && x >= 0 && x < iw) {
float inputValue = input[0
+ x
+ y * iw
+ oz * iw * ih
];
float k = kernel[0
+ fx
+ fy * kw
+ kz * kw * kh
];
color += k*inputValue;
}
}
}
}
output[0
+ ox
+ oy * ow
+ oz * ow * oh
] = color;
CONV_DW_OPT<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter,
(const half*)mResource->mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr),
d_owh, d_ow, d_ob);
} else {
CONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const half*)mResource->mFilter,
(const half*)mResource->mBias, (float*)outputs[0]->deviceId(), (const constBuffer*)(constPtr));
}
}
return;
}
ErrorCode DeconvDepthWiseExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto convCommon = mOp->main_as_Convolution2D()->common();
auto pad = ConvolutionCommon::convolutionTransposePad(inputs[0], outputs[0], convCommon);
constBuffer parameters;
parameters.pad[0] = pad.first;
parameters.pad[1] = pad.second;
parameters.kernelSize[0] = convCommon->kernelX();
parameters.kernelSize[1] = convCommon->kernelY();
parameters.stride[0] = convCommon->strideX();
parameters.stride[1] = convCommon->strideY();
parameters.dilate[0] = convCommon->dilateX();
parameters.dilate[1] = convCommon->dilateY();
parameters.inputSize[0] = inputs[0]->width();
parameters.inputSize[1] = inputs[0]->height();
parameters.channel = inputs[0]->batch() * inputs[0]->channel();
parameters.outputSize[0] = outputs[0]->width();
parameters.outputSize[1] = outputs[0]->height();
parameters.total = parameters.channel * parameters.outputSize[1] * parameters.outputSize[0];
parameters.subChannel = inputs[0]->channel();
auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second;
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
runtime->memcpy(constPtr, &parameters, sizeof(constBuffer), MNNMemcpyHostToDevice);
mTotalCount = parameters.total;
return NO_ERROR;
}
ErrorCode DeconvDepthWiseExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
int block_num = runtime->blocks_num(mTotalCount);
int threads_num = runtime->threads_num();
auto constPtr = (uint8_t*)mConstBuffer.first + mConstBuffer.second;
if (inputs.size() > 2) {
DECONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
(const float*)inputs[2]->deviceId(), (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
} else {
DECONV_DW<<<block_num, threads_num>>>((const float*)inputs[0]->deviceId(), (const float*)inputs[1]->deviceId(),
nullptr, (float*)outputs[0]->deviceId(), (const constBuffer*)constPtr);
}
return NO_ERROR;
}
class ConvDepthWiseExecutionCreator : public CUDABackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
if (OpType_ConvolutionDepthwise == op->type()) {
return new ConvDepthWiseExecution(op, backend);
}
if (inputs.size() == 1) {
MNN_PRINT("deconv depthwise not support 1 input yet\n");
if (inputs.size() > 1) {
return nullptr;
}
return new DeconvDepthWiseExecution(op, backend);
auto res = _makeResource(op, backend);
if (nullptr == res) {
return nullptr;
}
return new ConvDepthWiseExecution(op, backend, res);
}
};
static CUDACreatorRegister<ConvDepthWiseExecutionCreator> __init(OpType_ConvolutionDepthwise);
static CUDACreatorRegister<ConvDepthWiseExecutionCreator> __init2(OpType_DeconvolutionDepthwise);
}
}

View File

@ -14,9 +14,30 @@
#include "core/Execution.hpp"
namespace MNN {
namespace CUDA {
struct constBuffer {
int pad[2];
int kernelSize[2];
int stride[2];
int dilate[2];
int inputSize[2];
int outputSize[2];
int channel;
int total;
int batch;
float minValue = -65504.0f;
float maxValue = 65504.0f;
} uConstant;
class ConvDepthWiseExecution : public Execution {
public:
ConvDepthWiseExecution(const Op *op, Backend *bn);
struct Resource {
std::shared_ptr<Tensor> weightTensor;
std::shared_ptr<Tensor> biasTensor;
void* mFilter;
void* mBias;
};
ConvDepthWiseExecution(const Op *op, Backend *bn, std::shared_ptr<Resource> resource);
virtual ~ConvDepthWiseExecution();
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
@ -25,17 +46,13 @@ protected:
std::pair<void*, int> mConstBuffer;
const Op *mOp;
int mTotalCount;
void* mFilter;
void* mBias;
std::shared_ptr<Tensor> weightTensor;
std::shared_ptr<Tensor> biasTensor;
bool use_bias_=false;
constBuffer parameters;
std::shared_ptr<Resource> mResource;
};
class DeconvDepthWiseExecution : public ConvDepthWiseExecution {
public:
DeconvDepthWiseExecution(const Op *op, Backend *bn) : ConvDepthWiseExecution(op, bn) {
DeconvDepthWiseExecution(const Op *op, Backend *bn, std::shared_ptr<Resource> resource) : ConvDepthWiseExecution(op, bn, resource) {
// Do nothing
}
virtual ~DeconvDepthWiseExecution() {

View File

@ -7,55 +7,52 @@
//
#include "ConvSingleInputExecution.hpp"
#include "Raster.cuh"
#include "MNNCUDADefine.hpp"
#include "MNNCUDAFunction.cuh"
// 16 / sizeof(int4)
namespace MNN {
namespace CUDA {
__global__ void Im2Col(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const float* A,
__half* AP) {
int eAlign = matmulParam->elhPack[0] * MATMULPACK;
int lAlign = matmulParam->elhPack[1] * MATMULPACK;
int maxCount = eAlign * lAlign;
int kernelCount = param->kernelX * param->kernelY;
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < maxCount; index += blockDim.x * gridDim.x) {
int eIndex = index % eAlign;
int lIndex = index / eAlign;
// Compute for dest
int eU = eIndex / MATMULPACK;
int eR = eIndex % MATMULPACK;
int lU = lIndex / MATMULPACK;
int lR = lIndex % MATMULPACK;
auto dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lU * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR;
if (eIndex >= matmulParam->elh[0] || lIndex >= matmulParam->elh[1]) {
AP[dstOffset] = 0.0;
__global__ void KernelReorder(const float* B, half* BP, int kw, int kh, int ic, int oc, int ocPack) {
int icC4 = UP_DIV(ic, PACK_NUMBER);
int kernelCount = kw * kh;
int l = icC4 * kernelCount * PACK_NUMBER;
int h = oc;
int lDiv = UP_DIV(l, MATMULPACK);
int lAlign = lDiv * MATMULPACK;
int hAlign = UP_DIV(h, ocPack) * ocPack;
int maxCount = hAlign * lAlign;
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
int lR = indexO % MATMULPACK;
int tmp = indexO / MATMULPACK;
int hR = tmp % ocPack;
int tmp2 = tmp / ocPack;
int lC = tmp2 % lDiv;
int hC = tmp2 / lDiv;
half* dst = BP + indexO;
int sH = hC * ocPack + hR;
int sL = lC * MATMULPACK + lR;
if (sH >= oc) {
*dst = 0.0;
continue;
}
// Compute for source
int ox = eIndex % param->ow;
int oy = eIndex / param->ow;
int ob = oy / param->oh;
oy = oy % param->oh;
int sz = lIndex / kernelCount;
int kI = lIndex % kernelCount;
int ksx = kI % param->kernelX;
int ksy = kI / param->kernelX;
int sx = ox * param->strideX + ksx * param->dilateX - param->padX;
int sy = oy * param->strideY + ksy * param->dilateY - param->padY;
if (sx >= 0 && sx < param->iw) {
if (sy >=0 && sy < param->ih) {
__half value = A[sz * param->ih * param->iw + ob * param->iw * param->ih * param->icDiv4 + sy * param->iw + sx];
AP[dstOffset] = value;
continue;
}
int sLR = sL % PACK_NUMBER;
int sLC = sL / PACK_NUMBER;
int iLC = sLC / (kernelCount);
int ik = sLC % kernelCount;
int iz = iLC * PACK_NUMBER + sLR;
if (iz >= ic) {
*dst = 0.0;
continue;
}
AP[dstOffset] = 0.0;
const float* src = B + sH * kernelCount * ic + ik + iz * kernelCount;
*dst = *src;
}
}
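// KernelReorder repacks the raw [oc, ic, kernelY * kernelX] float weights into half
// precision tiles laid out as [UP_DIV(oc, ocPack), UP_DIV(l, MATMULPACK), ocPack,
// MATMULPACK], where the l dimension itself is ordered as
// [icC4, kernelY * kernelX, PACK_NUMBER]; out-of-range oc / ic positions are
// zero-padded so the tensor-core GEMM can consume full 16-wide tiles without
// bounds checks.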
ConvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
mBackend = bn;
auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime();
@ -78,40 +75,91 @@ ConvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize);
mKernelInfo.kernelN = common->outputCount();
mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY;
int icDiv = UP_DIV(mKernelInfo.kernelC, PACK_NUMBER);
MatMulParam param;
int e = 0;
int l = mKernelInfo.kernelX * mKernelInfo.kernelY * mKernelInfo.kernelC;
int l = mKernelInfo.kernelX * mKernelInfo.kernelY * icDiv * MATMULPACK;
int h = mKernelInfo.kernelN;
param.elh[0] = e;
param.elh[1] = l;
param.elh[2] = h;
param.elhPack[0] = UP_DIV(e, 16);
param.elhPack[1] = UP_DIV(l, 16);
param.elhPack[2] = UP_DIV(h, 16);
param.elhPack[0] = UP_DIV(e, MATMULPACK);
param.elhPack[1] = UP_DIV(l, MATMULPACK);
param.elhPack[2] = UP_DIV(h, MATMULPACK);
param.bStride[0] = 0;
param.bStride[1] = 1;
param.bStride[2] = l;
auto gpuParam = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(MatMulParam));
auto tempCacheBuffer = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float));
float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second);
runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice);
runtime->memcpy((uint8_t*)gpuParam.first + gpuParam.second, &param, sizeof(MatMulParam), MNNMemcpyHostToDevice);
FuseRegion reg;
int maxOffsetNumber = 8;
std::vector<int> offset(maxOffsetNumber);
auto regionStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(FuseRegion));
auto offsetGpuStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(int) * maxOffsetNumber);
auto offsetGpu = (uint8_t*)offsetGpuStorage.first + offsetGpuStorage.second;
// Reorder weight
weightTensor.reset(Tensor::createDevice<int16_t>({param.elhPack[1] * param.elhPack[2] * (MATMULPACK * MATMULPACK)}));
bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC);
mFilter = (void *)weightTensor.get()->buffer().device;
GemmPrepareRerange(runtime, &param, (const MatMulParam*)((uint8_t*)gpuParam.first + gpuParam.second), nullptr, nullptr, cacheWeight, (__half*)mFilter);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempCacheBuffer);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(gpuParam);
{
auto tempCacheBuffer = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float));
float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second);
runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice);
weightTensor.reset(Tensor::createDevice<int16_t>({param.elhPack[1] * param.elhPack[2] * (MATMULPACK * MATMULPACK)}));
bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC);
mFilter = (void *)weightTensor.get()->buffer().device;
auto& prop = runtime->prop();
int cores = prop.multiProcessorCount;
int threadNumbers = prop.maxThreadsPerBlock;
if (param.elhPack[2] % 2 == 0) {
KernelReorder<<<cores, threadNumbers>>>((float*)cacheWeight, (half*)mFilter,
mKernelInfo.kernelX, mKernelInfo.kernelY, mKernelInfo.kernelC, mKernelInfo.kernelN, 32);
mUsePack = true;
} else {
KernelReorder<<<cores, threadNumbers>>>((float*)cacheWeight, (half*)mFilter,
mKernelInfo.kernelX, mKernelInfo.kernelY, mKernelInfo.kernelC, mKernelInfo.kernelN, MATMULPACK);
}
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempCacheBuffer);
}
// Copy Bias
int biasSize = conv->bias()->size();
biasTensor.reset(Tensor::createDevice<float>({biasSize}));
bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC);
auto tempBiasStorage = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(conv->bias()->size()*sizeof(float));
auto biasTemp = (float*)((uint8_t*)tempBiasStorage.first + tempBiasStorage.second);
cuda_check(cudaMemcpy(biasTemp, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
// FP32 -> FP16
mBias = (void *)biasTensor.get()->buffer().device;
cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
int alignSize = UP_DIV(conv->bias()->size(), PACK_NUMBER) * PACK_NUMBER;
reg.size[0] = 1;
reg.size[1] = 1;
reg.size[2] = alignSize;
reg.srcStride[0] = 0;
reg.srcStride[1] = 0;
reg.srcStride[2] = 1;
reg.dstStride[0] = 0;
reg.dstStride[1] = 0;
reg.dstStride[2] = 1;
offset[0] = 1;
offset[1] = 1;
offset[2] = conv->bias()->size();
offset[3] = 0;
offset[4] = 1;
offset[5] = 1;
offset[6] = reg.size[2];
offset[7] = 0;
reg.fuseNumber = 1;
runtime->memcpy((uint8_t*)regionStorage.first + regionStorage.second, &reg, sizeof(FuseRegion), MNNMemcpyHostToDevice, true);
runtime->memcpy(offsetGpu, offset.data(), 8 * sizeof(int), MNNMemcpyHostToDevice, true);
if (static_cast<CUDABackend*>(bn)->useFp16()) {
FuseRasterBlitFloatToHalf((uint8_t*)mBias, (uint8_t*)biasTemp, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime);
} else {
FuseRasterBlitCommon((uint8_t*)mBias, (uint8_t*)biasTemp, (FuseRegion*)((uint8_t*)regionStorage.first + regionStorage.second), offsetGpu, runtime, 4);
}
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(regionStorage);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(offsetGpuStorage);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempBiasStorage);
}
ConvSingleInputExecution::Resource::~Resource() {
@ -146,14 +194,16 @@ bool ConvSingleInputExecution::onClone(Backend* bn, const Op* op, Execution** ds
ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto input = inputs[0], output = outputs[0];
const int UNIT = 1;
const int UNIT = PACK_NUMBER;
auto convCommon = mOp->main_as_Convolution2D()->common();
auto pads = ConvolutionCommon::convolutionPadFull(input, output, mOp->main_as_Convolution2D()->common());
int ic = input->channel();
int icDiv = UP_DIV(ic, PACK_NUMBER);
mIm2ColParamter.dilateX = convCommon->dilateX();
mIm2ColParamter.dilateY = convCommon->dilateY();
mIm2ColParamter.strideX = convCommon->strideX();
mIm2ColParamter.strideY = convCommon->strideY();
mIm2ColParamter.icDiv4 = input->channel();
mIm2ColParamter.icDiv4 = icDiv;
mIm2ColParamter.kernelX = convCommon->kernelX();
mIm2ColParamter.kernelY = convCommon->kernelY();
mIm2ColParamter.padX = std::get<0>(pads);
@ -169,21 +219,21 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,
runtime->memcpy((uint8_t*)mGpuIm2ColParam.first + mGpuIm2ColParam.second, &mIm2ColParamter, sizeof(ConvolutionCommon::Im2ColParameter), MNNMemcpyHostToDevice);
//MNN_PRINT("conv size:%d-%d-%d, %d-%d-%d\n", input->height(), input->width(), input->channel(), output->height(), output->width(), output->channel());
int e = output->height() * output->width() * output->batch();
int l = input->channel() * mIm2ColParamter.kernelX * mIm2ColParamter.kernelY;
int l = icDiv * mIm2ColParamter.kernelX * mIm2ColParamter.kernelY * MATMULPACK;
int h = output->channel();
mMatMulParam.elh[0] = e;
mMatMulParam.elh[1] = l;
mMatMulParam.elh[2] = h;
mMatMulParam.elhPack[0] = UP_DIV(e, 16);
mMatMulParam.elhPack[1] = UP_DIV(l, 16);
mMatMulParam.elhPack[2] = UP_DIV(h, 16);
mMatMulParam.elhPack[0] = UP_DIV(e, MATMULPACK);
mMatMulParam.elhPack[1] = UP_DIV(l, MATMULPACK);
mMatMulParam.elhPack[2] = UP_DIV(h, MATMULPACK);
mMatMulParam.cStride[0] = mIm2ColParamter.ow * mIm2ColParamter.oh * h;
mMatMulParam.cStride[1] = 1;
mMatMulParam.cStride[2] = mIm2ColParamter.ow * mIm2ColParamter.oh;
mMatMulParam.split[0] = 1;
mMatMulParam.split[1] = 1;
mMatMulParam.split[2] = mIm2ColParamter.ow * mIm2ColParamter.oh;
mMatMulParam.minValue = -FLT_MAX;
mMatMulParam.maxValue = FLT_MAX;
if (convCommon->relu()) {
mMatMulParam.minValue = 0.0f;
}
@ -191,12 +241,14 @@ ErrorCode ConvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs,
mMatMulParam.minValue = 0.0f;
mMatMulParam.maxValue = 6.0f;
}
//MNN_PRINT("Im2Col temp size:%d!!!\n\n", mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK);
runtime->memcpy((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second, &mMatMulParam, sizeof(MatMulParam), MNNMemcpyHostToDevice);
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
auto buffer = pool->alloc(sizeof(__half) * mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK);
auto buffer = pool->alloc((size_t)sizeof(__half) * (size_t)mMatMulParam.elhPack[0] * (size_t)mMatMulParam.elhPack[1] * (size_t)MATMULPACK * (size_t)MATMULPACK);
mIm2ColBuffer = (__half*)((uint8_t*)buffer.first + buffer.second);
pool->free(buffer);
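// Casting every factor to size_t before multiplying keeps the im2col buffer size
// from overflowing 32-bit int arithmetic: the product is
// elhPack[0] * elhPack[1] * 16 * 16 half elements, which can exceed 2^31 bytes for
// large feature maps.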
return NO_ERROR;
}
@ -204,21 +256,28 @@ ErrorCode ConvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs
//MNN_PRINT("cuda convSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_);
MNN_ASSERT(inputs.size() == 1);
MNN_ASSERT(outputs.size() == 1);
auto input = inputs[0];
auto output = outputs[0];
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(input);
const void *input_addr = (const void*)inputs[0]->deviceId();
const void *filter_addr = mResource->mFilter;
const void *bias_addr = mResource->mBias;
auto bn = backend();
void *output_addr = (void*)outputs[0]->deviceId();
auto& prop = runtime->prop();
int threads_num = prop.maxThreadsPerBlock;
int cores = prop.multiProcessorCount;
auto gpuIm2Col = (const ConvolutionCommon::Im2ColParameter*)((uint8_t*)mGpuIm2ColParam.first + mGpuIm2ColParam.second);
auto gpuMatMul = (const MatMulParam*)((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second);
//runtime->memset(mIm2ColBuffer, 0, mMatMulParam.elhPack[0] * mMatMulParam.elhPack[1] * sizeof(__half) * (MATMULPACK * MATMULPACK));
Im2Col<<<cores, threads_num>>>(gpuIm2Col, gpuMatMul, (const float*)input_addr, mIm2ColBuffer);
GemmPackedMain(runtime, &mMatMulParam, gpuMatMul, (float*)output_addr, (const __half*)mIm2ColBuffer, (const __half*)filter_addr, (const float*)bias_addr);
// Im2Col func
Im2ColMain(runtime, &mMatMulParam, gpuMatMul, &mIm2ColParamter, gpuIm2Col, (const float*)input_addr, mIm2ColBuffer, bytes);
if (mResource->mUsePack) {
GemmPacked16x32(runtime, &mMatMulParam, gpuMatMul, (float*)output_addr, (const __half*)mIm2ColBuffer, (const __half*)filter_addr, (const half*)bias_addr, bytes);
} else {
//printf("NotPack:%d-%d-%d-%d-%d, %d-%d-%d\n", mIm2ColParamter.icDiv4, mIm2ColParamter.ih, mIm2ColParamter.iw, mIm2ColParamter.oh, mIm2ColParamter.ow, mMatMulParam.elhPack[0], mMatMulParam.elhPack[1], mMatMulParam.elhPack[2]);
GemmPackedFullMain(runtime, &mMatMulParam, gpuMatMul, (float*)output_addr, (const __half*)mIm2ColBuffer, (const __half*)filter_addr, (const half*)bias_addr, bytes);
}
return NO_ERROR;
}

View File

@ -11,7 +11,9 @@
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
#include "TensorCoreGemm.cuh"
#include "TensorCoreGemmPacked.cuh"
#include "ImageColumn.cuh"
namespace MNN {
namespace CUDA {
@ -40,6 +42,7 @@ public:
std::shared_ptr<Tensor> biasTensor;
KernelInfo mKernelInfo;
Backend* mBackend = nullptr;
bool mUsePack = false;
};
ConvSingleInputExecution(Backend* backend, const MNN::Op* op, std::shared_ptr<Resource> res);
virtual ~ConvSingleInputExecution();
@ -58,6 +61,7 @@ private:
std::pair<void*, int> mGpuIm2ColParam;
__half* mIm2ColBuffer;
std::pair<void*, int> mGpuKernelParam;
};
} // namespace CUDA

View File

@ -11,263 +11,302 @@
namespace MNN {
namespace CUDA {
template <typename T>
__global__ void cutPad(const size_t size, const T* input, const int old_height,
const int old_width, const int height, const int width, const int pad_top,
const int pad_left, T* output) {
for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
int block_num = pos / (width*height);
int left = pos % (width*height);
const int out_w = left % width;
const int out_h = left / width % height;
__global__ void DeconvInputRerange(const int count,
const InputReorderParameter* param,
const float* Inp,
__half* InpRe
) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
int l = param->l_size;
int h = param->h_size;
int lIndex = i % l;
int hIndex = i / l;
int lU = lIndex / 16;
int lR = lIndex % 16;
int hU = hIndex / 16;
int hR = hIndex % 16;
output[pos] = input[(block_num * old_height + out_h + pad_top) * old_width + out_w + pad_left];
int bIndex = hIndex / param->hw_size;
int hwIndex = hIndex % param->hw_size;
float value = Inp[bIndex * param->ib_stride + lIndex * param->ic_stride + hwIndex];
//inpRe[lIndex * param->oc_stride + bIndex * param->ob_stride + hwIndex] = value;
//__half* dst = InpRe + lU * param->hpack_size * 16 * 16 + hU * 16 * 16 + hR + lR * 16;
__half* dst = InpRe + hU * param->lpack_size * 16 * 16 + lU * 16 * 16 + lR + hR * 16;
dst[0] = value;
}
return;
}
DeconvSingleInputExecution::DeconvSingleInputExecution(Backend* backend, const MNN::Op* op) : Execution(backend), mOp(op) {
//MNN_PRINT("cuda DeconvSingleInput onInit in\n");
template <typename Dtype>
__global__ void Col2Im(const int n, const Dtype* data_col,
const int batch, const int height, const int width, const int channels,
const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w,
const int stride_h, const int stride_w,
const int dilation_h, const int dilation_w,
const int height_col, const int width_col,
const Dtype* bias, Dtype* data_im) {
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < (n); index += blockDim.x * gridDim.x) {
Dtype val = 0;
const int b_im = index / (channels * width * height);
const int chw = index % (channels * width * height);
const int w_im = chw % width + pad_w;
const int h_im = (chw / width) % height + pad_h;
const int c_im = chw / (width * height);
int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
// compute the start and end of the output
const int w_col_start =
(w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start =
(h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// TODO: use LCM of stride and dilation to avoid unnecessary loops
for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
int h_k = (h_im - h_col * stride_h);
int w_k = (w_im - w_col * stride_w);
if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
int data_col_index = ((((c_im * kernel_h + h_k) * kernel_w + w_k) * batch + b_im) *
height_col + h_col) * width_col + w_col;
val += data_col[data_col_index];
}
}
}
if(nullptr != bias) {
val += bias[c_im];
}
data_im[index] = val;
}
}
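// Worked example for the loop bounds above (stride_w = 2, kernel_w = 3,
// dilation_w = 1): kernel_extent_w = 3, and for an output column w_im = 5 (pad
// already added),
//   w_col_start = (5 - 3) / 2 + 1 = 2,  w_col_end = min(5 / 2 + 1, width_col) = 3,
// so only im2col column 2 contributes, through kernel tap w_k = 5 - 2*2 = 1; columns
// 0 and 1 would need taps 5 and 3, which lie outside the 3-wide kernel.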
DeconvSingleInputExecution::Resource::Resource(Backend* bn, const MNN::Op* op) {
mBackend = bn;
auto runtime = static_cast<CUDABackend*>(bn)->getCUDARuntime();
auto conv = op->main_as_Convolution2D();
auto common = conv->common();
mKernelInfo.groups = common->group();
mKernelInfo.kernelX = common->kernelX();
mKernelInfo.kernelY = common->kernelY();
mKernelInfo.padMode = common->padMode();
mKernelInfo.padX = common->padX();
mKernelInfo.padY = common->padY();
if (nullptr != common->pads()) {
mKernelInfo.padX = common->pads()->data()[1];
mKernelInfo.padY = common->pads()->data()[0];
}
pad_left_ = mKernelInfo.padX;
pad_right_ = mKernelInfo.padX;
pad_top_ = mKernelInfo.padY;
pad_bottom_ = mKernelInfo.padY;
mKernelInfo.groups = common->group();
mKernelInfo.strideX = common->strideX();
mKernelInfo.strideY = common->strideY();
mKernelInfo.dilateX = common->dilateX();
mKernelInfo.dilateY = common->dilateY();
mKernelInfo.activationType = common->relu() ? 1 : (common->relu6() ? 2 : 0);
use_relu_ = (mKernelInfo.activationType == 1);
use_relu6_ = (mKernelInfo.activationType == 2);
cudnn_handle_ = nullptr;
input_desc_ = nullptr;
output_desc_ = nullptr;
filter_desc_ = nullptr;
conv_desc_ = nullptr;
padded_desc_ = nullptr;
cudnn_data_type_ = CUDNN_DATA_FLOAT;
cudnn_data_type_len_ = 0;
auto runtime = static_cast<CUDABackend*>(backend)->getCUDARuntime();
cudnn_handle_ = runtime->cudnn_handle();
cudnn_check(cudnnCreateTensorDescriptor(&input_desc_));
cudnn_check(cudnnCreateTensorDescriptor(&output_desc_));
cudnn_check(cudnnCreateTensorDescriptor(&padded_desc_));
cudnn_check(cudnnCreateTensorDescriptor(&bias_desc_));
cudnn_check(cudnnCreateFilterDescriptor(&filter_desc_));
cudnn_check(cudnnCreateConvolutionDescriptor(&conv_desc_));
cudnn_check(cudnnCreateActivationDescriptor(&act_desc_));
//weight host->device
const float* filterDataPtr = nullptr;
int weightSize = 0;
std::shared_ptr<ConvolutionCommon::Int8Common> quanCommon;
ConvolutionCommon::getConvParameters(&quanCommon, conv, &filterDataPtr, &weightSize);
weightTensor.reset(Tensor::createDevice<float>({weightSize}));
backend->onAcquireBuffer(weightTensor.get(), Backend::STATIC);
mKernelInfo.kernelN = common->outputCount();
mKernelInfo.kernelC = weightSize / mKernelInfo.kernelN / mKernelInfo.kernelX / mKernelInfo.kernelY;
MatMulParam param;
int e = mKernelInfo.kernelN * mKernelInfo.kernelX * mKernelInfo.kernelY;
int l = mKernelInfo.kernelC;
int h = 0;
param.elh[0] = e;
param.elh[1] = l;
param.elh[2] = h;
param.elhPack[0] = UP_DIV(e, 16);
param.elhPack[1] = UP_DIV(l, 16);
param.elhPack[2] = UP_DIV(h, 16);
param.aStride[0] = 1;
param.aStride[1] = e;
param.aStride[2] = 0;
auto gpuParam = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(sizeof(MatMulParam));
auto tempCacheBuffer = static_cast<CUDABackend*>(bn)->getStaticBufferPool()->alloc(weightSize * sizeof(float));
float* cacheWeight = (float*)((uint8_t*)tempCacheBuffer.first + tempCacheBuffer.second);
runtime->memcpy(cacheWeight, filterDataPtr, weightSize * sizeof(float), MNNMemcpyHostToDevice);
runtime->memcpy((uint8_t*)gpuParam.first + gpuParam.second, &param, sizeof(MatMulParam), MNNMemcpyHostToDevice);
// Reorder weight
weightTensor.reset(Tensor::createDevice<int16_t>({param.elhPack[0] * param.elhPack[1] * (MATMULPACK * MATMULPACK)}));
bn->onAcquireBuffer(weightTensor.get(), Backend::STATIC);
mFilter = (void *)weightTensor.get()->buffer().device;
cuda_check(cudaMemcpy(mFilter, filterDataPtr, weightSize*sizeof(float), cudaMemcpyHostToDevice));
GemmPrepareRerange(runtime, &param, (const MatMulParam*)((uint8_t*)gpuParam.first + gpuParam.second), cacheWeight, (__half*)mFilter, nullptr, nullptr, 4);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(tempCacheBuffer);
static_cast<CUDABackend*>(bn)->getStaticBufferPool()->free(gpuParam);
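// Deconvolution weights are stored as [ic, oc, kh, kw], so the GEMM "A" matrix is
// viewed as e = oc * kh * kw rows by l = ic columns with aStride = {1, e, 0}:
// consecutive e indices are contiguous and moving one input channel jumps a whole
// [oc, kh, kw] slab. GemmPrepareRerange then packs this float matrix into the
// elhPack[0] * elhPack[1] tiles of 16x16 half values that the tensor-core GEMM
// consumes.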
// Copy Bias
int biasSize = conv->bias()->size();
biasTensor.reset(Tensor::createDevice<float>({biasSize}));
bn->onAcquireBuffer(biasTensor.get(), Backend::STATIC);
mBias = (void *)biasTensor.get()->buffer().device;
cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
}
if(conv->bias()->size() != 0) {
int biasSize = conv->bias()->size();
biasTensor.reset(Tensor::createDevice<float>({biasSize}));
backend->onAcquireBuffer(biasTensor.get(), Backend::STATIC);
mBias = (void *)biasTensor.get()->buffer().device;
cuda_check(cudaMemcpy(mBias, conv->bias()->data(), conv->bias()->size()*sizeof(float), cudaMemcpyHostToDevice));
int bias_size = conv->bias()->size();
int dim_bias[] = {1, bias_size, 1, 1};
int stride_bias[] = {bias_size, 1, 1, 1};
if(cudnn_data_type_ == CUDNN_DATA_FLOAT) {
cudnn_check(cudnnSetTensorNdDescriptor(bias_desc_, CUDNN_DATA_FLOAT, 4, dim_bias, stride_bias));
}
else if(cudnn_data_type_ == CUDNN_DATA_HALF) {
cudnn_check(cudnnSetTensorNdDescriptor(bias_desc_, CUDNN_DATA_HALF, 4, dim_bias, stride_bias));
} else {
MNN_PRINT("only supports fp32/fp16 data type!!!\n");
}
use_bias_ = true;
}
DeconvSingleInputExecution::Resource::~Resource() {
// Do nothing
}
DeconvSingleInputExecution::DeconvSingleInputExecution(Backend* backend, const MNN::Op* op, std::shared_ptr<Resource> res) : Execution(backend), mOp(op) {
mResource = res;
auto runtime = static_cast<CUDABackend*>(backend)->getCUDARuntime();
auto staticPool = static_cast<CUDABackend*>(backend)->getStaticBufferPool();
mGpuMatMulParam = staticPool->alloc(sizeof(MatMulParam));
mGpuCol2ImParam = staticPool->alloc(sizeof(Col2ImParameter));
mGpuInpReorderParam = staticPool->alloc(sizeof(InputReorderParameter));
}
DeconvSingleInputExecution::~DeconvSingleInputExecution() {
cudnn_check(cudnnDestroyConvolutionDescriptor(conv_desc_));
cudnn_check(cudnnDestroyFilterDescriptor(filter_desc_));
cudnn_check(cudnnDestroyTensorDescriptor(padded_desc_));
cudnn_check(cudnnDestroyTensorDescriptor(output_desc_));
cudnn_check(cudnnDestroyTensorDescriptor(input_desc_));
cudnn_check(cudnnDestroyTensorDescriptor(bias_desc_));
cudnn_check(cudnnDestroyActivationDescriptor(act_desc_));
auto staticPool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
staticPool->free(mGpuMatMulParam);
staticPool->free(mGpuCol2ImParam);
staticPool->free(mGpuInpReorderParam);
}
bool DeconvSingleInputExecution::onClone(Backend* bn, const Op* op, Execution** dst) {
if (!mValid) {
return false;
}
if (nullptr == dst) {
return true;
}
auto dstExe = new DeconvSingleInputExecution(bn, op, mResource);
*dst = dstExe;
return true;
}
ErrorCode DeconvSingleInputExecution::onResize(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) {
// prepare
//MNN_PRINT("cuda DeconvSingleInput onResize in, pad:%d\n", mKernelInfo.padX);
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto input = inputs[0], output = outputs[0];
const int UNIT = 1;
auto convCommon = mOp->main_as_Convolution2D()->common();
mIOInfo.iw = input->width();
mIOInfo.ih = input->height();
mIOInfo.ic = input->channel();
mIOInfo.ib = input->batch();
mIOInfo.ow = output->width();
mIOInfo.oh = output->height();
mIOInfo.oc = output->channel();
mIOInfo.ob = output->batch();
// Input Rerange Param
mInpReorderParameter.hw_size = input->height() * input->width();
mInpReorderParameter.ic_stride = mInpReorderParameter.hw_size;
mInpReorderParameter.ib_stride = mInpReorderParameter.hw_size * input->channel();
mInpReorderParameter.oc_stride = mInpReorderParameter.ib_stride;
mInpReorderParameter.ob_stride = mInpReorderParameter.hw_size;
mInpReorderParameter.l_size = input->channel();
mInpReorderParameter.h_size = input->batch() * mInpReorderParameter.hw_size;
mInpReorderParameter.lpack_size = UP_DIV(mInpReorderParameter.l_size, 16);
mInpReorderParameter.hpack_size = UP_DIV(mInpReorderParameter.h_size, 16);
mKernelInfo.kernelN = output->channel();
mKernelInfo.kernelC = input->channel() / mKernelInfo.groups;
runtime->memcpy((uint8_t*)mGpuInpReorderParam.first + mGpuInpReorderParam.second, &mInpReorderParameter, sizeof(InputReorderParameter), MNNMemcpyHostToDevice);
std::vector<int> in_shape = {mIOInfo.ib, mIOInfo.ic, mIOInfo.ih, mIOInfo.iw};
std::vector<int> output_shape = {mIOInfo.ob, mIOInfo.oc, mIOInfo.oh, mIOInfo.ow};
std::vector<int> filter_shape = {mKernelInfo.kernelC, mKernelInfo.kernelN, mKernelInfo.kernelY, mKernelInfo.kernelX};//deconv (ic oc kh kw)
// printf("filter:%d %d %d %d\n", filter_shape[0], filter_shape[1], filter_shape[2], filter_shape[3]);
// printf("input:%d %d %d %d\n", in_shape[0], in_shape[1], in_shape[2], in_shape[3]);
// printf("output:%d %d %d %d\n", output_shape[0], output_shape[1], output_shape[2], output_shape[3]);
cudnn_check(cudnnSetTensor4dDescriptor(input_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, in_shape[0],
in_shape[1], in_shape[2], in_shape[3]));
cudnn_check(cudnnSetFilter4dDescriptor(filter_desc_, cudnn_data_type_, CUDNN_TENSOR_NCHW, filter_shape[0],
filter_shape[1], filter_shape[2], filter_shape[3]));
cudnn_check(cudnnSetTensor4dDescriptor(output_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, output_shape[0],
output_shape[1], output_shape[2], output_shape[3]));
// Col2Im Param
auto pad = ConvolutionCommon::convolutionTransposePad(input, output, mOp->main_as_Convolution2D()->common());
mCol2ImParamter.dilateX = convCommon->dilateX();
mCol2ImParamter.dilateY = convCommon->dilateY();
mCol2ImParamter.strideX = convCommon->strideX();
mCol2ImParamter.strideY = convCommon->strideY();
mCol2ImParamter.ic = input->channel();
mCol2ImParamter.oc = output->channel();
mCol2ImParamter.kernelX = convCommon->kernelX();
mCol2ImParamter.kernelY = convCommon->kernelY();
mCol2ImParamter.padX = pad.first;
mCol2ImParamter.padY = pad.second;
mCol2ImParamter.ih = input->height();
mCol2ImParamter.iw = input->width();
mCol2ImParamter.oh = output->height();
mCol2ImParamter.ow = output->width();
mCol2ImParamter.ob = output->batch();
runtime->memcpy((uint8_t*)mGpuCol2ImParam.first + mGpuCol2ImParam.second, &mCol2ImParamter, sizeof(Col2ImParameter), MNNMemcpyHostToDevice);
// Matmul Param
int e = output->channel() * mCol2ImParamter.kernelX * mCol2ImParamter.kernelY;
int l = input->channel();
int h = input->height() * input->width() * output->batch();
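// GEMM view of the deconvolution: weights reshaped to [oc*kh*kw, ic] multiply the
// re-ordered input [ic, batch*ih*iw], producing a column buffer [oc*kh*kw, batch*ih*iw]
// that Col2Im then scatters into the output tensor.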
mMatMulParam.elh[0] = e;
mMatMulParam.elh[1] = l;
mMatMulParam.elh[2] = h;
mMatMulParam.elhPack[0] = UP_DIV(e, 16);
mMatMulParam.elhPack[1] = UP_DIV(l, 16);
mMatMulParam.elhPack[2] = UP_DIV(h, 16);
mMatMulParam.bStride[0] = 0;
mMatMulParam.bStride[1] = input->height() * input->width();
mMatMulParam.bStride[2] = 1;
mMatMulParam.cStride[0] = h;
mMatMulParam.cStride[1] = 1;
mMatMulParam.cStride[2] = 1;
if (convCommon->relu()) {
mMatMulParam.minValue = 0.0f;
}
if (convCommon->relu6()) {
mMatMulParam.minValue = 0.0f;
mMatMulParam.maxValue = 6.0f;
}
runtime->memcpy((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second, &mMatMulParam, sizeof(MatMulParam), MNNMemcpyHostToDevice);
// Alloc temp cuda memory
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
auto buffer1 = pool->alloc(sizeof(float) * mMatMulParam.elh[0] * mMatMulParam.elh[2]);
auto buffer2 = pool->alloc(sizeof(__half) * mMatMulParam.elhPack[1] * mMatMulParam.elhPack[2] * MATMULPACK * MATMULPACK);
cudnnTensorDescriptor_t input_descriptor_real = nullptr;
mIm2ColBuffer = (float*)((uint8_t*)buffer1.first + buffer1.second);
mInputBuffer = (__half*)((uint8_t*)buffer2.first + buffer2.second);
if (mKernelInfo.padMode == PadMode_SAME) {
int kernelWidthSize = (mKernelInfo.kernelX - 1) * mKernelInfo.dilateX + 1;
int kernelHeightSize = (mKernelInfo.kernelY - 1) * mKernelInfo.dilateY + 1;
int pw = (mIOInfo.iw - 1) * mKernelInfo.strideX + kernelWidthSize - mIOInfo.ow;
int ph = (mIOInfo.ih - 1) * mKernelInfo.strideY + kernelHeightSize - mIOInfo.oh;
pad_left_ = pw/2;
pad_right_ = pw - pad_left_;
pad_top_ = ph/2;
pad_bottom_ = ph - pad_top_;
}
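// Illustrative arithmetic for the SAME-padding branch above (not from the source): with
// iw=4, strideX=2, kernelX=3, dilateX=1 and ow=8, kernelWidthSize = (3-1)*1+1 = 3, so
// pw = (4-1)*2 + 3 - 8 = 1 and the surplus column splits as pad_left_ = 0, pad_right_ = 1.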
pool->free(buffer2);
pool->free(buffer1);
use_pad_ = (pad_left_ != 0 || pad_right_ != 0 || pad_top_ != 0 || pad_bottom_ != 0);
if(use_pad_) {
int totalSize = output_shape[0]*output_shape[1]*(output_shape[2]+pad_top_+pad_bottom_)*(output_shape[3]+pad_left_+pad_right_);
padTensor.reset(Tensor::createDevice<float>({totalSize}));
backend()->onAcquireBuffer(padTensor.get(), Backend::DYNAMIC);
mPadPtr = (void *)padTensor.get()->buffer().device;
//dynamic memory release
backend()->onReleaseBuffer(padTensor.get(), Backend::DYNAMIC);
cudnn_check(cudnnSetTensor4dDescriptor(padded_desc_, CUDNN_TENSOR_NCHW, cudnn_data_type_, output_shape[0], output_shape[1],
output_shape[2] + pad_top_ + pad_bottom_, output_shape[3] + pad_left_ + pad_right_));
}
input_descriptor_real = use_pad_ ? padded_desc_ : input_desc_;
cudnn_check(cudnnSetConvolution2dDescriptor(conv_desc_, 0, 0, mKernelInfo.strideY, mKernelInfo.strideX,
mKernelInfo.dilateY, mKernelInfo.dilateX, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
if (cudnn_data_type_ == CUDNN_DATA_HALF) {
cudnn_check(cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
}
//set group num
cudnn_check(cudnnSetConvolutionGroupCount(conv_desc_, mKernelInfo.groups));
// algorithm
constexpr int requested_algo_count = 1;
int returned_algo_count;
cudnnConvolutionBwdDataAlgoPerf_t perf_results;
cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnn_handle_, filter_desc_, input_descriptor_real, conv_desc_,
output_desc_, requested_algo_count, &returned_algo_count, &perf_results));
conv_bwd_algo_ = perf_results.algo;
// workspace
cudnn_check(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle_, filter_desc_, input_descriptor_real, conv_desc_, output_desc_,
conv_bwd_algo_, &workspace_size_));
if (workspace_size_ != 0) {
int workspaceSize = workspace_size_;
workspaceTensor.reset(Tensor::createDevice<float>({workspaceSize}));
//cudnn does not support workspace memory reuse
backend()->onAcquireBuffer(workspaceTensor.get(), Backend::STATIC);
mWorkSpace = (void *)workspaceTensor.get()->buffer().device;
}
if(use_relu_) {
cudnn_check(cudnnSetActivationDescriptor(act_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0));
} else if(use_relu6_) {
cudnn_check(cudnnSetActivationDescriptor(act_desc_, CUDNN_ACTIVATION_CLIPPED_RELU, CUDNN_NOT_PROPAGATE_NAN, 6.0));
} else {
//do nothing
}
//MNN_PRINT("cuda DeconvSingleInput onResize out\n");
return NO_ERROR;
}
ErrorCode DeconvSingleInputExecution::onExecute(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) {
//MNN_PRINT("cuda DeconvSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_);
//MNN_PRINT("cuda convSingleInput onExecute in, inputsize:%d %d\n", (int)inputs.size(), workspace_size_);
MNN_ASSERT(inputs.size() == 1);
MNN_ASSERT(outputs.size() == 1);
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
const void *input_addr = (const void*)inputs[0]->deviceId();
const void *filter_addr = mFilter;
const void *bias_addr = mBias;
const void *filter_addr = mResource->mFilter;
const void *bias_addr = mResource->mBias;
void *output_addr = (void*)outputs[0]->deviceId();
void *workspace_addr = nullptr;
if (workspace_size_ != 0) {
workspace_addr = mWorkSpace;
}
const float alpha = 1;
const float beta = 0;
auto gpuInpReorder = (const InputReorderParameter*)((uint8_t*)mGpuInpReorderParam.first + mGpuInpReorderParam.second);
auto gpuCol2Im = (const Col2ImParameter*)((uint8_t*)mGpuCol2ImParam.first + mGpuCol2ImParam.second);
auto gpuMatMul = (const MatMulParam*)((uint8_t*)mGpuMatMulParam.first + mGpuMatMulParam.second);
const int rerangeCount = mInpReorderParameter.ib_stride * inputs[0]->batch();
int inp_block_num = runtime->blocks_num(rerangeCount);
int inp_thread_num = runtime->threads_num();
if(use_pad_) {
cudnn_check(cudnnConvolutionBackwardData(cudnn_handle_, &alpha, filter_desc_, filter_addr, input_desc_, input_addr, conv_desc_,
conv_bwd_algo_, workspace_addr, workspace_size_, &beta, padded_desc_, mPadPtr));
// Do input Rerange
runtime->memset(mInputBuffer, 0, mMatMulParam.elhPack[2] * mMatMulParam.elhPack[1] * MATMULPACK * MATMULPACK * sizeof(__half));
DeconvInputRerange<<<inp_block_num, inp_thread_num>>>(rerangeCount, gpuInpReorder, (const float*)input_addr, mInputBuffer);
std::vector<int> out_shape = {mIOInfo.ob, mIOInfo.oc, mIOInfo.oh, mIOInfo.ow};
// Do Gemm operation
GemmPackedMain(runtime, &mMatMulParam, gpuMatMul, (float*)mIm2ColBuffer, (const half*)filter_addr, (const half*)mInputBuffer, nullptr, bytes, false, false);
int size = out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3];
int block_num = runtime->blocks_num(size);
int threads_num = runtime->threads_num();
// Do Col2Im trans
int height_col = mCol2ImParamter.ih;
int width_col = mCol2ImParamter.iw;
int num_kernels = mCol2ImParamter.ob * mCol2ImParamter.oc * mCol2ImParamter.oh * mCol2ImParamter.ow;
cutPad<<<block_num, threads_num>>>(size, (float*)mPadPtr, out_shape[2]+pad_top_+pad_bottom_, out_shape[3]+pad_left_+pad_right_,
out_shape[2], out_shape[3], pad_top_, pad_left_, (float*)output_addr);
}
else {
cudnn_check(cudnnConvolutionBackwardData(cudnn_handle_, &alpha, filter_desc_, filter_addr, input_desc_, input_addr, conv_desc_,
conv_bwd_algo_, workspace_addr, workspace_size_, &beta, output_desc_, output_addr));
}
int col2im_block_num = runtime->blocks_num(num_kernels);
int col2im_thread_num = runtime->threads_num();
// printf("col2im:%d, %d-%d-%d-%d-%d-%d\n %d-%d-%d-%d-%d-%d\n %d-%d\n", mCol2ImParamter.ob, mCol2ImParamter.oh, mCol2ImParamter.ow, mCol2ImParamter.oc, \
// mCol2ImParamter.ih, mCol2ImParamter.iw, mCol2ImParamter.ic, \
// mCol2ImParamter.padX, mCol2ImParamter.padY, mCol2ImParamter.kernelX, mCol2ImParamter.kernelY, mCol2ImParamter.strideX, mCol2ImParamter.strideY, \
// col2im_block_num, col2im_thread_num);
Col2Im<float><<<col2im_block_num, col2im_thread_num>>>(
num_kernels, (const float*)mIm2ColBuffer, mCol2ImParamter.ob, mCol2ImParamter.oh, mCol2ImParamter.ow, mCol2ImParamter.oc,
mCol2ImParamter.kernelY, mCol2ImParamter.kernelX, mCol2ImParamter.padY, mCol2ImParamter.padX,
mCol2ImParamter.strideY, mCol2ImParamter.strideX, mCol2ImParamter.dilateY, mCol2ImParamter.dilateX,
height_col, width_col, (const float*)bias_addr, (float *)output_addr);
if(use_bias_) {
cudnn_check(cudnnAddTensor(cudnn_handle_, &alpha, bias_desc_, bias_addr, &alpha, output_desc_, output_addr));
}
if(use_relu_ || use_relu6_) {
cudnn_check(cudnnActivationForward(cudnn_handle_, act_desc_, &alpha, output_desc_, output_addr, &beta, output_desc_, output_addr));
}
return NO_ERROR;
}
@ -287,7 +326,8 @@ public:
MNN_PRINT("Deconv inputs size:3 not support\n");
return nullptr;
} else if(inputs.size() == 1) {
return new DeconvSingleInputExecution(backend, op);
std::shared_ptr<DeconvSingleInputExecution::Resource> resource(new DeconvSingleInputExecution::Resource(backend, op));
return new DeconvSingleInputExecution(backend, op, resource);
} else {
MNN_PRINT("Deconv inputs size:%d not support", (int)inputs.size());
return nullptr;
@ -295,7 +335,7 @@ public:
}
};
CUDACreatorRegister<CUDADeconvolutionCreator> __DeConvExecution(OpType_Deconvolution);
//CUDACreatorRegister<CUDADeconvolutionCreator> __DeConvExecution(OpType_Deconvolution);
}// namespace CUDA
}// namespace MNN

View File

@ -11,7 +11,7 @@
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
#include "half.hpp"
#include "TensorCoreGemm.cuh"
namespace MNN {
namespace CUDA {
@ -26,9 +26,6 @@ struct KernelInfo {
int kernelC = 0;
int kernelX = 0;
int kernelY = 0;
PadMode padMode = PadMode_CAFFE;
int padX = 0;
int padY = 0;
int strideX = 0;
int strideY = 0;
int dilateX = 0;
@ -36,59 +33,71 @@ struct KernelInfo {
int activationType = 0;
};//
struct Col2ImParameter {
int padX;
int padY;
int dilateX;
int dilateY;
int strideX;
int strideY;
int kernelX;
int kernelY;
int oc;
int ic;
int iw;
int ih;
int ow;
int oh;
int ob;
};
struct InputReorderParameter {
int ic_stride;
int ib_stride;
int oc_stride;
int ob_stride;
int hw_size;
int l_size;
int h_size;
int lpack_size;
int hpack_size;
};
extern "C"
class DeconvSingleInputExecution : public Execution {
public:
DeconvSingleInputExecution(Backend* backend, const MNN::Op* op);
struct Resource {
Resource(Backend* bn, const MNN::Op* op);
~Resource();
void* mFilter;
void* mBias;
std::shared_ptr<Tensor> weightTensor;
std::shared_ptr<Tensor> biasTensor;
KernelInfo mKernelInfo;
Backend* mBackend = nullptr;
};
DeconvSingleInputExecution(Backend* backend, const MNN::Op* op, std::shared_ptr<Resource> res);
virtual ~DeconvSingleInputExecution();
virtual ErrorCode onResize(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor*> &inputs, const std::vector<Tensor*> &outputs) override;
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override;
private:
cudnnHandle_t cudnn_handle_;
cudnnTensorDescriptor_t input_desc_;
cudnnTensorDescriptor_t output_desc_;
cudnnFilterDescriptor_t filter_desc_;
cudnnConvolutionBwdDataAlgo_t conv_bwd_algo_;
cudnnConvolutionDescriptor_t conv_desc_;
cudnnTensorDescriptor_t bias_desc_;
cudnnTensorDescriptor_t padded_desc_;
cudnnActivationDescriptor_t act_desc_;
std::shared_ptr<Resource> mResource;
cudnnDataType_t cudnn_data_type_;
int cudnn_data_type_len_;
bool use_pad_ = false;
int pad_top_ = 0;
int pad_bottom_ = 0;
int pad_left_ = 0;
int pad_right_ = 0;
const Op* mOp = nullptr;
MatMulParam mMatMulParam;
std::pair<void*, int> mGpuMatMulParam;
bool use_bias_ = false;
bool use_relu_ = false;
bool use_relu6_ = false;
Col2ImParameter mCol2ImParamter;
std::pair<void*, int> mGpuCol2ImParam;
void* mPadPtr;
void* mFilter;
void* mBias;
void* mWorkSpace;
std::shared_ptr<Tensor> weightTensor;
std::shared_ptr<Tensor> biasTensor;
std::shared_ptr<Tensor> padTensor;
std::shared_ptr<Tensor> workspaceTensor;
InputReorderParameter mInpReorderParameter;
std::pair<void*, int> mGpuInpReorderParam;
std::shared_ptr<Tensor> mPad;
std::shared_ptr<Tensor> mWorkspaceForward;
size_t input_size_;
size_t filter_size_;
size_t output_size_;
size_t padded_size_;
size_t workspace_size_;
const MNN::Op* mOp;
KernelInfo mKernelInfo;
IOInfo mIOInfo;
std::shared_ptr<Tensor> mTempInput;
float* mIm2ColBuffer;
__half* mInputBuffer;
};
} // namespace CUDA

View File

@ -0,0 +1,705 @@
#include "ImageColumn.cuh"
#include "MNNCUDADefine.hpp"
#include "MNNCUDAFunction.cuh"
#include "Raster.cuh"
#define BLOCK_INT4 2
namespace MNN {
namespace CUDA {
__global__ void Im2Col1x1(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const float* A,
half* AP,
DivModFast eAlignD,
DivModFast owD,
DivModFast ohD
) {
int eAlign = matmulParam->elhPack[0] * MATMULPACK;
int lAlign = matmulParam->elhPack[1];
int maxCount = eAlign * lAlign * BLOCK_INT4;
int kernelCount = 1;
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
int index = indexO >> 1;
int lR = indexO & 1;
int eIndex, lIndex;
eAlignD.divmod(index, lIndex, eIndex);
int eU = eIndex >> 4;
int eR = eIndex & 15;
int dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lIndex * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR * 8;
int4* dst = (int4*)(AP + dstOffset);
if (eIndex >= matmulParam->elh[0]) {
*dst = {0, 0, 0, 0};
continue;
}
// Compute for source
int ox, oy, ob;
owD.divmod(eIndex, oy, ox);
ohD.divmod(oy, ob, oy);
int sz = lIndex;
int sx = ox * param->strideX - param->padX;
int sy = oy * param->strideY - param->padY;
if (sx >= 0 && sx < param->iw) {
if (sy >=0 && sy < param->ih) {
int offset = sz * param->srcZStep + (ob * param->iw * param->ih + sy * param->iw + sx) * PACK_NUMBER + lR * 8;
float2* srcF = (float2*)(A + offset);
half2* dstH = (half2*)dst;
dstH[0] = __float22half2_rn(srcF[0]);
dstH[1] = __float22half2_rn(srcF[1]);
dstH[2] = __float22half2_rn(srcF[2]);
dstH[3] = __float22half2_rn(srcF[3]);
continue;
}
}
*dst = {0, 0, 0, 0};
}
}
__global__ void Im2Col1x1_OPT(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const int maxCount,
const float* A,
half* AP,
DivModFast eAlignD,
DivModFast owD,
DivModFast ohD
) {
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
int index = indexO >> 3;
int lR = indexO & 7;
int eIndex, lIndex;
eAlignD.divmod(index, lIndex, eIndex);
int eU = eIndex >> 4;
int eR = eIndex & 15;
int dstOffset = ((eU * matmulParam->elhPack[1] + lIndex) << 8) + (eR << 4) + (lR << 1);
int offset = lIndex * param->srcZStep + (eIndex << 4) + (lR << 1);
float2* srcF = (float2*)(A + offset);
half2* dstH = (half2*)(AP + dstOffset);
dstH[0] = __float22half2_rn(srcF[0]);
}
}
__global__ void Im2Col(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const float* A,
half* AP) {
int eAlign = matmulParam->elhPack[0] * MATMULPACK;
int lAlign = matmulParam->elhPack[1];
int maxCount = eAlign * lAlign * BLOCK_INT4;
int kernelCount = param->kernelX * param->kernelY;
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
int index = indexO / BLOCK_INT4;
int lR = indexO % BLOCK_INT4;
int eIndex = index % eAlign;
int lIndex = index / eAlign;
int eU = eIndex / MATMULPACK;
int eR = eIndex % MATMULPACK;
int dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lIndex * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR * 8;
int4* dst = (int4*)(AP + dstOffset);
if (eIndex >= matmulParam->elh[0]) {
*dst = {0, 0, 0, 0};
continue;
}
// Compute for source
int ox = eIndex % param->ow;
int oy = eIndex / param->ow;
int ob = oy / param->oh;
oy = oy % param->oh;
int sz = lIndex / kernelCount;
int kI = lIndex % kernelCount;
int ksx = kI % param->kernelX;
int ksy = kI / param->kernelX;
int sx = ox * param->strideX + ksx * param->dilateX - param->padX;
int sy = oy * param->strideY + ksy * param->dilateY - param->padY;
if (sx >= 0 && sx < param->iw) {
if (sy >=0 && sy < param->ih) {
int offset = sz * param->srcZStep + (ob * param->iw * param->ih + sy * param->iw + sx) * PACK_NUMBER + lR * 8;
float2* srcF = (float2*)(A + offset);
half2* dstH = (half2*)dst;
dstH[0] = __float22half2_rn(srcF[0]);
dstH[1] = __float22half2_rn(srcF[1]);
dstH[2] = __float22half2_rn(srcF[2]);
dstH[3] = __float22half2_rn(srcF[3]);
continue;
}
}
*dst = {0, 0, 0, 0};
}
}
__global__ void Im2Col1x1_half(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const half* A,
half* AP,
DivModFast eAlignD,
DivModFast owD,
DivModFast ohD
) {
int eAlign = matmulParam->elhPack[0] * MATMULPACK;
int lAlign = matmulParam->elhPack[1];
int maxCount = eAlign * lAlign * BLOCK_INT4;
int kernelCount = 1;
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
int index = indexO / BLOCK_INT4;
int lR = indexO % BLOCK_INT4;
int eIndex, lIndex;
eAlignD.divmod(index, lIndex, eIndex);
int eU = eIndex / MATMULPACK;
int eR = eIndex % MATMULPACK;
int dstOffset = eU * matmulParam->elhPack[1] * (MATMULPACK * MATMULPACK) + lIndex * (MATMULPACK * MATMULPACK) + eR * MATMULPACK + lR * 8;
int4* dst = (int4*)(AP + dstOffset);
if (eIndex >= matmulParam->elh[0]) {
*dst = {0, 0, 0, 0};
continue;
}
// Compute for source
int ox, oy, ob;
owD.divmod(eIndex, oy, ox);
ohD.divmod(oy, ob, oy);
int sz = lIndex;
int sx = ox * param->strideX - param->padX;
int sy = oy * param->strideY - param->padY;
if (sx >= 0 && sx < param->iw) {
if (sy >=0 && sy < param->ih) {
int offset = sz * param->srcZStep + (ob * param->iw * param->ih + sy * param->iw + sx) * PACK_NUMBER + lR * 8;
int4* src = (int4*)(A + offset);
*dst = *src;
continue;
}
}
*dst = {0, 0, 0, 0};
}
}
__global__ void Im2Col1x1_half_OPT(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const int maxCount,
const half* A,
half* AP,
DivModFast eAlignD,
DivModFast owD,
DivModFast ohD
) {
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
int index = indexO >> 3;
int lR = indexO & 7;
int eIndex, lIndex;
eAlignD.divmod(index, lIndex, eIndex);
int eU = eIndex >> 4;
int eR = eIndex & 15;
int dstOffset = ((eU * matmulParam->elhPack[1] + lIndex) << 8) + (eR << 4) + (lR << 1);
int offset = lIndex * param->srcZStep + (eIndex << 4) + (lR << 1);
int* srcF = (int*)(A + offset);
int* dstH = (int*)(AP + dstOffset);
dstH[0] = srcF[0];
}
}
__global__ void Im2Col_half(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const int maxCount,
const half* A,
half* AP,
DivModFast d_eA,
DivModFast d_ow,
DivModFast d_oh,
DivModFast d_fxy,
DivModFast d_fx
) {
int eAlign = matmulParam->elhPack[0] << 4;
int lAlign = matmulParam->elhPack[1];
int kernelCount = param->kernelX * param->kernelY;
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
size_t index = indexO >> 1;
size_t lR = indexO & 1;
int eIndex, lIndex;
d_eA.divmod(index, lIndex, eIndex);
size_t eU = eIndex >> 4;
size_t eR = eIndex & 15;
size_t dstOffset = ((((eU * matmulParam->elhPack[1] + lIndex) << 4) + eR) << 4) + (lR << 3);
int4* dst = (int4*)(AP + dstOffset);
if (eIndex >= matmulParam->elh[0]) {
*dst = {0, 0, 0, 0};
continue;
}
// Compute for source
int ox, oby, ob, oy, sz, kI, ksx, ksy;
d_ow.divmod(eIndex, oby, ox);
d_oh.divmod(oby, ob, oy);
d_fxy.divmod(lIndex, sz, kI);
d_fx.divmod(kI, ksy, ksx);
size_t sx = ox * param->strideX + ksx * param->dilateX - param->padX;
size_t sy = oy * param->strideY + ksy * param->dilateY - param->padY;
if (sx >= 0 && sx < param->iw) {
if (sy >=0 && sy < param->ih) {
size_t offset = sz * param->srcZStep + (((ob * param->ih + sy) * param->iw + sx) << 4) + lR * 8;
int4* src = (int4*)(A + offset);
*dst = *src;
continue;
}
}
*dst = {0, 0, 0, 0};
}
}
__global__ void Im2Col_half_OPT(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const size_t maxCount,
const half* A,
half* AP,
DivModFast d_eA,
DivModFast d_ow,
DivModFast d_oh,
DivModFast d_fxy,
DivModFast d_fx
) {
size_t eAlign = matmulParam->elhPack[0] << 4;
size_t lAlign = matmulParam->elhPack[1];
size_t kernelCount = param->kernelX * param->kernelY;
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
size_t index = indexO >> 2;
size_t lR = indexO & 3;
int eIndex, lIndex;
d_eA.divmod(index, lIndex, eIndex);
size_t eU = eIndex >> 4;
size_t eR = eIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex) << 4) + eR) << 4) + (lR << 2);
int2* dst = (int2*)(AP + dstOffset);
if (eIndex >= matmulParam->elh[0]) {
*dst = {0, 0};
continue;
}
// Compute for source
int ox, oby, ob, oy, sz, kI, ksx, ksy;
d_ow.divmod(eIndex, oby, ox);
d_oh.divmod(oby, ob, oy);
d_fxy.divmod(lIndex, sz, kI);
d_fx.divmod(kI, ksy, ksx);
size_t sx = ox * param->strideX + ksx * param->dilateX - param->padX;
size_t sy = oy * param->strideY + ksy * param->dilateY - param->padY;
if (sx >= 0 && sx < param->iw) {
if (sy >=0 && sy < param->ih) {
size_t offset = sz * param->srcZStep + (((ob * param->ih + sy) * param->iw + sx) << 4) + (lR << 2);
int2* src = (int2*)(A + offset);
*dst = *src;
continue;
}
}
*dst = {0, 0};
}
}
__global__ void Im2Col_half_3x3S1D1P1_OPT2(const ConvolutionCommon::Im2ColParameter* param,
const MatMulParam* matmulParam,
const size_t maxCount,
const half* A,
half* AP,
DivModFast d_eA,
DivModFast d_ow,
DivModFast d_oh
) {
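// Specialization for 3x3 kernels with stride 1, dilation 1 and pad 1, where the output
// spatial size equals the input's: each thread loads one packed input pixel and scatters it
// into the nine column-buffer slots it contributes to, zero-filling the slots that would
// fall outside the image border (the "corner case" blocks below).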
for (size_t indexO = blockIdx.x * blockDim.x + threadIdx.x; indexO < maxCount; indexO += blockDim.x * gridDim.x) {
size_t index = indexO >> 3;
size_t lR = indexO & 7;
int eIndex, lIndex;
d_eA.divmod(index, lIndex, eIndex);
int ix, oby, ob, iy;
d_ow.divmod(eIndex, oby, ix);
d_oh.divmod(oby, ob, iy);
size_t sz = lIndex;
size_t offset = sz * param->srcZStep + (((ob * param->ih + iy) * param->iw + ix) << 4) + (lR << 1);
int src = *((int*)(A + offset));
// Pixel (iy-1, ix-1)
if(iy-1 >=0 && ix-1 >=0) {
size_t oeIndex = (ob * param->ih * param->iw + (iy-1) * param->iw + (ix-1));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 8) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy-1 ==0) {
size_t index[3] = {0, 1, 2};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix-1 ==0) {
size_t index[3] = {0, 3, 6};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
// Pixel (iy-1, ix+0)
if(iy-1 >=0) {
size_t oeIndex = (ob * param->ih * param->iw + (iy-1) * param->iw + (ix+0));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 7) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy-1 ==0) {
size_t index[3] = {0, 1, 2};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix ==0) {
size_t index[3] = {0, 3, 6};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix == param->iw-1) {
size_t index[3] = {2, 5, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
// Pixel (iy-1, ix+1)
if(iy-1 >=0 && ix+1 < param->iw) {
size_t oeIndex = (ob * param->ih * param->iw + (iy-1) * param->iw + (ix+1));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 6) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy-1 ==0) {
size_t index[3] = {0, 1, 2};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix+1 == param->iw-1) {
size_t index[3] = {2, 5, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
// Pixel (iy+0, ix-1)
if(ix-1 >=0) {
size_t oeIndex = (ob * param->ih * param->iw + (iy+0) * param->iw + (ix-1));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 5) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy ==0) {
size_t index[3] = {0, 1, 2};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(iy == param->ih-1) {
size_t index[3] = {6, 7, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix-1 ==0) {
size_t index[3] = {0, 3, 6};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
// Pixel (iy, ix)
if(1) {
size_t oeIndex = (ob * param->ih * param->iw + (iy+0) * param->iw + (ix+0));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 4) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy ==0) {
size_t index[3] = {0, 1, 2};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(iy == param->ih-1) {
size_t index[3] = {6, 7, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix ==0) {
size_t index[3] = {0, 3, 6};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix == param->iw-1) {
size_t index[3] = {2, 5, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
// Pixel (iy, ix+1)
if(ix+1 < param->iw) {
size_t oeIndex = (ob * param->ih * param->iw + (iy+0) * param->iw + (ix+1));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 3) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy ==0) {
size_t index[3] = {0, 1, 2};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(iy == param->ih-1) {
size_t index[3] = {6, 7, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix+1 == param->iw-1) {
size_t index[3] = {2, 5, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
// Pixel (iy+1, ix-1)
if(iy+1 < param->ih && ix-1 >=0) {
size_t oeIndex = (ob * param->ih * param->iw + (iy+1) * param->iw + (ix-1));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 2) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy+1 == param->ih-1) {
size_t index[3] = {6, 7, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix-1 ==0) {
size_t index[3] = {0, 3, 6};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
// Pixel (iy+1, ix)
if(iy+1 < param->ih) {
size_t oeIndex = (ob * param->ih * param->iw + (iy+1) * param->iw + (ix+0));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 1) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy+1 == param->ih-1) {
size_t index[3] = {6, 7, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix ==0) {
size_t index[3] = {0, 3, 6};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix == param->iw-1) {
size_t index[3] = {2, 5, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
//Pixel (iy+1, ix+1)
if(iy+1 < param->ih && ix+1 < param->iw) {
size_t oeIndex = (ob * param->ih * param->iw + (iy+1) * param->iw + (ix+1));
size_t eU = oeIndex >> 4;
size_t eR = oeIndex & 15;
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + 0) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = src;
// Corner case
if(iy+1 == param->ih-1) {
size_t index[3] = {6, 7, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
if(ix+1 == param->iw-1) {
size_t index[3] = {2, 5, 8};
for(size_t i=0; i<3; i++) {
size_t dstOffset = ((((eU * (size_t)matmulParam->elhPack[1] + lIndex*9 + index[i]) << 4) + eR) << 4) + (lR << 1);
int* dst = (int*)(AP + dstOffset);
*dst = 0;
}
}
}
}
}
void Im2ColMain(CUDARuntime* runtime, const MatMulParam* cpuMatlMul, const MatMulParam* gpuMatMul, const ConvolutionCommon::Im2ColParameter* cpuIm2Col, const ConvolutionCommon::Im2ColParameter* gpuIm2Col,\
const void* input_addr, __half* mIm2ColBuffer, int bytes) {
size_t eAlign = cpuMatlMul->elhPack[0] * MATMULPACK;
size_t lAlign = cpuMatlMul->elhPack[1];
DivModFast eAlignD(eAlign);
DivModFast owD(cpuIm2Col->ow);
DivModFast ohD(cpuIm2Col->oh);
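// Dispatch order below: a fully aligned 1x1 fast path, the generic 1x1 path, a half-precision
// specialization for 3x3 / stride 1 / dilation 1 / pad 1, and the generic im2col fallback.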
if (cpuIm2Col->kernelX == 1 && cpuIm2Col->kernelY == 1 && \
cpuMatlMul->elh[0] % 16 == 0 && \
cpuIm2Col->strideX == 1 && cpuIm2Col->strideY == 1 && \
cpuIm2Col->dilateX == 1 && cpuIm2Col->dilateY == 1 && \
cpuIm2Col->padX == 0 && cpuIm2Col->padY == 0) {
size_t maxCount = eAlign * lAlign * 8;//Align 2
int block_num = runtime->blocks_num(maxCount);
int block_size = runtime->threads_num();
if(bytes == 4) {
Im2Col1x1_OPT<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount,
(const float*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD);
checkKernelErrors;
} else {
Im2Col1x1_half_OPT<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount,
(const half*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD);
checkKernelErrors;
}
} else if (cpuIm2Col->kernelX == 1 && cpuIm2Col->kernelY == 1) {
size_t maxCount = eAlign * lAlign * 2;//Align 8
int block_num = runtime->blocks_num(maxCount);
int block_size = runtime->threads_num();
if(bytes == 4) {
Im2Col1x1<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, (const float*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD);
checkKernelErrors;
} else {
Im2Col1x1_half<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, (const half*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD);
checkKernelErrors;
}
} else if(cpuIm2Col->kernelX == 3 && cpuIm2Col->kernelY == 3 && \
cpuIm2Col->strideX == 1 && cpuIm2Col->strideY == 1 && \
cpuIm2Col->dilateX == 1 && cpuIm2Col->dilateY == 1 && \
cpuIm2Col->padX == 1 && cpuIm2Col->padY == 1 && \
bytes == 2) {
size_t maxCount = eAlign * (lAlign / 9) * 8;
size_t block_num = runtime->blocks_num(maxCount);
size_t block_size = runtime->threads_num();
//printf("%d-%d-%d-%d-%d, %d-%d\n", cpuIm2Col->icDiv4, cpuIm2Col->ih, cpuIm2Col->iw, cpuIm2Col->oh, cpuIm2Col->ow, eAlign, lAlign);
Im2Col_half_3x3S1D1P1_OPT2<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount, (const half*)input_addr, mIm2ColBuffer,\
eAlignD, owD, ohD);
checkKernelErrors;
} else {
size_t maxCount = eAlign * lAlign * 2;
size_t block_num = runtime->blocks_num(maxCount);
size_t block_size = runtime->threads_num();
if(bytes == 4) {
Im2Col<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, (const float*)input_addr, mIm2ColBuffer);
checkKernelErrors;
} else {
DivModFast fxyD((cpuIm2Col->kernelX*cpuIm2Col->kernelY));
DivModFast fxD(cpuIm2Col->kernelX);
maxCount = eAlign * lAlign * 4;
block_num = runtime->blocks_num(maxCount);
block_size = runtime->threads_num();
//Im2Col_half<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount, (const half*)input_addr, mIm2ColBuffer, eAlignD, owD, ohD, fxyD, fxD);
Im2Col_half_OPT<<<block_num, block_size>>>(gpuIm2Col, gpuMatMul, maxCount, (const half*)input_addr, mIm2ColBuffer,
eAlignD, owD, ohD, fxyD, fxD);
checkKernelErrors;
}
}
}
} // namespace CUDA
} // namespace MNN

View File

@ -0,0 +1,24 @@
//
// ImageColumn.cuh
// MNN
//
// Created by MNN on 2021/01/10.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef IMAGE_COLUMN_CUH
#define IMAGE_COLUMN_CUH
#include "backend/cuda/core/runtime/CUDARuntime.hpp"
#include "TensorCoreGemm.cuh"
#include "backend/cuda/core/CUDABackend.hpp"
namespace MNN {
namespace CUDA {
void Im2ColMain(CUDARuntime* runtime, const MatMulParam* cpuMatlMul, const MatMulParam* gpuMatMul, const ConvolutionCommon::Im2ColParameter* cpuIm2Col, const ConvolutionCommon::Im2ColParameter* gpuIm2Col, const void* input_addr, __half* mIm2ColBuffer, int bytes);
} // namespace CUDA
} // namespace MNN
#endif

View File

@ -1,27 +1,51 @@
#include "InterpExecution.hpp"
#include "MNNCUDADefine.hpp"
#include "MNNCUDAFunction.cuh"
namespace MNN {
namespace CUDA {
#define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
template<typename T>
__global__ void INTERP(const int n, const int ih, const int iw, const int oh, const int ow,
__global__ void INTERP_NERAEST(const int n, const int ih, const int iw, const int oh, const int ow,
const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out) {
CUDA_KERNEL_LOOP(index, n) {
CUDA_KERNEL_LOOP(total, n) {
int index = total / PACK_NUMBER;
int remain = total % PACK_NUMBER;
int x = index % ow;
int tmp = index / ow;
int y = tmp % oh;
int z = tmp / oh;
int ix = min(max(0, (int)floor((float)x*scalew+offsetw)), iw-1);
int iy = min(max(0, (int)floor((float)y*scaleh+offseth)), ih-1);
out[z*oh*ow + y*ow + x] = in[z*ih*iw + iy*iw + ix];
out[(z*oh*ow + y*ow + x) * PACK_NUMBER + remain]
= in[(z*ih*iw + iy*iw + ix) * PACK_NUMBER + remain];
}
}
template<typename T>
__global__ void INTERP_NERAEST_ROUND(const int n, const int ih, const int iw, const int oh, const int ow,
const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out) {
CUDA_KERNEL_LOOP(total, n) {
int index = total / PACK_NUMBER;
int remain = total % PACK_NUMBER;
int x = index % ow;
int tmp = index / ow;
int y = tmp % oh;
int z = tmp / oh;
int ix = min(max(0, (int)floor((float)x*scalew+offsetw + 0.499f)), iw-1);
int iy = min(max(0, (int)floor((float)y*scaleh+offseth + 0.499f)), ih-1);
out[(z*oh*ow + y*ow + x) * PACK_NUMBER + remain]
= in[(z*ih*iw + iy*iw + ix) * PACK_NUMBER + remain];
}
}
template<typename T>
__global__ void INTERP_BILINEAR(const int n, const int ih, const int iw, const int oh, const int ow,
const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out) {
CUDA_KERNEL_LOOP(index, n) {
CUDA_KERNEL_LOOP(total, n) {
int index = total / PACK_NUMBER;
int remain = total % PACK_NUMBER;
int x = index % ow;
int tmp = index / ow;
int y = tmp % oh;
@ -37,11 +61,97 @@ __global__ void INTERP_BILINEAR(const int n, const int ih, const int iw, const i
int index_01 = z*ih*iw + iy_0*iw + ix_1;
int index_10 = z*ih*iw + iy_1*iw + ix_0;
int index_11 = z*ih*iw + iy_1*iw + ix_1;
index_00 = index_00 * PACK_NUMBER + remain;
index_01 = index_01 * PACK_NUMBER + remain;
index_10 = index_10 * PACK_NUMBER + remain;
index_11 = index_11 * PACK_NUMBER + remain;
float factor_x = fx-ix_0;
float factor_y = fy-iy_0;
out[z*oh*ow + y*ow + x] = (1.0-factor_x)*(1.0-factor_y)*in[index_00] + factor_x*(1.0-factor_y)*in[index_01] +
(1.0-factor_x)*factor_y*in[index_10] + factor_x*factor_y*in[index_11];
out[(z*oh*ow + y*ow + x) * PACK_NUMBER + remain] =
(1.0-factor_x)*(1.0-factor_y)*(float)in[index_00] + factor_x*(1.0-factor_y)*(float)in[index_01] +
(1.0-factor_x)*factor_y*(float)in[index_10] + factor_x*factor_y*(float)in[index_11];
}
}
template<typename T>
__global__ void INTERP_BILINEAR_OPT(const int n, const int ih, const int iw, const int oh, const int ow,
const float scaleh, const float scalew, const float offseth, const float offsetw, const T* in, T* out,
DivModFast d_ow, DivModFast d_oh) {
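// Each thread produces two horizontally adjacent output pixels (x and x+1), so the four
// bilinear neighbours loaded for the first pixel can be reused for the second whenever the
// source columns coincide.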
CUDA_KERNEL_LOOP(total, n) {
int index = total >> 4;
int remain = total & 15;
int tmp, x_idx, y, z;
d_ow.divmod(index, tmp, x_idx);
d_oh.divmod(tmp, z, y);
size_t x = x_idx << 1;
float fx = x*scalew+offsetw;
int ix_0 = min(max(0, (int)floor(fx)), iw-1);
int ix_1 = min((int)ceil(fx), iw-1);
float fx_1 = fx + scalew;
int ix_2 = min(max(0, (int)floor(fx_1)), iw-1);
int ix_3 = min((int)ceil(fx_1), iw-1);
float fy = y*scaleh+offseth;
int iy_0 = min(max(0, (int)floor(fy)), ih-1);
int iy_1 = min((int)ceil(fy), ih-1);
int index_00 = (z*ih+ iy_0)*iw + ix_0;
int index_01 = index_00 - ix_0 + ix_1;
int index_10 = (z*ih+ iy_1)*iw + ix_0;
int index_11 = index_10 - ix_0 + ix_1;
index_00 = (index_00 << 4) + remain;
index_01 = (index_01 << 4) + remain;
index_10 = (index_10 << 4) + remain;
index_11 = (index_11 << 4) + remain;
float factor_x = fx-ix_0;
float factor_y = fy-iy_0;
float in_00 = (float)in[index_00];
float in_01 = (float)in[index_01];
float in_10 = (float)in[index_10];
float in_11 = (float)in[index_11];
float factor_00 = (1.0-factor_x)*(1.0-factor_y);
float factor_01 = factor_x*(1.0-factor_y);
float factor_10 = (1.0-factor_x)*factor_y;
float factor_11 = factor_x*factor_y;
size_t dstOffset = (((z*oh+ y)*ow + x) << 4) + remain;
out[dstOffset] = \
factor_00* in_00 + factor_01*in_01 + \
factor_10* in_10 + factor_11*in_11;
if(x+1 >= ow) {
continue;
}
if(ix_2 != ix_0) {
index_00 = index_00 + ((ix_2-ix_0) << 4);
index_10 = index_10 + ((ix_2-ix_0) << 4);
in_00 = (float)in[index_00];
in_10 = (float)in[index_10];
}
if(ix_3 != ix_1) {
index_01 = index_01 + ((ix_3-ix_1) << 4);
index_11 = index_11 + ((ix_3-ix_1) << 4);
in_01 = (float)in[index_01];
in_11 = (float)in[index_11];
}
if(factor_x != fx_1-ix_2) {
factor_x = fx_1-ix_2;
factor_00 = (1.0-factor_x)*(1.0-factor_y);
factor_01 = factor_x*(1.0-factor_y);
factor_10 = (1.0-factor_x)*factor_y;
factor_11 = factor_x*factor_y;
}
out[dstOffset+ PACK_NUMBER] = \
factor_00* in_00 + factor_01*in_01 + \
factor_10* in_10 + factor_11*in_11;
}
}
@ -70,7 +180,7 @@ ErrorCode InterpExecution::onResize(const std::vector<Tensor *> &inputs, const s
mOutputHeight = output->height();
mOutputWidth = output->width();
mCount = mBatch*mChannel*mOutputHeight*mOutputWidth;
mCount = mBatch*UP_DIV(mChannel, PACK_NUMBER)*mOutputHeight*mOutputWidth * PACK_NUMBER;
//printf("mBatch:%d-mChannel:%d-mInputHeight:%d- mInputWidth:%d- mOutputHeight:%d- mOutputWidth:%d, mScaleHeight:%f- mScaleWidth:%f %f %f\n", mBatch, mChannel, mInputHeight,mInputWidth,mOutputHeight, mOutputWidth, mScaleHeight, mScaleWidth, mWidthOffset, mHeightOffset);
return NO_ERROR;
}
@ -82,13 +192,39 @@ ErrorCode InterpExecution::onExecute(const std::vector<Tensor *> &inputs, const
int threads_num = runtime->threads_num();
auto input_addr = (void*)inputs[0]->deviceId();
auto output_addr = (void*)outputs[0]->deviceId();
if (static_cast<CUDABackend*>(backend())->useFp16()) {
if(mResizeType == 1){
INTERP_NERAEST<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr);
} else if(mResizeType == 2) {
//INTERP_BILINEAR<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,\
mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr);
mCount = mBatch*UP_DIV(mChannel, PACK_NUMBER)*mOutputHeight*((mOutputWidth+1)/ 2) * PACK_NUMBER;
block_num = runtime->blocks_num(mCount);
threads_num = runtime->threads_num();
DivModFast d_ow((mOutputWidth+1)/2);
DivModFast d_oh(mOutputHeight);
INTERP_BILINEAR_OPT<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,\
mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr, d_ow, d_oh);
} else if (mResizeType == 4) {
INTERP_NERAEST_ROUND<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const half *)input_addr, (half *)output_addr);
}
return NO_ERROR;
}
if(mResizeType == 1){
INTERP<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
INTERP_NERAEST<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const float *)input_addr, (float *)output_addr);
} else if(mResizeType == 2) {
INTERP_BILINEAR<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const float *)input_addr, (float *)output_addr);
} else if (mResizeType == 4) {
INTERP_NERAEST_ROUND<<<block_num, threads_num>>>(mCount, mInputHeight, mInputWidth, mOutputHeight, mOutputWidth,
mScaleHeight, mScaleWidth, mHeightOffset, mWidthOffset, (const float *)input_addr, (float *)output_addr);
}
return NO_ERROR;
}
@ -98,7 +234,7 @@ public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
auto param = op->main_as_Interp();
if(param->resizeType() != 1 && param->resizeType() != 2) {
if(param->resizeType() == 3) {
MNN_PRINT("CUDA interp resize type:%d not support, back to CPU\n", param->resizeType());
return nullptr;
}

View File

@ -38,7 +38,7 @@ T blockReduceSum(T val)
template <typename T>
__global__
void input_layernorm(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon, int sumPerKnl)
void input_layernorm(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon, int sumPerKnl)
{
int tid = threadIdx.x;
@ -60,7 +60,7 @@ void input_layernorm(T* out, const T* input, const T* gamma, const T* beta, int
float var_tmp = 0.0f;
for(int idx=0; idx<sumPerKnl && idx*256 + tid < n; idx++) {
var_tmp += ((input[blockIdx.x * n + idx*256 + tid] - s_mean) * (input[blockIdx.x * n + idx*256 + tid] - s_mean));
var_tmp += (((float)input[blockIdx.x * n + idx*256 + tid] - s_mean) * ((float)input[blockIdx.x * n + idx*256 + tid] - s_mean));
}
variance += blockReduceSum<float>(var_tmp);
if(threadIdx.x == 0)
@ -69,14 +69,14 @@ void input_layernorm(T* out, const T* input, const T* gamma, const T* beta, int
for(int idx=0; idx<sumPerKnl && idx*256 + tid < n; idx++) {
out[blockIdx.x * n + idx*256+tid] =
(T)(((input[blockIdx.x * n + idx*256 + tid] - s_mean) * rsqrtf(s_variance)) * (float)(__ldg(&gamma[idx*256 + tid])) + (float)(__ldg(&beta[idx*256 + tid])));
(T)((((float)input[blockIdx.x * n + idx*256 + tid] - s_mean) * rsqrtf(s_variance)) * (float)(__ldg(&gamma[idx*256 + tid])) + (float)(__ldg(&beta[idx*256 + tid])));
}
}
template <typename T>
__global__
void input_layernorm_2048(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
void input_layernorm_2048(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon)
{
int tid = threadIdx.x;
@ -128,7 +128,7 @@ void input_layernorm_2048(T* out, const T* input, const T* gamma, const T* beta,
template <typename T>
__global__
void input_layernorm_1024(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
void input_layernorm_1024(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon)
{
int tid = threadIdx.x;
@ -176,7 +176,7 @@ void input_layernorm_1024(T* out, const T* input, const T* gamma, const T* beta,
template <typename T>
__global__
void input_layernorm_512(T* out, const T* input, const T* gamma, const T* beta, int m, int n, const float epsilon)
void input_layernorm_512(T* out, const T* input, const float* gamma, const float* beta, int m, int n, const float epsilon)
{
int tid = threadIdx.x;
@ -217,25 +217,25 @@ void input_layernorm_512(T* out, const T* input, const T* gamma, const T* beta,
template<typename T>
__global__ void LAYERNORM(const int count, const int outside, const int inside, const float epsilon,
const T* in, T* out, const T* gamma_data, const T* beta_data) {
const T* in, T* out, const float* gamma_data, const float* beta_data) {
CUDA_KERNEL_LOOP(i, count) {
const int o = i / inside;
const int index = i % inside;
const T* inner_input = in + o * inside;
T* inner_output = out + o * inside;
T sum = 0.f;
float sum = 0.f;
for (int j = 0; j < inside; ++j) {
sum += inner_input[j];
sum += (float)inner_input[j];
}
T mean = sum / inside;
T square_sum = 0.f;
float mean = sum / inside;
float square_sum = 0.f;
for (int j = 0; j < inside; ++j) {
square_sum += (inner_input[j] - mean) * (inner_input[j] - mean);
square_sum += ((float)inner_input[j] - mean) * ((float)inner_input[j] - mean);
}
T variable = square_sum / inside;
float variable = square_sum / inside;
variable = 1.f / sqrt(variable + epsilon);
inner_output[index] = (inner_input[index] - mean) * variable * gamma_data[index] + beta_data[index];
inner_output[index] = ((float)inner_input[index] - mean) * variable * gamma_data[index] + beta_data[index];
}
}
@ -249,7 +249,7 @@ LayerNormExecution::LayerNormExecution(const LayerNorm* layer_norm_param, Backen
mEps = layer_norm_param->epsilon();
int size = layer_norm_param->gamma()->size();
mGammaTensor.reset(Tensor::createDevice<float>({size}));
mGammaTensor.reset(Tensor::createDevice<int32_t>({size}));
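// int32_t keeps 4 bytes per element, presumably so gamma stays fp32 even when the backend
// stores float tensors as fp16; the kernels above always read gamma/beta as const float*.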
auto status = backend->onAcquireBuffer(mGammaTensor.get(), Backend::STATIC);
if (!status) {
MNN_ERROR("Out of memory when gamma is acquired in CudaLayerNorm.\n");
@ -262,7 +262,7 @@ LayerNormExecution::LayerNormExecution(const LayerNorm* layer_norm_param, Backen
if (layer_norm_param->beta()->size() != size) {
MNN_ERROR("Size of gamma and beta are not match in CudaLayerNorm.\n");
}
mBetaTensor.reset(Tensor::createDevice<float>({size}));
mBetaTensor.reset(Tensor::createDevice<int32_t>({size}));
status = backend->onAcquireBuffer(mBetaTensor.get(), Backend::STATIC);
if (!status) {
MNN_ERROR("Out of memory when beta is acquired in CudaLayerNorm.\n");
@ -274,12 +274,7 @@ LayerNormExecution::LayerNormExecution(const LayerNorm* layer_norm_param, Backen
}
LayerNormExecution::~LayerNormExecution() {
if (nullptr != mGammaTensor) {
backend()->onReleaseBuffer(mGammaTensor.get(), Backend::STATIC);
}
if (nullptr != mBetaTensor) {
backend()->onReleaseBuffer(mBetaTensor.get(), Backend::STATIC);
}
// Do nothing
}
ErrorCode LayerNormExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
@ -314,6 +309,28 @@ ErrorCode LayerNormExecution::onExecute(const std::vector<Tensor *> &inputs, con
int threads_num = runtime->threads_num();
auto input_addr = (void*)inputs[0]->deviceId();
auto output_addr = (void*)outputs[0]->deviceId();
if (static_cast<CUDABackend*>(backend())->useFp16()) {
if(mInside < 128) {
LAYERNORM<<<block_num, threads_num>>>(mOutside*mInside, mOutside, mInside, mEps, (const half *)input_addr, (half *)output_addr,
(const float *)mDeviceGamma, (const float *)mDeviceBeta);
} else {
if(mInside == 2048) {
input_layernorm_2048<<<mOutside, 256>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
(const float *)mDeviceBeta, mOutside, mInside, mEps);
} else if(mInside == 1024) {
input_layernorm_1024<<<mOutside, 256>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
(const float *)mDeviceBeta, mOutside, mInside, mEps);
} else if(mInside == 512) {
input_layernorm_512<<<mOutside, 256>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
(const float *)mDeviceBeta, mOutside, mInside, mEps);
} else {
int sumPerKnl = (mInside+255) / 256;
input_layernorm<<<mOutside, 256>>>((half *)output_addr, (const half *)input_addr, (const float *)mDeviceGamma,
(const float *)mDeviceBeta, mOutside, mInside, mEps, sumPerKnl);
}
}
return NO_ERROR;
}
if(mInside < 128) {
LAYERNORM<<<block_num, threads_num>>>(mOutside*mInside, mOutside, mInside, mEps, (const float *)input_addr, (float *)output_addr,

View File

@ -6,7 +6,6 @@
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <map>
#include "BatchMatMulExecution.hpp"
#include "MatMulExecution.hpp"
#include "backend/cuda/core/CUDABackend.hpp"
#include "Raster.cuh"
@ -34,18 +33,21 @@ public:
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
auto op = cmd->op();
if (OpType_MatMul == op->type() && mLoop->parallel() && mLoop->loopNumber() > 1) {
auto& unit = mExecutions[0];
unit.exe.reset(new BatchMatMulExecution(op->main_as_MatMul()->transposeA(), op->main_as_MatMul()->transposeB(), backend()));
if (nullptr == unit.exe) {
return OUT_OF_MEMORY;
}
unit.inputs = inputs;
unit.outputs = outputs;
auto code = unit.exe->onResize(unit.inputs, unit.outputs);
if (NO_ERROR != code) {
return code;
if (inputs.size() <= 3) {
auto& unit = mExecutions[0];
unit.exe.reset(new MatMulExecution(op->main_as_MatMul()->transposeA(), op->main_as_MatMul()->transposeB(), backend()));
if (nullptr == unit.exe) {
return OUT_OF_MEMORY;
}
unit.inputs = inputs;
unit.outputs = outputs;
auto code = unit.exe->onResize(unit.inputs, unit.outputs);
if (NO_ERROR != code) {
return code;
}
mSingleMatMul = true;
return NO_ERROR;
}
return NO_ERROR;
}
}
@ -134,21 +136,22 @@ public:
virtual ErrorCode onExecute(const std::vector<Tensor *> &originInputs, const std::vector<Tensor *> &originOutputs) override {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
if (mSingleMatMul) {
auto& unit = mExecutions[0];
unit.inputs = originInputs;
unit.outputs = originOutputs;
auto code = unit.exe->onExecute(unit.inputs, unit.outputs);
if (NO_ERROR != code) {
return code;
}
return NO_ERROR;
}
if (1 == mLoop->commands()->size()) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
auto op = cmd->op();
if (OpType_MatMul == op->type() && mLoop->parallel() && mLoop->loopNumber() > 1) {
auto& unit = mExecutions[0];
unit.inputs = originInputs;
unit.outputs = originOutputs;
auto code = unit.exe->onExecute(unit.inputs, unit.outputs);
if (NO_ERROR != code) {
return code;
}
return NO_ERROR;
}
if (OpType_UnaryOp == op->type() && nullptr == op->main()) {
Tensor::InsideDescribe::Region reg;
@ -160,7 +163,7 @@ public:
auto input = mStack[cmd->indexes()->data()[1]];
auto inputSize = input->elementSize();
auto output = mStack[cmd->indexes()->data()[0]];
auto bytes = input->getType().bytes();
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(input);
auto step0 = cmd->steps()->data()[0];
auto step1 = cmd->steps()->data()[1];
auto loopNumber = mLoop->loopNumber();
@ -189,7 +192,7 @@ public:
for (auto& iter : mIndiceCopy) {
backend()->onCopyBuffer(iter.first, iter.second);
}
auto bytes = sizeof(float);//TODO: Support Half
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(originOutputs[0]);
for (int iter=0; iter < mLoop->loopNumber(); ++iter) {
for (int index=0; index<mLoop->commands()->size(); ++index) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(index);
@ -205,7 +208,7 @@ public:
}
auto view = cmd->view()->GetAs<View>(v);
offset = offset * cmd->steps()->data()[v] + view->offset();
mStackPtr[tensorIndex] = tensor->deviceId() + offset * bytes;
mStackPtr[tensorIndex] = tensor->deviceId() + offset * static_cast<CUDABackend*>(backend())->getBytes(tensor);
}
if (OpType_UnaryOp == op->type()) {
auto src = (float*)mStackPtr[cmd->indexes()->data()[1]];
@ -233,6 +236,10 @@ public:
continue;
}
if (OpType_BinaryOp == op->type()) {
auto type = halide_type_of<float>();
if (static_cast<CUDABackend*>(backend())->useFp16()) {
type.bits = 16;
}
auto src0 = mStackPtr[cmd->indexes()->data()[1]];
auto src1 = mStackPtr[cmd->indexes()->data()[2]];
auto dst = mStackPtr[cmd->indexes()->data()[0]];
@ -242,7 +249,7 @@ public:
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
BinaryBlit((uint8_t*)dst, (const uint8_t*)src0, (const uint8_t*)src1,
cmd->size()->data(), srcStride0, srcStride1, dstStride, halide_type_of<float>(), runtime, opType);
cmd->size()->data(), srcStride0, srcStride1, dstStride, type, runtime, opType);
}
}
@ -256,6 +263,7 @@ private:
std::vector<Unit> mExecutions;
std::vector<uint64_t> mStackPtr;
std::map<Tensor*, Tensor*> mIndiceCopy;
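// Set in onResize when the whole loop collapses into one batched MatMul; onExecute then
// dispatches straight to mExecutions[0] instead of iterating the loop commands.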
bool mSingleMatMul = false;
};
class LoopCreator : public CUDABackend::Creator {

View File

@ -0,0 +1,18 @@
#ifndef MNNCUDADEFINE_HPP
#define MNNCUDADEFINE_HPP
#define PACK_NUMBER 16
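// Channel packing factor: tensors in this backend are laid out in packs of 16 channels, so
// kernels address an element as (packedPixelIndex * PACK_NUMBER + laneWithinPack).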
#define MNN_CUDA_HALF2_MAX(a, b) \
do { \
(a).x = __hgt((a).x, (b).x) ? (a).x : (b).x; \
(a).y = __hgt((a).y, (b).y) ? (a).y : (b).y; \
} while (0)
#define MNN_CUDA_HALF2_MIN(a, b) \
do { \
(a).x = __hlt((a).x, (b).x) ? (a).x : (b).x; \
(a).y = __hlt((a).y, (b).y) ? (a).y : (b).y; \
} while (0)
#endif
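A minimal usage sketch for the half2 clamp macros above (illustrative only; clampHalf2 is a hypothetical kernel, not part of MNN, and assumes <cuda_fp16.h> plus an sm_53+ device):
#include <cuda_fp16.h>
__global__ void clampHalf2(half2* data, int n, half2 lo, half2 hi) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        half2 v = data[i];
        MNN_CUDA_HALF2_MAX(v, lo); // elementwise max against the lower bound
        MNN_CUDA_HALF2_MIN(v, hi); // elementwise min against the upper bound
        data[i] = v;
    }
}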

View File

@ -0,0 +1,38 @@
#ifndef MNNCUDAFunction_cuh
#define MNNCUDAFunction_cuh
struct DivModFast {
DivModFast(int d = 1)
{
d_ = (d == 0) ? 1 : d;
for (l_ = 0;; ++l_) {
if ((1U << l_) >= d_)
break;
}
uint64_t one = 1;
uint64_t m = ((one << 32) * ((one << l_) - d_)) / d_ + 1;
m_ = static_cast<uint32_t>(m);
}
__device__ __inline__ int div(int idx) const
{
uint32_t tm = __umulhi(m_, idx); // get high 32-bit of the product
return (tm + idx) >> l_;
}
__device__ __inline__ int mod(int idx) const
{
return idx - d_ * div(idx);
}
__device__ __inline__ void divmod(int idx, int &quo, int &rem)
{
quo = div(idx);
rem = idx - quo * d_;
}
uint32_t d_; // divisor
uint32_t l_; // ceil(log2(d_))
uint32_t m_; // m' in the paper
};
#endif
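A minimal usage sketch for DivModFast (illustrative only; splitIndex is a hypothetical kernel, not part of MNN): the constructor precomputes a magic multiplier so device code can replace '/' and '%' by a fixed divisor with a multiply-high and a shift, e.g. when decomposing a flat index into coordinates:
__global__ void splitIndex(int n, DivModFast widthDiv, int* rows, int* cols) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        int row, col;
        widthDiv.divmod(i, row, col); // row = i / width, col = i % width
        rows[i] = row;
        cols[i] = col;
    }
}
// Host side: DivModFast widthDiv(width); splitIndex<<<blocks, threads>>>(n, widthDiv, rows, cols);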

View File

@ -15,12 +15,18 @@ MatMulExecution::~ MatMulExecution() {
}
ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto w0 = inputs[0]->length(1);
auto h0 = inputs[0]->length(0);
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto C = outputs[0];
auto dimensions = C->dimensions();
int batch = 1;
for (int i = 0; i < dimensions - 2; ++i) {
batch *= C->length(i);
}
auto e = C->length(dimensions-2);
auto h = C->length(dimensions-1);
auto w0 = inputs[0]->length(dimensions-1);
auto h0 = inputs[0]->length(dimensions-2);
auto e = C->length(0);
auto h = C->length(1);
auto l = w0;
if (mTransposeA) {
l = h0;
@ -29,6 +35,7 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
param.elh[0] = e;
param.elh[1] = l;
param.elh[2] = h;
param.batch = batch;
auto eU = UP_DIV(e, PACK_MATMUL);
auto lU = UP_DIV(l, PACK_MATMUL);
auto hU = UP_DIV(h, PACK_MATMUL);
@ -58,15 +65,17 @@ ErrorCode MatMulExecution::onResize(const std::vector<Tensor *> &inputs, const s
param.cStride[0] = h;
param.cStride[1] = 0;
param.cStride[2] = 1;
param.split[0] = 1;
param.split[1] = 1;
param.split[2] = 1;
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
param.aPStride[0] = 256 * lU;
param.aPStride[1] = 16;
param.aPStride[2] = 16 * lU;
param.bPStride[0] = 256 * lU;
param.bPStride[1] = 16;
param.bPStride[2] = 16 * lU;
runtime->memcpy((uint8_t*)mParameters.first + mParameters.second, &param, sizeof(MatMulParam), MNNMemcpyHostToDevice);
// Alloc for temp buffer
auto aPackSize = eU * lU * PACK_MATMUL * PACK_MATMUL;
auto bPackSize = lU * hU * PACK_MATMUL * PACK_MATMUL;
auto aPackSize = eU * lU * PACK_MATMUL * PACK_MATMUL * batch;
auto bPackSize = lU * hU * PACK_MATMUL * PACK_MATMUL * batch;
auto pool = static_cast<CUDABackend*>(backend())->getBufferPool();
mTempA = pool->alloc(aPackSize * sizeof(__half), false, 256);
@ -85,6 +94,11 @@ ErrorCode MatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const
auto APtr = (const float*)A->deviceId();
auto BPtr = (const float*)B->deviceId();
auto CDestPtr = (float*)C->deviceId();
int e = mParam.elh[0];
int l = mParam.elh[1];
int h = mParam.elh[2];
int batch = mParam.batch;
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);
auto aP = (__half*)((uint8_t*)mTempA.first + mTempA.second);
auto bP = (__half*)((uint8_t*)mTempB.first + mTempB.second);
@ -93,53 +107,8 @@ ErrorCode MatMulExecution::onExecute(const std::vector<Tensor *> &inputs, const
biasPtr = (const float*)inputs[2]->deviceId();
}
auto param = (MatMulParam*)((uint8_t*)mParameters.first + mParameters.second);
GemmPrepareRerange(runtime, &mParam, param, APtr, aP, BPtr, bP);
GemmPackedMain(runtime, &mParam, param, CDestPtr, aP, bP, biasPtr);
return NO_ERROR;
auto blasHandle = runtime->cublas_handle();
auto w0 = inputs[0]->length(1);
auto h0 = inputs[0]->length(0);
auto e = C->length(0);
auto h = C->length(1);
auto l = w0;
if (mTransposeA) {
l = h0;
}
float alpha = 1.0f;
float beta = 0.0f;
auto tranB = CUBLAS_OP_N;
auto ldB = h;
if (mTransposeB) {
ldB = l;
tranB = CUBLAS_OP_T;
}
auto tranA = CUBLAS_OP_N;
auto ldA = l;
if (mTransposeA) {
ldA = e;
tranA = CUBLAS_OP_T;
}
int block_num = runtime->blocks_num(e*h);
int threads_num = runtime->threads_num();
//[e, l] x [l, h] -> [e, h]
auto status = cublasSgemm(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, APtr, ldA, &beta, CDestPtr, h);
cublas_check(status);
//cudaThreadSynchronize();
// } else {
// auto CPtr = (float*)mTempOutput->deviceId();
// auto status = cublasSgemm(blasHandle, tranB, tranA, h, e, l, &alpha, BPtr, ldB, APtr, ldA, &beta, CPtr, h);
// cublas_check(status);
// //cudaThreadSynchronize();
// //bias: [e, h] + [h] -> [e, h]
// add_bias<<<block_num, threads_num>>>((float*)CPtr, (float*)CDestPtr, (const float*)inputs[2]->deviceId(), e, h);
// }
GemmPrepareRerange(runtime, &mParam, param, APtr, aP, BPtr, bP, bytes);
GemmPackedMain(runtime, &mParam, param, CDestPtr, aP, bP, biasPtr, bytes, false, false);
return NO_ERROR;
}
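A host-side sketch (not MNN code; shapes and names are assumptions) of how the resize logic above derives batch, e, l and h: batch is the product of all but the last two output dimensions, e and h come from those last two, and l is taken from A's last (or second-to-last, when transposed) dimension.
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> cShape = {4, 128, 256}; // output C: [batch..., e, h]
    std::vector<int> aShape = {4, 128, 64};  // input  A: [batch..., h0, w0]
    bool transposeA = false;
    int dims = (int)cShape.size();
    int batch = 1;
    for (int i = 0; i < dims - 2; ++i) {
        batch *= cShape[i];
    }
    int e  = cShape[dims - 2];
    int h  = cShape[dims - 1];
    int w0 = aShape[dims - 1];
    int h0 = aShape[dims - 2];
    int l  = transposeA ? h0 : w0;
    std::printf("batch=%d e=%d l=%d h=%d\n", batch, e, l, h); // batch=4 e=128 l=64 h=256
    return 0;
}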

View File

@ -28,6 +28,7 @@ private:
std::pair<void*, int> mTempB;
std::pair<void*, int> mParameters; // In GPU
MatMulParam mParam; // In CPU
bool mUseBlas = false;
};
} // namespace CUDA
} // namespace MNN

View File

@ -1,62 +1,71 @@
#include "PReLUExecution.hpp"
#include "MNNCUDADefine.hpp"
namespace MNN {
namespace CUDA {
#define CUDA_KERNEL_LOOP(i, n) for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
template<typename T>
__global__ void PRELU(const int n, const int channels, const int dim, const T* in, T* out,
const T* slopeData, int div_factor) {
CUDA_KERNEL_LOOP(index, n) {
const float* slopeData, int div_factor) {
CUDA_KERNEL_LOOP(t, n) {
int index = t / PACK_NUMBER;
int r = t % PACK_NUMBER;
int c = (index / dim) % channels / div_factor;
out[index] = in[index] > 0 ? in[index] : in[index]*slopeData[c];
float iv = (float)in[t];
float ov = iv > 0.0 ? iv : iv * slopeData[c * PACK_NUMBER + r];
out[t] = (T)ov;
}
}
PReLUExecution::PReLUExecution(const PRelu* prelu, Backend *backend) : Execution(backend) {
int slopCount = prelu->slope()->size();
auto alphaData = prelu->slope()->data();
preluTensor.reset(Tensor::createDevice<float>({slopCount}));
backend->onAcquireBuffer(preluTensor.get(), Backend::STATIC);
mDeviceSlope = (void *)preluTensor.get()->buffer().device;
auto staticPool = static_cast<CUDABackend*>(backend)->getStaticBufferPool();
auto slopeSize = UP_DIV(slopCount, PACK_NUMBER) * PACK_NUMBER * sizeof(float);
mPreluStorage = staticPool->alloc(slopeSize);
mDeviceSlope = (uint8_t*)mPreluStorage.first + mPreluStorage.second;
MNN_ASSERT(nullptr != mDeviceSlope);
cudaMemset(mDeviceSlope, 0, slopeSize);
cudaMemcpy(mDeviceSlope, alphaData, slopCount * sizeof(float), cudaMemcpyHostToDevice);
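// Note: the slope buffer is zero-padded up to a multiple of PACK_NUMBER so the packed
// PRELU kernel above can read it as slopeData[channelBlock * PACK_NUMBER + lane].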
mIsChannelShared = slopCount == 1;
}
PReLUExecution::~PReLUExecution() {
if (nullptr != preluTensor) {
backend()->onReleaseBuffer(preluTensor.get(), Backend::STATIC);
}
auto staticPool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
staticPool->free(mPreluStorage);
}
ErrorCode PReLUExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(inputs.size() == 1);
MNN_ASSERT(outputs.size() == 1);
auto input = inputs[0];
mBatch = input->length(0);
mChannel = input->length(1);
MNN_ASSERT(input->dimensions() >= 2);
mArea = 1;
mArea = input->length(0);
for (int i = 2; i < input->dimensions(); ++i) {
mArea *= input->length(i);
}
mCount = mBatch*mChannel*mArea;
mChannel = UP_DIV(input->length(1), PACK_NUMBER);
mCount = mChannel*mArea * PACK_NUMBER;
//printf("mBatch:%d- mChannel:%d- mArea:%d- mCount:%d\n", mBatch,mChannel,mArea, mCount);
return NO_ERROR;
}
ErrorCode PReLUExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto bytes = static_cast<CUDABackend*>(backend())->getBytes(inputs[0]);
int block_num = runtime->blocks_num(mCount);
int threads_num = runtime->threads_num();
auto input_addr = (void*)inputs[0]->deviceId();
auto output_addr = (void*)outputs[0]->deviceId();
int div_factor = mIsChannelShared ? mChannel : 1;
PRELU<<<block_num, threads_num>>>(mCount, mChannel, mArea, (const float *)input_addr, (float *)output_addr,
(const float *)mDeviceSlope, div_factor);
if (2 == bytes) {
PRELU<<<block_num, threads_num>>>(mCount, mChannel, mArea, (const half *)input_addr, (half *)output_addr,
(const float *)mDeviceSlope, div_factor);
} else {
PRELU<<<block_num, threads_num>>>(mCount, mChannel, mArea, (const float *)input_addr, (float *)output_addr,
(const float *)mDeviceSlope, div_factor);
}
return NO_ERROR;
}

View File

@ -29,11 +29,9 @@ private:
CUDARuntime *mRuntime;
void *mDeviceSlope = nullptr;
int mCount;
int mBatch;
int mChannel;
int mArea;
std::shared_ptr<Tensor> preluTensor;
std::pair<void*, int> mPreluStorage;
bool mIsChannelShared = false;
};

View File

@ -1,90 +1,209 @@
#include <cuda_fp16.h>
#include "PoolExecution.hpp"
#include <float.h>
#include "MNNCUDADefine.hpp"
namespace MNN {
namespace CUDA {
template <typename T>
__global__ void avgpool(const T* uInput, T* uOutput,
int bc,
int ih, int iw,
int oh, int ow,
int padX, int padY,
int kernelX, int kernelY,
int strideX, int strideY
) {
int total = bc * oh * ow;
#define HALF_MIN half(-65504)
#define HALF2_MIN half2(-65504, -65504)
#define MNN_CUDA_HALF2_MAX(a, b) \
do { \
(a).x = __hgt((a).x, (b).x) ? (a).x : (b).x; \
(a).y = __hgt((a).y, (b).y) ? (a).y : (b).y; \
} while (0)
__global__ void maxpool_halfC16(const half* uInput, half* uOutput,
int bc,
int ih, int iw,
int oh, int ow,
int padX, int padY,
int kernelX, int kernelY,
int strideX, int strideY
) {
int total = bc * oh * ow * 8;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
int x = i % ow;
int tmp = i / ow;
int y = tmp % oh;
int z = tmp / oh;
int zC = z / 8;
int zR = z % 8;
int ix = x * strideX - padX;
int iy = y * strideY - padY;
int sx = max(0, -ix);
int sy = max(0, -iy);
int ex = min(kernelX, iw - ix);
int ey = min(kernelY, ih - iy);
T sumValue = (T)0;
float div = (float)(ey-sy)* (float)(ex-sx);
half2 sumValue = HALF2_MIN;
for (int fy=sy; fy<ey; ++fy) {
for (int fx=sx; fx<ex; ++fx)
{
for (int fx=sx; fx<ex; ++fx) {
int currentX = ix + fx;
int currentY = iy + fy;
T inputColor = uInput[0
+ z * iw * ih
+ currentY * iw
+ currentX
];
sumValue = sumValue + inputColor;
const half2* input = (const half2*)(uInput
+ zR * 2
+ currentX * 16
+ currentY * iw * 16
+ zC * iw * ih * 16
);
half2 inputV = *input;
MNN_CUDA_HALF2_MAX(sumValue, inputV);
}
}
uOutput[0
+ z * ow * oh
+ y * ow
+ x
] = sumValue / ((T)(ey-sy)*(T)(ex-sx));
half2* dst = (half2*)(uOutput
+ zC * ow * oh * 16
+ y * ow * 16
+ x * 16
+ zR * 2
);
*dst = sumValue;
}
}
template <typename T>
__global__ void maxpool(const T* uInput, T* uOutput,
int bc,
int ih, int iw,
int oh, int ow,
int padX, int padY,
int kernelX, int kernelY,
int strideX, int strideY
) {
int total = bc * oh * ow;
__global__ void avgpool_halfC16(const half* uInput, half* uOutput,
int bc,
int ih, int iw,
int oh, int ow,
int padX, int padY,
int kernelX, int kernelY,
int strideX, int strideY
) {
int total = bc * oh * ow * 8;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
int x = i % ow;
int tmp = i / ow;
int y = tmp % oh;
int z = tmp / oh;
int zC = z / 8;
int zR = z % 8;
int ix = x * strideX - padX;
int iy = y * strideY - padY;
int sx = max(0, -ix);
int sy = max(0, -iy);
int ex = min(kernelX, iw - ix);
int ey = min(kernelY, ih - iy);
T maxValue = (T)(-1000000);
float div = (float)(ey-sy)* (float)(ex-sx);
half2 sumValue = half2(0.0f, 0.0f);
half2 mulValue = half2(1.0f / div, 1.0f/div);
for (int fy=sy; fy<ey; ++fy) {
for (int fx=sx; fx<ex; ++fx)
{
for (int fx=sx; fx<ex; ++fx) {
int currentX = ix + fx;
int currentY = iy + fy;
T inputColor = uInput[0
+ z * iw * ih
+ currentY * iw
+ currentX
];
maxValue = max(inputColor, maxValue);
const half2* input = (const half2*)(uInput
+ zR * 2
+ currentX * 16
+ currentY * iw * 16
+ zC * iw * ih * 16
);
sumValue = __hadd2(sumValue, (*input) * mulValue);
}
}
uOutput[0
+ z * ow * oh
+ y * ow
+ x
] = maxValue;
half2* dst = (half2*)(uOutput
+ zC * ow * oh * 16
+ y * ow * 16
+ x * 16
+ zR * 2
);
*dst = sumValue;
}
}
__global__ void maxpool_floatC16(const float* uInput, float* uOutput,
int bc,
int ih, int iw,
int oh, int ow,
int padX, int padY,
int kernelX, int kernelY,
int strideX, int strideY
) {
int total = bc * oh * ow * PACK_NUMBER;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
int x = i % ow;
int tmp = i / ow;
int y = tmp % oh;
int z = tmp / oh;
int zC = z / PACK_NUMBER;
int zR = z % PACK_NUMBER;
int ix = x * strideX - padX;
int iy = y * strideY - padY;
int sx = max(0, -ix);
int sy = max(0, -iy);
int ex = min(kernelX, iw - ix);
int ey = min(kernelY, ih - iy);
float maxValue = -FLT_MAX;
for (int fy=sy; fy<ey; ++fy) {
for (int fx=sx; fx<ex; ++fx) {
int currentX = ix + fx;
int currentY = iy + fy;
const float* input = (const float*)(uInput
+ zR
+ currentX * PACK_NUMBER
+ currentY * iw * PACK_NUMBER
+ zC * iw * ih * PACK_NUMBER
);
maxValue = max(maxValue, *input);
}
}
float* dst = (float*)(uOutput
+ zC * ow * oh * PACK_NUMBER
+ y * ow * PACK_NUMBER
+ x * PACK_NUMBER
+ zR
);
*dst = maxValue;
}
}
__global__ void avgpool_floatC16(const float* uInput, float* uOutput,
int bc,
int ih, int iw,
int oh, int ow,
int padX, int padY,
int kernelX, int kernelY,
int strideX, int strideY
) {
int total = bc * oh * ow * PACK_NUMBER;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
int x = i % ow;
int tmp = i / ow;
int y = tmp % oh;
int z = tmp / oh;
int zC = z / PACK_NUMBER;
int zR = z % PACK_NUMBER;
int ix = x * strideX - padX;
int iy = y * strideY - padY;
int sx = max(0, -ix);
int sy = max(0, -iy);
int ex = min(kernelX, iw - ix);
int ey = min(kernelY, ih - iy);
float div = (float)(ey-sy)* (float)(ex-sx);
float sumValue = 0.0f;
float mulValue = 1.0f/div;
for (int fy=sy; fy<ey; ++fy) {
for (int fx=sx; fx<ex; ++fx) {
int currentX = ix + fx;
int currentY = iy + fy;
const float* input = (const float*)(uInput
+ zR
+ currentX * PACK_NUMBER
+ currentY * iw * PACK_NUMBER
+ zC * iw * ih * PACK_NUMBER
);
sumValue = sumValue + (*input) * mulValue;
}
}
float* dst = (float*)(uOutput
+ zC * ow * oh * PACK_NUMBER
+ y * ow * PACK_NUMBER
+ x * PACK_NUMBER
+ zR
);
*dst = sumValue;
}
}
ErrorCode PoolExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto layer = mParameter;
int strideWidth = layer->strideX();
@ -128,34 +247,62 @@ ErrorCode PoolExecution::onResize(const std::vector<Tensor *> &inputs, const std
ErrorCode PoolExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto iw = inputs[0]->width();
auto ih = inputs[0]->height();
auto bc = inputs[0]->batch() * inputs[0]->channel();
auto bc = inputs[0]->batch() * UP_DIV(inputs[0]->channel(), PACK_NUMBER);
auto ow = outputs[0]->width();
auto oh = outputs[0]->height();
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
int block_num = runtime->blocks_num(bc * ow * oh);
int threads_num = runtime->threads_num();
auto& prop = runtime->prop();
int threads_num = prop.maxThreadsPerBlock;
int block_num = prop.multiProcessorCount;
if (static_cast<CUDABackend*>(backend())->useFp16()) {
auto inputPtr = (const half*)inputs[0]->deviceId();
auto outputPtr = (half*)outputs[0]->deviceId();
switch (mPoolType) {
case PoolType_AVEPOOL:
avgpool_halfC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
bc,
ih, iw,
oh, ow,
mPaddings[0], mPaddings[1],
mKernels[0], mKernels[1],
mStrides[0], mStrides[1]
);
return NO_ERROR;
case PoolType_MAXPOOL:
maxpool_halfC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
bc,
ih, iw,
oh, ow,
mPaddings[0], mPaddings[1],
mKernels[0], mKernels[1],
mStrides[0], mStrides[1]
);
return NO_ERROR;
}
return NO_ERROR;
}
auto inputPtr = (const float*)inputs[0]->deviceId();
auto outputPtr = (float*)outputs[0]->deviceId();
switch (mPoolType) {
case PoolType_AVEPOOL:
avgpool<<<block_num, threads_num>>>(inputPtr, outputPtr,
avgpool_floatC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
bc,
ih, iw,
oh, ow,
mPaddings[0], mPaddings[1],
mKernels[0], mKernels[1],
mStrides[0], mStrides[1]
);
);
return NO_ERROR;
case PoolType_MAXPOOL:
maxpool<<<block_num, threads_num>>>(inputPtr, outputPtr,
maxpool_floatC16<<<block_num, threads_num>>>(inputPtr, outputPtr,
bc,
ih, iw,
oh, ow,
mPaddings[0], mPaddings[1],
mKernels[0], mKernels[1],
mStrides[0], mStrides[1]
);
);
return NO_ERROR;
}
return NOT_SUPPORT;
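The C16 kernels above address a channel-packed layout. A small host-side sketch of that addressing (helper and constant names are assumptions; block and lane correspond to zC and zR in the kernels):
#include <cstdio>

constexpr int kPack = 16; // PACK_NUMBER

// Offset of one value in the packed buffer: block = combined batch/channel-block index,
// lane = channel within the block, (y, x) = spatial position in an h x w map.
inline int packedOffset(int block, int lane, int y, int x, int h, int w) {
    return block * h * w * kPack + y * w * kPack + x * kPack + lane;
}

int main() {
    std::printf("%d\n", packedOffset(1, 3, 2, 1, 4, 4)); // 256 + 128 + 16 + 3 = 403
    return 0;
}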

View File

@ -1,89 +1,22 @@
#include "Raster.cuh"
#include "TensorflowOp_generated.h"
#include <cuda_fp16.h>
#include "MNNCUDAFunction.cuh"
namespace MNN {
namespace CUDA {
template <typename T>
__global__ void pack_c4(const T *input, T *output, int inside, int axis, int outside, int axisC4) {
int total = inside * axis * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
int x = i % inside;
int tmp = i / inside;
int y = tmp % axis;
int z = tmp / axis;
int y4 = y / 4;
int yR = y % 4;
int dstOffset = 4 * (z * axisC4 * inside + y4 * inside + x) + yR;
output[dstOffset] = input[i];
}
}
void PackC4(uint8_t* output, const uint8_t* input, int inside, int axis, int outside, int bytes, CUDARuntime* runtime) {
auto packAxis = (axis + 3) / 4;
if (axis % 4 != 0) {
runtime->memset(output, 0, inside * packAxis * 4 * outside * bytes);
}
int block_num = runtime->blocks_num(inside * axis * outside);
int threads_num = runtime->threads_num();
switch (bytes) {
case 4:
pack_c4<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside, packAxis);
break;
case 2:
pack_c4<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output, inside, axis, outside, packAxis);
break;
case 1:
pack_c4<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output, inside, axis, outside, packAxis);
break;
default:
break;
}
}
template <typename T>
__global__ void unpack_c4(const T *input, T *output, int inside, int axis, int outside, int axisC4) {
int total = inside * axis * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
int x = i % inside;
int tmp = i / inside;
int y = tmp % axis;
int z = tmp / axis;
int y4 = y / 4;
int yR = y % 4;
int srcOffset = 4 * (z * axisC4 * inside + y4 * inside + x) + yR;
output[i] = input[srcOffset];
}
}
void UnpackC4(uint8_t* output, const uint8_t* input, int inside, int axis, int outside, int bytes, CUDARuntime* runtime) {
auto packAxis = (axis + 3) / 4;
int block_num = runtime->blocks_num(inside * axis * outside);
int threads_num = runtime->threads_num();
switch (bytes) {
case 4:
unpack_c4<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside, packAxis);
break;
case 2:
unpack_c4<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output, inside, axis, outside, packAxis);
break;
case 1:
unpack_c4<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output, inside, axis, outside, packAxis);
break;
default:
break;
}
}
// Blit doesn't care about offsets
template <typename T>
__global__ void blitRegion(const T *inputO, T *outputO,
int loopCount,
const int32_t* dstIndice, const int32_t* srcIndice,
int dstUseIndice, int srcUseIndice,
int dstStep, int srcStep,int srcLimit,
int sizeZ, int sizeY, int sizeX,
int strideZ, int strideY, int strideX,
int dstStrideZ, int dstStrideY, int dstStrideX
) {
int loopCount,
const int32_t* dstIndice, const int32_t* srcIndice,
int dstUseIndice, int srcUseIndice,
int dstStep, int srcStep,int srcLimit,
int sizeZ, int sizeY, int sizeX,
int strideZ, int strideY, int strideX,
int dstStrideZ, int dstStrideY, int dstStrideX
) {
int total = loopCount;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < total; i += blockDim.x * gridDim.x) {
int srcOffsetO = i * srcStep;
@ -162,29 +95,66 @@ void BlitWithIndice(uint8_t* output, const uint8_t* input, const int32_t* dstInd
#define UNARY_FUNC(Name, Func)\
template<typename T>\
__global__ void Name(const T *input, T *output,\
int sizeZ, int sizeY, int sizeX,\
int count,\
DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,\
int strideZ, int strideY, int strideX,\
int dstStrideZ, int dstStrideY, int dstStrideX\
) { \
int count = sizeZ * sizeY * sizeX;\
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
int total = sizeZ * sizeY * sizeX;\
int ix = i % sizeX;\
int tmp = i / sizeX;\
int iy = tmp % sizeY;\
int iz = tmp / sizeY;\
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {\
int ix, tmp, iy, iz;\
sizeX.divmod(i, tmp, ix);\
sizeY.divmod(tmp, iz, iy);\
int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
T x = input[srcOffset];\
output[dstOffset] = Func;\
}\
}\
template<typename T>\
__global__ void FLOAT##Name(const T *input, T *output,\
int count,\
DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,\
int strideZ, int strideY, int strideX,\
int dstStrideZ, int dstStrideY, int dstStrideX\
) { \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {\
int ix, tmp, iy, iz;\
sizeX.divmod(i, tmp, ix);\
sizeY.divmod(tmp, iz, iy);\
int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
float x = (float)input[srcOffset];\
output[dstOffset] = (float)(Func);\
}\
}\
template<typename T>
__global__ void blit_2(const T *input, T *output,
int count,
DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
int strideZ, int strideY,
int dstStrideZ, int dstStrideY
) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
int ix, tmp, iy, iz;
sizeX.divmod(i, tmp, ix);
sizeY.divmod(tmp, iz, iy);
int srcOffset = iz * strideZ + iy * strideY + (ix << 1);
int dstOffset = iz * dstStrideZ + iy * dstStrideY + (ix << 1);
int2 * dstF = (int2 *)(output+dstOffset);
dstF[0] = ((int2 *)(input+srcOffset))[0];
}
}
struct Bytes512 {
int4 x[4];
};
UNARY_FUNC(blit, x);
UNARY_FUNC(ABS, abs(x));
UNARY_FUNC(EXP, exp(x));
UNARY_FUNC(NEG, -x);
UNARY_FUNC(RECIPROCAL, (T)(1.0)/x);
UNARY_FUNC(RECIPROCAL, (1.0)/x);
UNARY_FUNC(FLOOR, floor(x));
UNARY_FUNC(CEIL, ceil(x));
UNARY_FUNC(SQUARE, x*x);
@ -212,27 +182,68 @@ UNARY_FUNC(HARDSWISH, 1.0/6.0 * x * min(max(x+3.0, 0.0), 6.0));
UNARY_FUNC(ERF, erf(x));
UNARY_FUNC(ERFC, erfc(x));
UNARY_FUNC(ERFINV, erfinv(x));
UNARY_FUNC(GELU, (1.0f + tanh(0.79788458f * (0.044715f * x * x * x + x))) * x * 0.5f);
UNARY_FUNC(GELU_STANDARD, (erf(x*0.7071067932881648f)+1.f)*x*0.5);
void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime) {
int count = size[0] * size[1] * size[2];
DivModFast sz(size[0]);
DivModFast sy(size[1]);
DivModFast sx(size[2]);
//printf("%d-%d-%d, %d-%d-%d,-%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
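// Fast path: for 4-byte elements with a large, even-sized, unit-stride innermost
// dimension, each thread copies two floats at once through a single int2 load/store.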
if(bytes == 4 && count > 16384 && size[2] % 2 == 0 && srcStride[2] == 1 && dstStride[2] == 1) {
//printf("%d-%d-%d, %d-%d-%d,-%d-%d-%d\n\n", size[0], size[1], size[2], srcStride[0], srcStride[1], srcStride[2], dstStride[0], dstStride[1], dstStride[2]);
count /= 2;
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
DivModFast sx_2((size[2]/2));
blit_2<<<block_num, threads_num>>>((const float*)input, (float*)output,
count,
sz, sy, sx_2,
srcStride[0], srcStride[1],
dstStride[0], dstStride[1]);
return;
}
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
switch (bytes) {
case 64:
blit<<<block_num, threads_num>>>((const Bytes512*)input, (Bytes512*)output,
count,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 32:
blit<<<block_num, threads_num>>>((const double4*)input, (double4*)output,
count,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 4:
blit<<<block_num, threads_num>>>((const float*)input, (float*)output,
size[0], size[1], size[2],
count,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 2:
blit<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output,
size[0], size[1], size[2],
count,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 1:
blit<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output,
size[0], size[1], size[2],
count,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
@ -241,59 +252,131 @@ void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, cons
}
}
template<typename T>
__global__ void fuseblit(const T *input, T *output,
int fuseNum, const int32_t* sliceOffset,
int sizeZ, int sizeY, int sizeX,
int strideZ, int strideY, int strideX,
int dstStrideZ, int dstStrideY, int dstStrideX
) {
int count = fuseNum*sizeZ * sizeY * sizeX;
for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < (count); c += blockDim.x * gridDim.x) {
int j = c / (sizeZ * sizeY * sizeX);
int i = c % (sizeZ * sizeY * sizeX);
int ix = i % sizeX;
int tmp = i / sizeX;
int iy = tmp % sizeY;
int iz = tmp / sizeY;
template<typename T0, typename T1>
__global__ void fuseblit(const T0 *input, T1 *output,
int fuseNum, int count, const int32_t* sliceOffset,
DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
int strideZ, int strideY, int strideX,
int dstStrideZ, int dstStrideY, int dstStrideX
) {
size_t c = blockIdx.x * blockDim.x + threadIdx.x;
for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < count; c += blockDim.x * gridDim.x) {
int ix, tmp, iy, tmp2, iz, j;
sizeX.divmod(c, tmp, ix);
sizeY.divmod(tmp, tmp2, iy);
sizeZ.divmod(tmp2, j, iz);
int src_offset = sliceOffset[j] + iz * strideZ + iy * strideY + ix * strideX;
int dst_offset = sliceOffset[fuseNum+j] + iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;
output[dst_offset] = input[src_offset];
}
}
template<typename T0, typename T1>
__global__ void fuseblit_4(const T0 *input, T1 *output,
int fuseNum, int count, const int32_t* sliceOffset,
DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
int strideZ, int strideY,
int dstStrideZ, int dstStrideY
) {
for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < count; c += blockDim.x * gridDim.x) {
int ix, tmp, iy, tmp2, iz, j;
sizeX.divmod(c, tmp, ix);
sizeY.divmod(tmp, tmp2, iy);
sizeZ.divmod(tmp2, j, iz);
int src_offset = sliceOffset[j] + iz * strideZ + iy * strideY + (ix << 2);
int dst_offset = sliceOffset[fuseNum+j] + iz * dstStrideZ + iy * dstStrideY + (ix << 2);
int4* srcF = (int4 *)(input + src_offset);
int4* dstF = (int4 *)(output + dst_offset);
dstF[0] = srcF[0];
}
}
template<typename T0, typename T1>
__global__ void fuseblit_half_4(const T0 *input, T1 *output,
int fuseNum, int count, const int32_t* sliceOffset,
DivModFast sizeZ, DivModFast sizeY, DivModFast sizeX,
int strideZ, int strideY,
int dstStrideZ, int dstStrideY
) {
for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < count; c += blockDim.x * gridDim.x) {
int ix, tmp, iy, tmp2, iz, j;
sizeX.divmod(c, tmp, ix);
sizeY.divmod(tmp, tmp2, iy);
sizeZ.divmod(tmp2, j, iz);
int src_offset = sliceOffset[j] + iz * strideZ + iy * strideY + (ix << 2);
int dst_offset = sliceOffset[fuseNum+j] + iz * dstStrideZ + iy * dstStrideY + (ix << 2);
int2* srcF = (int2 *)(input + src_offset);
int2* dstF = (int2 *)(output + dst_offset);
dstF[0] = srcF[0];
}
}
void FuseRasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int fuseNum, void* sliceOffset, int bytes, CUDARuntime* runtime) {
int count = size[0] * size[1] * size[2];
DivModFast sz(size[0]);
DivModFast sy(size[1]);
DivModFast sx(size[2]);
int count = fuseNum * size[0] * size[1] * size[2];
if(size[2] % 4 == 0 && count > 16384 && srcStride[2] == 1 && dstStride[2] == 1) {
//printf("%d-%d-%d, %d-%d-%d-%d\n", size[0], size[1], size[2], srcStride[0], srcStride[1], dstStride[0], dstStride[1]);
int count = fuseNum * size[0] * size[1] * size[2] / 4;
int numBlocks = runtime->blocks_num(count);
int threadsPerBlock = runtime->threads_num();
DivModFast sx_4((size[2]/4));
if(bytes == 4) {
fuseblit_4<<<numBlocks, threadsPerBlock>>>((const float*)input, (float*)output,
fuseNum, count, (const int32_t*)sliceOffset,
sz, sy, sx_4,
srcStride[0], srcStride[1],
dstStride[0], dstStride[1]);
return;
} else if(bytes == 2){
fuseblit_half_4<<<numBlocks, threadsPerBlock>>>((const half*)input, (half*)output,
fuseNum, count, (const int32_t*)sliceOffset,
sz, sy, sx_4,
srcStride[0], srcStride[1],
dstStride[0], dstStride[1]);
return;
}
}
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
int numBlocks = block_num;
int threadsPerBlock = threads_num;
// dim3 numBlocks(block_num, fuseNum);
// dim3 threadsPerBlock(threads_num, 1);
switch (bytes) {
case 64:
fuseblit<<<block_num, threads_num>>>((const Bytes512*)input, (Bytes512*)output,
fuseNum, count, (const int32_t*)sliceOffset,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 16:
fuseblit<<<block_num, threads_num>>>((const int4*)input, (int4*)output,
fuseNum, count, (const int32_t*)sliceOffset,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 4:
fuseblit<<<numBlocks, threadsPerBlock>>>((const float*)input, (float*)output,
fuseNum, (const int32_t*)sliceOffset,
size[0], size[1], size[2],
fuseblit<<<block_num, threads_num>>>((const float*)input, (float*)output,
fuseNum, count, (const int32_t*)sliceOffset,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 2:
fuseblit<<<numBlocks, threadsPerBlock>>>((const int16_t*)input, (int16_t*)output,
fuseNum, (const int32_t*)sliceOffset,
size[0], size[1], size[2],
fuseblit<<<block_num, threads_num>>>((const int16_t*)input, (int16_t*)output,
fuseNum, count, (const int32_t*)sliceOffset,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
case 1:
fuseblit<<<numBlocks, threadsPerBlock>>>((const int8_t*)input, (int8_t*)output,
fuseNum, (const int32_t*)sliceOffset,
size[0], size[1], size[2],
fuseblit<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output,
fuseNum, count, (const int32_t*)sliceOffset,
sz, sy, sx,
srcStride[0], srcStride[1], srcStride[2],
dstStride[0], dstStride[1], dstStride[2]);
break;
@ -303,18 +386,112 @@ void FuseRasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size,
//printf("%s, %d-%d-%d-%d\n", cudaGetErrorString(cudaGetLastError()), numBlocks.x, numBlocks.y, threadsPerBlock.x, threadsPerBlock.y);
}
template<typename T0, typename T1>
__global__ void fuseblitLimit(const T0 *input, T1 *output,
const FuseRegion* info, const int32_t* sliceOffset
) {
int sizeZ = info->size[0];
int sizeY = info->size[1];
int sizeX = info->size[2];
int strideZ = info->srcStride[0];
int strideY = info->srcStride[1];
int strideX = info->srcStride[2];
int dstStrideZ = info->dstStride[0];
int dstStrideY = info->dstStride[1];
int dstStrideX = info->dstStride[2];
int fuseNum = info->fuseNumber;
int count = fuseNum*sizeZ * sizeY * sizeX;
for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < (count); c += blockDim.x * gridDim.x) {
int j = c / (sizeZ * sizeY * sizeX);
int i = c % (sizeZ * sizeY * sizeX);
int ix = i % sizeX;
int tmp = i / sizeX;
int iy = tmp % sizeY;
int iz = tmp / sizeY;
const int* srcOffsetPtr = sliceOffset + 8 * j;
const int* dstOffsetPtr = sliceOffset + 8 * j + 4;
T0 srcValue = (T0)0;
int src_offset = srcOffsetPtr[3] + iz * strideZ + iy * strideY + ix * strideX;
if (srcOffsetPtr[0] > iz && srcOffsetPtr[1] > iy && srcOffsetPtr[2] > ix) {
srcValue = input[src_offset];
}
int dst_offset = dstOffsetPtr[3] + iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;
//printf("%d -> %d - %f\n", src_offset, dst_offset, srcValue);
if (dstOffsetPtr[0] > iz && dstOffsetPtr[1] > iy && dstOffsetPtr[2] > ix) {
output[dst_offset] = srcValue;
}
}
}
void FuseRasterBlitFloatToHalf(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime) {
auto& prop = runtime->prop();
int threads_num = prop.maxThreadsPerBlock;
int block_num = prop.multiProcessorCount;
fuseblitLimit<<<block_num, threads_num>>>((const float*)input, (half*)output,
info, (const int32_t*)sliceOffset);
}
void FuseRasterBlitHalfToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime) {
auto& prop = runtime->prop();
int threads_num = prop.maxThreadsPerBlock;
int block_num = prop.multiProcessorCount;
fuseblitLimit<<<block_num, threads_num>>>((const half*)input, (float*)output,
info, (const int32_t*)sliceOffset);
}
void FuseRasterBlitFloatToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime) {
auto& prop = runtime->prop();
int threads_num = prop.maxThreadsPerBlock;
int block_num = prop.multiProcessorCount;
fuseblitLimit<<<block_num, threads_num>>>((const float*)input, (float*)output,
info, (const int32_t*)sliceOffset);
}
void FuseRasterBlitCommon(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime, int bytes) {
auto& prop = runtime->prop();
int threads_num = prop.maxThreadsPerBlock;
int block_num = prop.multiProcessorCount;
switch (bytes) {
case 4:
fuseblitLimit<<<block_num, threads_num>>>((const float*)input, (float*)output,
info, (const int32_t*)sliceOffset);
break;
case 2:
fuseblitLimit<<<block_num, threads_num>>>((const half*)input, (half*)output,
info, (const int32_t*)sliceOffset);
break;
case 1:
fuseblitLimit<<<block_num, threads_num>>>((const int8_t*)input, (int8_t*)output,
info, (const int32_t*)sliceOffset);
break;
default:
break;
}
}
void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
int count = size[0] * size[1] * size[2];
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
DivModFast sz(size[0]);
DivModFast sy(size[1]);
DivModFast sx(size[2]);
// TODO: Support FP16
MNN_ASSERT(bytes==4);
#define COMPUTE(TYPE)\
if (opType == MNN::UnaryOpOperation_##TYPE ) {\
TYPE<<<block_num, threads_num>>>((const float*)input, (float*)output,\
size[0], size[1], size[2],\
if(bytes==2) {\
FLOAT##TYPE<<<block_num, threads_num>>>((const half*)input, (half*)output,\
count, \
sz, sy, sx,\
srcStride[0], srcStride[1], srcStride[2],\
dstStride[0], dstStride[1], dstStride[2]);\
} else {\
TYPE<<<block_num, threads_num>>>((const float*)input, (float*)output,\
count, \
sz, sy, sx,\
srcStride[0], srcStride[1], srcStride[2],\
dstStride[0], dstStride[1], dstStride[2]);\
}\
return;\
}\
@ -330,6 +507,8 @@ void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const
COMPUTE(SIN);
COMPUTE(COS);
COMPUTE(TAN);
COMPUTE(GELU);
COMPUTE(GELU_STANDARD);
COMPUTE(ASIN);
COMPUTE(ACOS);
COMPUTE(ATAN);
@ -356,26 +535,126 @@ void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const
#define BINARY_FUNC(Name, Func)\
template<typename TIn, typename TOut>\
__global__ void Binary##Name(\
const TIn *input0, const TIn* input1, TOut *output,\
int sizeZ, int sizeY, int sizeX,\
int strideZ, int strideY, int strideX,\
int strideZ1, int strideY1, int strideX1,\
int dstStrideZ, int dstStrideY, int dstStrideX\
) { \
int count = sizeZ * sizeY * sizeX;\
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
int total = sizeZ * sizeY * sizeX;\
int ix = i % sizeX;\
int tmp = i / sizeX;\
int iy = tmp % sizeY;\
int iz = tmp / sizeY;\
int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
int srcOffset1 = iz * strideZ1 + iy * strideY1 + ix * strideX1;\
int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
TIn x = input0[srcOffset];\
TIn y = input1[srcOffset1];\
output[dstOffset] = (TOut)Func;\
}\
const TIn *input0, const TIn* input1, TOut *output,\
int sizeZ, int sizeY, int sizeX,\
int strideZ, int strideY, int strideX,\
int strideZ1, int strideY1, int strideX1,\
int dstStrideZ, int dstStrideY, int dstStrideX\
) { \
int count = sizeZ * sizeY * sizeX;\
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
int total = sizeZ * sizeY * sizeX;\
int ix = i % sizeX;\
int tmp = i / sizeX;\
int iy = tmp % sizeY;\
int iz = tmp / sizeY;\
int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
int srcOffset1 = iz * strideZ1 + iy * strideY1 + ix * strideX1;\
int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
TIn x = input0[srcOffset];\
TIn y = input1[srcOffset1];\
output[dstOffset] = (TOut)Func;\
}\
}\
#define BINARY_FUNC_FLOATMID(Name, Func)\
template<typename TIn, typename TOut>\
__global__ void BinaryMid##Name(\
const TIn *input0, const TIn* input1, TOut *output,\
int sizeZ, int sizeY, int sizeX,\
int strideZ, int strideY, int strideX,\
int strideZ1, int strideY1, int strideX1,\
int dstStrideZ, int dstStrideY, int dstStrideX\
) { \
int count = sizeZ * sizeY * sizeX;\
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
int total = sizeZ * sizeY * sizeX;\
int ix = i % sizeX;\
int tmp = i / sizeX;\
int iy = tmp % sizeY;\
int iz = tmp / sizeY;\
int srcOffset = iz * strideZ + iy * strideY + ix * strideX;\
int srcOffset1 = iz * strideZ1 + iy * strideY1 + ix * strideX1;\
int dstOffset = iz * dstStrideZ + iy * dstStrideY + ix * dstStrideX;\
float x = input0[srcOffset];\
float y = input1[srcOffset1];\
output[dstOffset] = (TOut)(Func);\
}\
}\
template<typename TIn, typename TOut>\
__global__ void BinaryMidLinear##Name(\
const TIn *input0, const TIn* input1, TOut *output,\
int sizeZ,\
int strideZ,\
int strideZ1,\
int dstStrideZ\
) { \
int count = sizeZ;\
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {\
int iz = i;\
int srcOffset = iz * strideZ;\
int srcOffset1 = iz * strideZ1;\
int dstOffset = iz * dstStrideZ;\
float x = input0[srcOffset];\
float y = input1[srcOffset1];\
output[dstOffset] = (TOut)(Func);\
}\
}\
#define BINARY_FUNC_FLOATMID4(Name, Func)\
template<typename TIn, typename TOut>\
__global__ void BinaryMidLinear4_##Name(\
const TIn *input0, const TIn* input1, TOut *output,\
int count_4\
) { \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count_4); i += blockDim.x * gridDim.x) {\
int iz = i;\
int srcOffset = iz << 2;\
int srcOffset1 = iz << 2;\
int dstOffset = iz << 2;\
float4 xx = ((float4 *)(input0+srcOffset))[0];\
float4 yy = ((float4 *)(input1+srcOffset1))[0];\
float x = xx.x;\
float y = yy.x;\
output[dstOffset] = (TOut)(Func);\
x = xx.y;\
y = yy.y;\
output[dstOffset+1] = (TOut)(Func);\
x = xx.z;\
y = yy.z;\
output[dstOffset+2] = (TOut)(Func);\
x = xx.w;\
y = yy.w;\
output[dstOffset+3] = (TOut)(Func);\
}\
}\
template<typename TIn, typename TOut>\
__global__ void BinaryMidLinearHalf4_##Name(\
const TIn *input0, const TIn* input1, TOut *output,\
int count_4\
) { \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count_4); i += blockDim.x * gridDim.x) {\
int iz = i;\
int srcOffset = iz << 2;\
int srcOffset1 = iz << 2;\
int dstOffset = iz << 2;\
half2 xx = ((half2 *)(input0+srcOffset))[0];\
half2 yy = ((half2 *)(input1+srcOffset1))[0];\
float x = (float)xx.x;\
float y = (float)yy.x;\
output[dstOffset] = (TOut)(Func);\
x = (float)xx.y;\
y = (float)yy.y;\
output[dstOffset+1] = (TOut)(Func);\
xx = ((half2 *)(input0+srcOffset))[1];\
yy = ((half2 *)(input1+srcOffset1))[1];\
x = (float)xx.x;\
y = (float)yy.x;\
output[dstOffset+2] = (TOut)(Func);\
x = (float)xx.y;\
y = (float)yy.y;\
output[dstOffset+3] = (TOut)(Func);\
}\
}\
#define sign(y) ((y) > 0 ? 1 : ((y) < 0 ? -1 : 0))
@ -398,44 +677,107 @@ BINARY_FUNC(FLOORMOD, x - floor(x / y) * y);
BINARY_FUNC(SquaredDifference, (x-y)*(x-y));
BINARY_FUNC(POW, pow(x, y));
BINARY_FUNC(ATAN2, atan2(x, y));
BINARY_FUNC(MOD, x - x / y);
BINARY_FUNC(MOD, (x % y));
BINARY_FUNC(LOGICALOR, (x || y) ? 1 : 0);
void BinaryBlitTemplateFloat(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
BINARY_FUNC_FLOATMID(ADD, x+y);
BINARY_FUNC_FLOATMID(SUB, x-y);
BINARY_FUNC_FLOATMID(MUL, x*y);
BINARY_FUNC_FLOATMID(DIV, x/y);
BINARY_FUNC_FLOATMID(REALDIV, (float)sign(y) * x / max(abs(y), 0.0000001));
BINARY_FUNC_FLOATMID(MINIMUM, min(x, y));
BINARY_FUNC_FLOATMID(MAXIMUM, max(x, y));
BINARY_FUNC_FLOATMID(GREATER, x > y ? 1 : 0);
BINARY_FUNC_FLOATMID(LESS, x < y ? 1 : 0);
BINARY_FUNC_FLOATMID(LESS_EQUAL, x <= y ? 1 : 0);
BINARY_FUNC_FLOATMID(GREATER_EQUAL, x >= y ? 1 : 0);
BINARY_FUNC_FLOATMID(EQUAL, x == y ? 1 : 0);
BINARY_FUNC_FLOATMID(NOTEQUAL, x != y ? 1 : 0);
BINARY_FUNC_FLOATMID(FLOORDIV, floor(x / y));
BINARY_FUNC_FLOATMID(FLOORMOD, x - floor(x / y) * y);
BINARY_FUNC_FLOATMID(SquaredDifference, (x-y)*(x-y));
BINARY_FUNC_FLOATMID(POW, pow(x, y));
BINARY_FUNC_FLOATMID(ATAN2, atan2(x, y));
BINARY_FUNC_FLOATMID(MOD, fmod(x, y));
BINARY_FUNC_FLOATMID(LOGICALOR, (x || y) ? 1 : 0);
BINARY_FUNC_FLOATMID4(ADD, x+y);
BINARY_FUNC_FLOATMID4(SUB, x-y);
BINARY_FUNC_FLOATMID4(MUL, x*y);
BINARY_FUNC_FLOATMID4(DIV, x/y);
BINARY_FUNC_FLOATMID4(REALDIV, (float)sign(y) * x / max(abs(y), 0.0000001));
BINARY_FUNC_FLOATMID4(MINIMUM, min(x, y));
BINARY_FUNC_FLOATMID4(MAXIMUM, max(x, y));
BINARY_FUNC_FLOATMID4(GREATER, x > y ? 1 : 0);
BINARY_FUNC_FLOATMID4(LESS, x < y ? 1 : 0);
BINARY_FUNC_FLOATMID4(LESS_EQUAL, x <= y ? 1 : 0);
BINARY_FUNC_FLOATMID4(GREATER_EQUAL, x >= y ? 1 : 0);
BINARY_FUNC_FLOATMID4(EQUAL, x == y ? 1 : 0);
BINARY_FUNC_FLOATMID4(NOTEQUAL, x != y ? 1 : 0);
BINARY_FUNC_FLOATMID4(FLOORDIV, floor(x / y));
BINARY_FUNC_FLOATMID4(FLOORMOD, x - floor(x / y) * y);
BINARY_FUNC_FLOATMID4(SquaredDifference, (x-y)*(x-y));
BINARY_FUNC_FLOATMID4(POW, pow(x, y));
BINARY_FUNC_FLOATMID4(ATAN2, atan2(x, y));
BINARY_FUNC_FLOATMID4(MOD, fmod(x, y));
BINARY_FUNC_FLOATMID4(LOGICALOR, (x || y) ? 1 : 0);
template<typename T>
void BinaryBlitTemplateFloat(T* output, const T* input, const T* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
int count = size[0] * size[1] * size[2];
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
// TODO: Support FP16
MNN_ASSERT(bytes==4);
#define COMPUTE_FLOAT(TYPE, TOut)\
if (opType == MNN::BinaryOpOperation_##TYPE ) {\
Binary##TYPE<<<block_num, threads_num>>>((const float*)input, (const float*)(input1), (TOut*)output,\
size[0], size[1], size[2],\
srcStride[0], srcStride[1], srcStride[2],\
srcStride1[0], srcStride1[1], srcStride1[2],\
dstStride[0], dstStride[1], dstStride[2]);\
return;\
}\
if (opType == MNN::BinaryOpOperation_##TYPE ) {\
if (size[2] == count) {\
if(count % 4 == 0 && count > 16384 && srcStride[2] == 1 && srcStride1[2] == 1 && dstStride[2] == 1) {\
block_num = runtime->blocks_num(count/4);\
threads_num = runtime->threads_num();\
if(bytes == 4) {\
BinaryMidLinear4_##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
count/4);\
} else {\
BinaryMidLinearHalf4_##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
count/4);\
}\
} else {\
BinaryMidLinear##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
size[2],\
srcStride[2],\
srcStride1[2],\
dstStride[2]);\
}\
} else {\
BinaryMid##TYPE<<<block_num, threads_num>>>((const T*)input, (const T*)(input1), (TOut*)output,\
size[0], size[1], size[2],\
srcStride[0], srcStride[1], srcStride[2],\
srcStride1[0], srcStride1[1], srcStride1[2],\
dstStride[0], dstStride[1], dstStride[2]);\
}\
return;\
}\
COMPUTE_FLOAT(ADD, float);
COMPUTE_FLOAT(SUB, float);
COMPUTE_FLOAT(MUL, float);
COMPUTE_FLOAT(DIV, float);
COMPUTE_FLOAT(REALDIV, float);
COMPUTE_FLOAT(MINIMUM, float);
COMPUTE_FLOAT(MAXIMUM, float);
COMPUTE_FLOAT(ADD, T);
COMPUTE_FLOAT(SUB, T);
COMPUTE_FLOAT(MUL, T);
COMPUTE_FLOAT(DIV, T);
COMPUTE_FLOAT(REALDIV, T);
COMPUTE_FLOAT(MINIMUM, T);
COMPUTE_FLOAT(MAXIMUM, T);
COMPUTE_FLOAT(GREATER, int);
COMPUTE_FLOAT(LESS, int);
COMPUTE_FLOAT(LESS_EQUAL, int);
COMPUTE_FLOAT(GREATER_EQUAL, int);
COMPUTE_FLOAT(EQUAL, int);
COMPUTE_FLOAT(NOTEQUAL, int);
COMPUTE_FLOAT(FLOORDIV, float);
COMPUTE_FLOAT(FLOORMOD, float);
COMPUTE_FLOAT(POW, float);
COMPUTE_FLOAT(SquaredDifference, float);
COMPUTE_FLOAT(ATAN2, float);
COMPUTE_FLOAT(MOD, float);
COMPUTE_FLOAT(FLOORDIV, T);
COMPUTE_FLOAT(FLOORMOD, T);
COMPUTE_FLOAT(POW, T);
COMPUTE_FLOAT(SquaredDifference, T);
COMPUTE_FLOAT(ATAN2, T);
COMPUTE_FLOAT(MOD, T);
#undef COMPUTE_FLOAT
}
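// Dispatch summary: for each op, COMPUTE_FLOAT above picks the 4-wide vectorized kernel
// when the blit is purely linear (size[2] == count), contiguous, large and count % 4 == 0
// (float4 loads for 4-byte data, paired half2 loads for 2-byte data); the scalar linear
// kernel when only the innermost dimension is active; and the generic strided 3D kernel
// otherwise.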
void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType) {
@ -472,12 +814,15 @@ void BinaryBlitTemplateInt32(uint8_t* output, const uint8_t* input, const uint8_
void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType) {
if (type.code == halide_type_float) {
BinaryBlitTemplateFloat(output, input, input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
if (type.bits == 32) {
BinaryBlitTemplateFloat((float*)output, (float*)input, (float*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
} else if (type.bits == 16) {
BinaryBlitTemplateFloat((half*)output, (half*)input, (half*)input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
}
} else if (type.code == halide_type_int) {
BinaryBlitTemplateInt32(output, input, input1, size, srcStride, srcStride1, dstStride, type.bytes(), runtime, opType);
}
}
}// namespace CUDA
}// namespace MNN

View File

@ -6,11 +6,22 @@ namespace MNN {
namespace CUDA {
void RasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime);
void FuseRasterBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int fuseNum, void* sliceOffset, int bytes, CUDARuntime* runtime);
void PackC4(uint8_t* dest, const uint8_t* src, int inside, int axis, int outside, int bytes, CUDARuntime* runtime);
void UnpackC4(uint8_t* dest, const uint8_t* src, int inside, int axis, int outside, int bytes, CUDARuntime* runtime);
void BlitWithIndice(uint8_t* dest, const uint8_t* src, const int32_t* dstIndices, const int32_t* srcIndices, int dstUseIndice, int srcUseIndice, int loopCount, int dstStep, int srcStep, int srcLimit, const Tensor::InsideDescribe::Region& reg, int bytes, CUDARuntime* runtime);
void UnaryBlit(uint8_t* output, const uint8_t* input, const int32_t* size, const int32_t* srcStride, const int32_t* dstStride, int bytes, CUDARuntime* runtime, int opType);
void BinaryBlit(uint8_t* output, const uint8_t* input, const uint8_t* input1, const int32_t* size, const int32_t* srcStride, const int32_t* srcStride1, const int32_t* dstStride, halide_type_t type, CUDARuntime* runtime, int opType);
// sliceOffset layout: 8 * fuseNum int32 values in total, 8 per fused region: first 4 for src (limitX, limitY, limitZ, offset), next 4 for dst
struct FuseRegion {
int32_t size[3] = {1, 1, 1};
int32_t srcStride[3] = {0, 0, 0};
int32_t dstStride[3] = {0, 0, 0};
int fuseNumber = 0;
};
void FuseRasterBlitFloatToHalf(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime);
void FuseRasterBlitHalfToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime);
void FuseRasterBlitFloatToFloat(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime);
void FuseRasterBlitCommon(uint8_t* output, const uint8_t* input, const FuseRegion* info, void* sliceOffset, CUDARuntime* runtime, int bytes);
}
}
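Following the layout comment above, sliceOffset carries 8 int32 values per fused region: {limitX, limitY, limitZ, offset} for the source, then the same four for the destination. A host-side packing sketch (function name and use of std::array are assumptions):
#include <array>
#include <cstdint>
#include <vector>

using Limits = std::array<int32_t, 4>; // {limitX, limitY, limitZ, offset}

std::vector<int32_t> packSliceOffset(const std::vector<Limits>& src, const std::vector<Limits>& dst) {
    std::vector<int32_t> out;
    out.reserve(src.size() * 8);
    for (size_t j = 0; j < src.size(); ++j) {
        out.insert(out.end(), src[j].begin(), src[j].end()); // ints 8*j .. 8*j+3 (source)
        out.insert(out.end(), dst[j].begin(), dst[j].end()); // ints 8*j+4 .. 8*j+7 (destination)
    }
    return out; // copy to the GPU before launching the FuseRasterBlit* / fuseblitLimit path
}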

View File

@ -2,35 +2,305 @@
// RasterExecution.cpp
// MNN
//
// Created by MNN on 2020/07/30.
// Created by MNN on 2020/04/02.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "RasterExecution.hpp"
#include "Raster.cuh"
#include "core/Concurrency.h"
#include "core/OpCommonUtils.hpp"
#include "core/BufferAllocator.hpp"
#include "Raster.cuh"
#include "Transpose.cuh"
#include "MNNCUDADefine.hpp"
namespace MNN {
namespace CUDA {
ErrorCode RasterExecution::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
static void getBatchChannelArea(const Tensor* t, int& batch, int& channel, int& area) {
batch = t->batch();
if (t->dimensions() == 4) {
channel = t->channel();
area = t->width() * t->height();
} else if (t->dimensions() == 3) {
auto format = TensorUtils::getDescribe(t)->dimensionFormat;
if (format == MNN_DATA_FORMAT_NHWC) {
channel = t->length(2);
area = t->length(1);
} else {
channel = t->length(1);
area = t->length(2);
}
} else {
auto format = TensorUtils::getDescribe(t)->dimensionFormat;
if (format == MNN_DATA_FORMAT_NHWC) {
for (int i = t->dimensions() - 1; i > 0; i--) {
int len = t->length(i);
if (len > 1) {
if (channel == 1) {
channel = len;
} else {
area *= len;
}
}
}
} else {
for (int i = 1; i < t->dimensions(); i++) {
int len = t->length(i);
if (len > 1) {
if (channel == 1) {
channel = len;
} else {
area *= len;
}
}
}
}
}
}
// Detect if the region is a transpose
static bool _transpose(const Tensor::InsideDescribe::Region& region) {
int srcOne = -1, dstOne = -1;
for (int i = 0; i < 3; i++) {
if (region.src.stride[i] == 1 && region.size[i] != 1) {
if (srcOne >= 0 || region.size[i] < 4) {
return false;
}
srcOne = i;
}
if (region.dst.stride[i] == 1 && region.size[i] != 1) {
if (dstOne >= 0 || region.size[i] < 4) {
return false;
}
dstOne = i;
}
}
return srcOne >= 0 && dstOne >= 0 && srcOne != dstOne;
}
static int _singleConvert(const Tensor::InsideDescribe::Region& region, const Tensor* dest) {
auto origin = region.origin;
auto srcFormat = TensorUtils::getDescribe(origin)->dimensionFormat;
auto dstFormat = TensorUtils::getDescribe(dest)->dimensionFormat;
if (srcFormat == dstFormat) {
return 0;
}
if (0 != region.src.offset || 0 != region.dst.offset) {
return 0;
}
int dstBatch = 1, dstChannel = 1, dstArea = 1,
srcBatch = 1, srcChannel = 1, srcArea = 1;
getBatchChannelArea(origin, srcBatch, srcChannel, srcArea);
getBatchChannelArea(dest, dstBatch, dstChannel, dstArea);
if (dstBatch != srcBatch) {
return 0;
}
if (dstChannel != srcChannel) {
return 0;
}
if (dstArea != srcArea) {
return 0;
}
auto totalSize = dstBatch * dstChannel * dstArea;
int srcSize = 1;
int dstSize = 1;
int res = 1;
for (int i=0; i<3; ++i) {
if (region.size[i] == 1) {
continue;
}
if (region.src.stride[i] != region.dst.stride[i]) {
if (dstArea == 1) {
// Batch / Channel transpose
return 0;
}
res = 2;
}
srcSize += (region.size[i] - 1) * region.src.stride[i];
dstSize += (region.size[i] - 1) * region.dst.stride[i];
}
if (srcSize != totalSize || dstSize != totalSize ) {
return 0;
}
// Check if it can be described as an NHWC <-> NC4HW4 transpose
if (2 == res) {
int srcChannelStride;
int dstChannelStride;
int srcAreaStride;
int dstAreaStride;
if (MNN_DATA_FORMAT_NC4HW4 == srcFormat) {
srcChannelStride = srcArea;
srcAreaStride = 1;
dstChannelStride = 1;
dstAreaStride = srcChannel;
} else {
srcChannelStride = 1;
srcAreaStride = srcChannel;
dstAreaStride = 1;
dstChannelStride = srcArea;
}
for (int i=0; i<3; ++i) {
if (region.size[i] == 1) {
continue;
}
if (region.size[i] == dstBatch) {
if (region.src.stride[i] != region.dst.stride[i]) {
return 0;
}
continue;
}
if (region.size[i] == srcChannel) {
if (region.src.stride[i] != srcChannelStride || region.dst.stride[i] != dstChannelStride) {
return 0;
}
}
if (region.size[i] == srcArea) {
if (region.src.stride[i] != srcAreaStride || region.dst.stride[i] != dstAreaStride) {
return 0;
}
}
}
return 2;
}
return 1;
}
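// onExecute interprets the result: 0 means the region is not a plain layout convert,
// 1 means a convert whose src/dst strides already match (element order unchanged), and
// 2 means it additionally performs the NHWC <-> NC4HW4 style reorder, so PackInfo's
// axisStride / insideStride are swapped accordingly.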
ErrorCode RasterExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(inputs.size() == 1);
MNN_ASSERT(outputs.size() == 1);
auto input = inputs[0];
auto output = outputs[0];
auto des = TensorUtils::getDescribe(input);
auto input = inputs[0];
auto output = outputs[0];
auto des = TensorUtils::getDescribe(input);
auto outputDes = TensorUtils::getDescribe(output);
mNeedZero = !TensorUtils::regionIsFull(input);
mTempInputCopy.clear();
mNeedZero = !TensorUtils::regionIsFull(input);
mZeroPoint = 0;
mTempInput.clear();
mFastBlit.clear();
mFuseRaster.first = false;
if(des->regions.size() > 1) {
mFuseRaster.first = true;
mFuseRaster.second = des->regions.size();
auto& slice0 = des->regions[0];
for (int i = 1; i < des->regions.size(); ++i) {
mTempOutput = nullptr;
auto midFormat = MNN_DATA_FORMAT_NCHW;
mTempInputCopy.clear();
mOutputPtr = output;
mFast = false;
int pack = PACK_NUMBER;
// all srcFormat == dstFormat == NC4HW4 : fast path
if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) {
mFast = true;
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (slice0.origin->deviceId() != slice.origin->deviceId()) {
if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
mFast = false;
break;
}
if (!OpCommonUtils::canBlitFast(slice, output, pack, true)) {
mFast = false;
break;
}
}
if (mFast) {
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (slice.origin == nullptr) {
continue;
}
Tensor::InsideDescribe::Region newRegion;
OpCommonUtils::turnToPackRegion(slice, newRegion, output, pack, true);
mFastBlit.emplace_back(std::make_pair(slice.origin, std::move(newRegion)));
}
return NO_ERROR;
}
}
mSingleConvert = 0;
// srcNum == 1 && srcFormat != dstFormat : Single Convert
if (des->regions.size() == 1) {
mSingleConvert = _singleConvert(des->regions[0], output);
if (mSingleConvert > 0) {
return NO_ERROR;
}
}
// Acquire Buffer for temp output
// TODO: optimize it
if (MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat) {
mTempOutput.reset(new Tensor);
TensorUtils::setupTensorInfo(output, mTempOutput.get(), midFormat);
}
if (nullptr != mTempOutput) {
auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
mOutputPtr = mTempOutput.get();
}
// if an input is NC4HW4, add a layout convert
std::vector<Tensor*> forRelease;
for (int i=0; i< des->regions.size(); ++i) {
auto& slice = des->regions[i];
auto origin = slice.origin;
if (slice.mask != 0) {
mTempInputCopy.emplace_back(std::make_pair(origin, &slice));
continue;
}
// if the tensor is not NC4HW4 or has already been merged, there is no need to handle it
if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) {
mTempInputCopy.emplace_back(std::make_pair(origin, &slice));
continue;
}
// if NC4HW4's C % PACK_NUMBER == 0, change the convert into a transpose and fuse it
if (origin->batch() == 1 && origin->channel() % pack == 0) {
int channel = origin->channel();
int area = 1;
// conv3d/pool3d will have 5 dims: area = depth * width * height; otherwise area = width * height
for (int d = 2; d < origin->dimensions(); d++) {
area *= origin->length(d);
}
Tensor::InsideDescribe::Region regionTmp;
regionTmp.src.offset = 0;
regionTmp.src.stride[0] = area * pack;
regionTmp.src.stride[1] = 1;
regionTmp.src.stride[2] = pack;
regionTmp.dst.offset = 0;
regionTmp.dst.stride[0] = area * pack;
regionTmp.dst.stride[1] = area;
regionTmp.dst.stride[2] = 1;
regionTmp.size[0] = channel / pack;
regionTmp.size[1] = pack;
regionTmp.size[2] = area;
regionTmp.origin = slice.origin;
bool merge = TensorUtils::fuseRegion(regionTmp, slice);
if (merge) {
// cache the merged tensor
slice.mask = 1;
mTempInputCopy.emplace_back(std::make_pair(origin, &slice));
continue;
}
}
auto cache = static_cast<CUDABackend*>(backend())->getCache();
auto tempTensor = cache->findCacheTensor(origin, midFormat);
if (nullptr == tempTensor) {
std::shared_ptr<Tensor> newTensor(new Tensor);
TensorUtils::copyShape(origin, newTensor.get());
TensorUtils::getDescribe(newTensor.get())->dimensionFormat = midFormat;
newTensor->buffer().type = origin->getType();
TensorUtils::setLinearLayout(newTensor.get());
mTempInput.insert(std::make_pair(origin, newTensor.get()));
auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
tempTensor = newTensor.get();
TensorUtils::getDescribe(tempTensor)->useCount = TensorUtils::getDescribe(origin)->useCount;
cache->pushCacheTensor(newTensor, origin, midFormat);
}
if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) {
forRelease.emplace_back(tempTensor);
}
mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice));
}
if(mTempInputCopy.size() > 1) {
mFuseRaster.first = true;
mFuseRaster.second = mTempInputCopy.size();
auto& slice0 = *mTempInputCopy[0].second;
for (int i = 1; i < mTempInputCopy.size(); ++i) {
auto& slice = *mTempInputCopy[i].second;
if (mTempInputCopy[i].first != mTempInputCopy[0].first) {
mFuseRaster.first = false;
break;
}
@ -52,81 +322,141 @@ ErrorCode RasterExecution::onResize(const std::vector<Tensor*>& inputs, const st
}
}
}
//mFuseRaster.first = false;
if(!mFuseRaster.first) {
for (int i = 0; i < des->regions.size(); ++i) {
auto& slice = des->regions[i];
if (nullptr == slice.origin) {
continue;
}
mTempInputCopy.emplace_back(std::make_pair((void*)slice.origin->deviceId(), &slice));
}
} else {
auto& slice0 = des->regions[0];
if (nullptr != slice0.origin) {
mTempInputCopy.emplace_back(std::make_pair((void*)slice0.origin->deviceId(), &slice0));
}
int regionSize = des->regions.size();
if(mFuseRaster.first) {
auto& slice0 = *mTempInputCopy[0].second;
auto tensor = mTempInputCopy[0].first;
int regionSize = mTempInputCopy.size();
std::vector<int32_t> temp(2*regionSize, 0);
for (int i = 0; i < regionSize; ++i) {
auto& slice = des->regions[i];
auto& slice = *mTempInputCopy[i].second;
temp[i] = slice.src.offset;
temp[regionSize+i] = slice.dst.offset;
//printf("%d-", tmpSrc[i]);
//printf("%d-%d-%d\n", regionSize, temp[i], temp[regionSize+i]);
}
//save srcOffset/dstOffset to Device
offsetTensor.reset(Tensor::createDevice<int32_t>({2*regionSize}));
backend()->onAcquireBuffer(offsetTensor.get(), Backend::STATIC);
mOffset = (void *)offsetTensor.get()->buffer().device;
cuda_check(cudaMemcpy(mOffset, temp.data(), 2*regionSize*sizeof(int32_t), cudaMemcpyHostToDevice));
mTempInputCopy.clear();
mTempInputCopy.emplace_back(std::make_pair(tensor, &slice0));
}
for (auto t : forRelease) {
backend()->onReleaseBuffer(t, Backend::DYNAMIC);
}
if (nullptr != mTempOutput) {
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
}
return NO_ERROR;
}
ErrorCode RasterExecution::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
void RasterExecution::executeFaster(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) const {
auto bn = static_cast<CUDABackend*>(backend());
auto input = inputs[0];
auto output = outputs[0];
auto bytes = bn->getBytes(output);
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
auto input = inputs[0];
auto output = outputs[0];
auto bytes = input->getType().bytes();
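// Zero the whole output first when the regions do not fully cover it.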
if (mNeedZero) {
runtime->memset((void*)output->deviceId(), 0, output->size());
auto size = static_cast<CUDABackend*>(backend())->realSize(output) * bytes;
cudaMemset((uint8_t*)output->deviceId(), 0, size);
}
// Use mFastBlit
for (auto& iter : mFastBlit) {
auto srcPtr = (uint8_t*)iter.first->deviceId() + iter.second.src.offset * bytes;
auto dstPtr = (uint8_t*)output->deviceId() + iter.second.dst.offset * bytes;
RasterBlit(dstPtr, srcPtr, iter.second.size, iter.second.src.stride, iter.second.dst.stride, bytes * PACK_NUMBER, runtime);
}
}
ErrorCode RasterExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
if (mFast) {
executeFaster(inputs, outputs);
return NO_ERROR;
}
auto bn = static_cast<CUDABackend*>(backend());
auto input = inputs[0];
auto output = outputs[0];
auto bytes = bn->getBytes(output);
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
if (mSingleConvert > 0) {
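// Fast path: a single region that only changes layout is handled with one pack/unpack kernel between NC4HW4 and a linear format.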
auto realInput = TensorUtils::getDescribe(input)->regions[0].origin;
int srcBatch = 1, srcChannel = 1, srcArea = 1;
getBatchChannelArea(realInput, srcBatch, srcChannel, srcArea);
auto sourceFormat = TensorUtils::getDescribe(realInput)->dimensionFormat;
auto destFormat = TensorUtils::getDescribe(output)->dimensionFormat;
int batchStride = srcChannel * srcArea * bytes;
int inputBatchStride = batchStride;
int outputBatchStride = batchStride;
PackInfo pack;
pack.inside = srcArea;
pack.axis = srcChannel;
pack.unit = PACK_NUMBER;
pack.outside = srcBatch;
if (mSingleConvert == 1) {
pack.axisStride = srcArea;
pack.insideStride = 1;
} else if (mSingleConvert == 2) {
pack.axisStride = 1;
pack.insideStride = srcChannel;
}
auto srcPtr = (void*)realInput->deviceId();
auto dstPtr = (void*)output->deviceId();
if (MNN_DATA_FORMAT_NC4HW4 == sourceFormat) {
if (realInput->dimensions() <= 1) {
cudaMemcpy(dstPtr, srcPtr, bn->realSize(realInput) * bytes, cudaMemcpyDeviceToDevice);
return NO_ERROR;
}
UnpackBuffer(dstPtr, srcPtr, &pack, bytes, runtime);
} else {
if (output->dimensions() <= 1) {
cudaMemcpy(dstPtr, srcPtr, bn->realSize(realInput) * bytes, cudaMemcpyDeviceToDevice);
return NO_ERROR;
}
PackBuffer(dstPtr, srcPtr, &pack, bytes, runtime);
}
return NO_ERROR;
}
if (mNeedZero) {
auto size = static_cast<CUDABackend*>(backend())->realSize(mOutputPtr) * bytes;
cudaMemset((uint8_t*)mOutputPtr->deviceId(), 0, size);
}
for (auto& iter : mTempInput) {
backend()->onCopyBuffer(iter.first, iter.second);
}
if(mFuseRaster.first) {
MNN_ASSERT(mTempInputCopy.size() == 1);
auto& iter = mTempInputCopy[0];
auto& slice = *(iter.second);
auto srcPtr = (uint8_t*)iter.first;
auto dstPtr = (uint8_t*)output->deviceId();
auto srcPtr = (uint8_t*)iter.first->deviceId();
auto dstPtr = (uint8_t*)mOutputPtr->deviceId();
//printf("fuseRaster:%p-%p\n", mSrcOffset, mDstOffset);
FuseRasterBlit(dstPtr, srcPtr, slice.size, slice.src.stride, slice.dst.stride, mFuseRaster.second, mOffset, bytes, runtime);
return NO_ERROR;
} else {
for (auto& iter : mTempInputCopy) {
auto srcPtr = (uint8_t*)iter.first->deviceId() + iter.second->src.offset * bytes;
auto dstPtr = (uint8_t*)mOutputPtr->deviceId() + iter.second->dst.offset * bytes;
RasterBlit(dstPtr, srcPtr, iter.second->size, iter.second->src.stride, iter.second->dst.stride, bytes, runtime);
}
}
for (int u = 0; u < mTempInputCopy.size(); ++u) {
auto& iter = mTempInputCopy[u];
auto& slice = *(iter.second);
auto srcPtr = (uint8_t*)iter.first + slice.src.offset * bytes;
auto dstPtr = (uint8_t*)output->deviceId() + slice.dst.offset * bytes;
RasterBlit(dstPtr, srcPtr, slice.size, slice.src.stride, slice.dst.stride, bytes, runtime);
if (nullptr != mTempOutput) {
backend()->onCopyBuffer(mTempOutput.get(), output);
}
return NO_ERROR;
}
RasterExecution::RasterExecution(Backend* backend) : Execution(backend) {
// Do nothing
}
RasterExecution::~RasterExecution() {
// Do nothing
}
class RasterCreator : public CUDABackend::Creator {
class RasterExecutionFactory : public CUDABackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
const MNN::Op* op, Backend* backend) const {
return new RasterExecution(backend);
}
};
static CUDACreatorRegister<RasterCreator> __init(OpType_Raster);
} // namespace CUDA
} // namespace MNN
static CUDACreatorRegister<RasterExecutionFactory> __init(OpType_Raster);
}
}

View File

@@ -2,37 +2,43 @@
// RasterExecution.hpp
// MNN
//
// Created by MNN on 2020/07/30.
// Created by MNN on 2020/04/02.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef RasterExecution_hpp
#define RasterExecution_hpp
#include <map>
#include <memory>
#include <vector>
#include "backend/cuda/core/CUDABackend.hpp"
#include "core/Execution.hpp"
#include <map>
#include <set>
#include "core/TensorUtils.hpp"
namespace MNN {
namespace CUDA {
class RasterExecution : public Execution {
public:
RasterExecution(Backend *backend);
virtual ~RasterExecution();
RasterExecution(Backend* bn) : Execution(bn) {
// Do nothing
}
virtual ~ RasterExecution() {
// Do nothing
}
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
void executeFaster(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) const;
private:
std::vector<std::pair<void *, Tensor::InsideDescribe::Region *>> mTempInputCopy;
std::map<Tensor*, Tensor*> mTempInput;
std::vector<std::pair<const Tensor*, Tensor::InsideDescribe::Region*>> mTempInputCopy;
std::vector<std::pair<const Tensor*, Tensor::InsideDescribe::Region>> mFastBlit;
std::shared_ptr<Tensor> mTempOutput;
Tensor* mOutputPtr;
bool mNeedZero = false;
bool mFast = false;
int mSingleConvert = 0;
int32_t mZeroPoint = 0;
std::pair<bool, int> mFuseRaster;
void *mOffset;
std::shared_ptr<Tensor> offsetTensor;
};
} // namespace CUDA
} // namespace MNN
}
}
#endif

View File

@@ -1,99 +1,19 @@
#include "ReductionExecution.hpp"
namespace MNN {
namespace CUDA {
ReductionExecution::ReductionExecution(ReductionType opType, int axis, Backend *backend) : Execution(backend) {
mType = opType;
mAxis = axis;
auto staticPool = static_cast<CUDABackend*>(backend)->getStaticBufferPool();
mParam = staticPool->alloc(sizeof(ReduceParam));
}
ReductionExecution::~ ReductionExecution() {
// Do nothing
auto staticPool = static_cast<CUDABackend*>(backend())->getStaticBufferPool();
staticPool->free(mParam);
}
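// The reduce kernels view the input as [outside, axis, inside]; each thread handles one (outside, inside) pair and accumulates along axis, with a grid-stride loop over inside * outside.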
template <typename T>
__global__ void SUM(const T *input, T *output, int inside, int axis, int outside) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / inside;
int x = i % inside;
T sumValue = (T)0;
const T* basicInput = input + y * axis * inside + x;
for (int v=0; v<axis; ++v) {
sumValue += basicInput[v * inside];
}
output[y * inside + x] = sumValue;
}
return;
}
template <typename T>
__global__ void MEAN(const T *input, T *output, int inside, int axis, int outside) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / inside;
int x = i % inside;
T sumValue = (T)0;
const T* basicInput = input + y * axis * inside + x;
for (int v=0; v<axis; ++v) {
sumValue += basicInput[v * inside];
}
output[y * inside + x] = sumValue / (T)axis;
}
return;
}
template <typename T>
__global__ void MINIMUM(const T *input, T *output, int inside, int axis, int outside) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / inside;
int x = i % inside;
const T* basicInput = input + y * axis * inside + x;
T res = basicInput[0];
for (int v=1; v<axis; ++v) {
res = min(basicInput[v * inside], res);
}
output[y * inside + x] = res;
}
return;
}
template <typename T>
__global__ void MAXIMUM(const T *input, T *output, int inside, int axis, int outside) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / inside;
int x = i % inside;
const T* basicInput = input + y * axis * inside + x;
T res = basicInput[0];
for (int v=1; v<axis; ++v) {
res = max(basicInput[v * inside], res);
}
output[y * inside + x] = res;
}
return;
}
template <typename T>
__global__ void PROD(const T *input, T *output, int inside, int axis, int outside) {
int count = inside * outside;
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (count); i += blockDim.x * gridDim.x) {
int y = i / inside;
int x = i % inside;
const T* basicInput = input + y * axis * inside + x;
T res = basicInput[0];
for (int v=1; v<axis; ++v) {
res *= basicInput[v * inside];
}
output[y * inside + x] = res;
}
return;
}
ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto input = (void*)inputs[0]->deviceId();
auto output = (void*)outputs[0]->deviceId();
ErrorCode ReductionExecution::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
int inside = 1;
int outside = 1;
@@ -104,52 +24,88 @@ ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, con
for (int i=mAxis+1; i<inputs[0]->dimensions(); ++i) {
inside *= inputs[0]->length(i);
}
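// Upload the reduce geometry to device memory once at resize time; the kernels read it through a ReduceParam* at execute time.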
mCpuParam.inside = inside;
mCpuParam.outside = outside;
mCpuParam.axis = axis;
cuda_check(cudaMemcpy((uint8_t*)mParam.first + mParam.second, &mCpuParam, sizeof(ReduceParam), cudaMemcpyHostToDevice));
return NO_ERROR;
}
ErrorCode ReductionExecution::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto input = (void*)inputs[0]->deviceId();
auto output = (void*)outputs[0]->deviceId();
auto runtime = static_cast<CUDABackend*>(backend())->getCUDARuntime();
int inside = mCpuParam.inside;
int outside = mCpuParam.outside;
int count = inside * outside;
int block_num = runtime->blocks_num(count);
int threads_num = runtime->threads_num();
auto param = (ReduceParam*)((uint8_t*)mParam.first + mParam.second);
if (inputs[0]->getType() == halide_type_of<float>()) {
switch (mType) {
case ReductionType_MEAN:
MEAN<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
return NO_ERROR;
case ReductionType_SUM:
SUM<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
return NO_ERROR;
case ReductionType_MINIMUM:
MINIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
return NO_ERROR;
case ReductionType_MAXIMUM:
MAXIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
return NO_ERROR;
case ReductionType_PROD:
PROD<<<block_num, threads_num>>>((const float*)input, (float*)output, inside, axis, outside);
return NO_ERROR;
if (static_cast<CUDABackend*>(backend())->useFp16()) {
switch (mType) {
case ReductionType_MEAN:
MEAN<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
return NO_ERROR;
case ReductionType_SUM:
SUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
return NO_ERROR;
case ReductionType_MINIMUM:
MINIMUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
return NO_ERROR;
case ReductionType_MAXIMUM:
MAXIMUM<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
return NO_ERROR;
case ReductionType_PROD:
PROD<<<block_num, threads_num>>>((const half*)input, (half*)output, param);
return NO_ERROR;
}
} else {
switch (mType) {
case ReductionType_MEAN:
MEAN<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
return NO_ERROR;
case ReductionType_SUM:
SUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
return NO_ERROR;
case ReductionType_MINIMUM:
MINIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
return NO_ERROR;
case ReductionType_MAXIMUM:
MAXIMUM<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
return NO_ERROR;
case ReductionType_PROD:
PROD<<<block_num, threads_num>>>((const float*)input, (float*)output, param);
return NO_ERROR;
}
}
MNN_ASSERT(false);
return NOT_SUPPORT;
}
MNN_ASSERT(inputs[0]->getType() == halide_type_of<int32_t>());
switch (mType) {
case ReductionType_MEAN:
MEAN<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
MEAN<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
case ReductionType_SUM:
SUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
SUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
case ReductionType_MINIMUM:
MINIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
MINIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
case ReductionType_MAXIMUM:
MAXIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
MAXIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
case ReductionType_PROD:
PROD<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
PROD<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
case ReductionType_ANY:
MAXIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
MAXIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
case ReductionType_ALL:
MINIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, inside, axis, outside);
MINIMUM<<<block_num, threads_num>>>((const int32_t*)input, (int32_t*)output, param);
return NO_ERROR;
}
MNN_ASSERT(false);

Some files were not shown because too many files have changed in this diff.