Mirror of https://github.com/alibaba/MNN.git

Commit d6795ad031 (parent 939a80dba8): "Github release 1.1.0"
@@ -330,7 +330,6 @@ project/android/.idea/caches/build_file_checksums.ser
 # FIXME(haijing): Xcode pre-build stage breaks compilation of flatbuffers by setting envs that do cmake cross-compilation for iOS
 # schema/current
 schema/private
-schema/current
 tools/converter/source/IR
 benchmark/benchmark.txt
 
@@ -345,18 +344,13 @@ pymnn/android/.idea/modules.xml
 pymnn/android/.idea/runConfigurations.xml
 pymnn/android/.idea/vcs.xml
 pymnn/android/.idea/caches/build_file_checksums.ser
+pymnn/src/pybind_private/
 
 buildios
 build*/
 include/MNN/VCS.h
-source/backend/opencl/execution/cl/codegen/opencl_program.cc
-source/backend/opencl/execution/cl/opencl_program.cc
-# FIXME(haijing): MTL issues.....
-# source/backend/metal/MetalOPRegister.mm
 source/backend/opengl/AllShader.cpp
 include/MNN/backend/opengl/shaders/AllShader.h
-source/backend/vulkan/compiler/AllShader.cpp
-include/MNN/backend/vulkan/shaders/AllShader.h
 .idea
 project/ios/ios_64
 project/ios/ios_32
CMakeLists.txt (124 changed lines)
@@ -49,6 +49,7 @@ include(FindPythonInterp REQUIRED)
 option(MNN_USE_SYSTEM_LIB "For opencl and vulkan, use system lib or use dlopen" OFF)
 option(MNN_BUILD_HARD "Build -mfloat-abi=hard or not" OFF)
 option(MNN_BUILD_SHARED_LIBS "MNN build shared or static lib" ON)
+option(MNN_WIN_RUNTIME_MT "MNN use /MT on Windows dll" OFF)
 option(MNN_FORBID_MULTI_THREAD "Disable Multi Thread" OFF)
 option(MNN_OPENMP "Use OpenMP's thread pool implementation. Does not work on iOS or Mac OS" OFF)
 option(MNN_USE_THREAD_POOL "Use MNN's own thread pool implementation" ON)
@@ -62,14 +63,14 @@ option(MNN_SUPPORT_TFLITE_QUAN "Enable MNN's tflite quantized op" ON)
 option(MNN_DEBUG_MEMORY "MNN Debug Memory Access" OFF)
 option(MNN_DEBUG_TENSOR_SIZE "Enable Tensor Size" OFF)
 option(MNN_GPU_TRACE "Enable MNN Gpu Debug" OFF)
-option(MNN_OPENCL_LWS_TUNE "Enable MNN OpenCL Lws Tuning" ON)
 option(MNN_PORTABLE_BUILD "Link the static version of third party libraries where possible to improve the portability of built executables" OFF)
 option(MNN_SEP_BUILD "Build MNN Backends and expression seperately. Only works with MNN_BUILD_SHARED_LIBS=ON" ON)
 option(NATIVE_LIBRARY_OUTPUT "Native Library Path" OFF)
 option(NATIVE_INCLUDE_OUTPUT "Native Include Path" OFF)
 option(MNN_AAPL_FMWK "Build MNN.framework instead of traditional .a/.dylib" OFF)
-option(MNN_FMA_ENABLE "x86 routine use fma extension" OFF)
 option(MNN_WITH_PLUGIN "Build with plugin op support." OFF)
+option(MNN_BUILD_MINI "Build MNN-MINI that just supports fixed shape models." OFF)
+option(MNN_USE_SSE "Use SSE optimization for x86 if possiable" ON)
 
 IF(NOT MNN_BUILD_SHARED_LIBS)
 message(WARNING "Close MNN_SEP_BUILD for static library")
@@ -79,13 +80,14 @@ IF(APPLE AND MNN_AAPL_FMWK AND MNN_SEP_BUILD)
 message(WARNING "MNN_SEP_BUILD AND MNN_AAPL_FMWK can't coexist. Turning off MNN_SEP_BUILD")
 SET(MNN_SEP_BUILD OFF CACHE BOOL "<docstring>" FORCE)
 ENDIF()
-IF(MSVC OR WIN32)
+IF(WIN32)
 IF(MNN_SEP_BUILD)
 message(WARNING "MNN_SEP_BUILD IS TROUBLESOME ON Windows. Forcing OFF...")
 SET(MNN_SEP_BUILD OFF CACHE BOOL "<docstring>" FORCE)
 ENDIF()
-SET(MNN_USE_SYSTEM_LIB ON CACHE BOOL "<docstring>" FORCE)
+add_definitions(-D_CRT_SECURE_NO_WARNINGS)
 
+IF(MSVC)
 # generate optimized (release) exe and library with pdb debug file, https://stackoverflow.com/a/31264946
 SET(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
 SET(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
@@ -94,12 +96,13 @@ IF(MSVC OR WIN32)
 
 SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275")
 SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275")
+ENDIF()
 ENDIF()
 
 include(${CMAKE_CURRENT_LIST_DIR}/cmake/macros.cmake)
 
 IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT MNN_BUILD_SHARED_LIBS AND NOT (MSVC OR WIN32))
-SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static")
+SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
 SET(MNN_SEP_BUILD OFF CACHE BOOL "<docstring>" FORCE)
 IF(MNN_BUILD_CONVERTER)
 SET(MNN_PORTABLE_BUILD ON CACHE BOOL "<docstring>" FORCE)
@@ -117,6 +120,9 @@ endif()
 if(MNN_SUPPORT_TFLITE_QUAN)
 add_definitions(-DMNN_SUPPORT_TFLITE_QUAN)
 endif()
+if(MNN_BUILD_MINI)
+add_definitions(-DMNN_BUILD_MINI)
+endif()
 
 # debug options
 if(MNN_DEBUG_MEMORY)
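A brief aside on the MNN_BUILD_MINI toggle added above: add_definitions(-DMNN_BUILD_MINI) only defines a preprocessor symbol. The sketch below is not part of this commit; beyond the macro name, everything in it is illustrative, and it simply shows the usual way such a compile definition is consumed on the C++ side.

// Illustrative only: code gated on the MNN_BUILD_MINI definition
// added by add_definitions(-DMNN_BUILD_MINI) in the hunk above.
#include <cstdio>

void printBuildVariant() {
#ifdef MNN_BUILD_MINI
    // Mini build: the option text above says only fixed-shape models are supported,
    // and the CMake below only links the Transform objects when MNN_BUILD_MINI is OFF.
    std::printf("MNN mini build: fixed shape models only\n");
#else
    std::printf("MNN full build\n");
#endif
}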
@@ -128,9 +134,6 @@ endif()
 if(MNN_GPU_TRACE)
 add_definitions(-DMNN_GPU_FORCE_FINISH)
 endif()
-if(MNN_OPENCL_LWS_TUNE)
-add_definitions(-DMNN_OPENCL_LWS_TUNE)
-endif()
 
 # backend options
 option(MNN_METAL "Enable Metal" OFF)
@@ -138,11 +141,8 @@ option(MNN_OPENCL "Enable OpenCL" OFF)
 option(MNN_OPENGL "Enable OpenGL" OFF)
 option(MNN_VULKAN "Enable Vulkan" OFF)
 option(MNN_ARM82 "Enable ARM82" OFF)
-# codegen register ops
-if (MNN_METAL)
-add_definitions(-DMNN_CODEGEN_REGISTER)
-endif()
+option(MNN_CUDA "Enable CUDA" OFF)
+option(MNN_TENSORRT "Enable TensorRT" OFF)
 
 # target options
 option(MNN_BUILD_BENCHMARK "Build benchmark or not" OFF)
@@ -165,11 +165,13 @@ message(STATUS "\tOpenCL: ${MNN_OPENCL}")
 message(STATUS "\tOpenGL: ${MNN_OPENGL}")
 message(STATUS "\tVulkan: ${MNN_VULKAN}")
 message(STATUS "\tARM82: ${MNN_ARM82}")
+message(STATUS "\tTensorRT: ${MNN_TENSORRT}")
+message(STATUS "\tCUDA: ${MNN_CUDA}")
 message(STATUS "\tOpenMP: ${MNN_OPENMP}")
 message(STATUS "\tHidden: ${MNN_HIDDEN}")
 message(STATUS "\tBuild Path: ${CMAKE_CURRENT_BINARY_DIR}")
 
-if(WIN32)
+if(MSVC)
 if(${CMAKE_VERSION} VERSION_LESS "3.14.0")
 message(FATAL_ERROR "MNN requires CMake 3.14+ to build on Windows!")
 endif()
@@ -178,14 +180,14 @@ if(WIN32)
 CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
 CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
 CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-if (MNN_BUILD_SHARED_LIBS)
-if(${flag_var} MATCHES "/MT")
-string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
-endif()
-else ()
+if (MNN_WIN_RUNTIME_MT)
 if(${flag_var} MATCHES "/MD")
 string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
 endif()
+else ()
+if(${flag_var} MATCHES "/MT")
+string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
+endif()
 endif ()
 endforeach()
 elseif(CMAKE_SYSTEM_NAME MATCHES "^Android" OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
@@ -270,6 +272,8 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "^Linux")
 endif()
 include_directories(${CMAKE_CURRENT_LIST_DIR}/include/
 ${CMAKE_CURRENT_LIST_DIR}/source/
+${CMAKE_CURRENT_LIST_DIR}/express/
+${CMAKE_CURRENT_LIST_DIR}/tools/
 ${CMAKE_CURRENT_LIST_DIR}/schema/current/
 ${CMAKE_CURRENT_LIST_DIR}/3rd_party/
 ${CMAKE_CURRENT_LIST_DIR}/3rd_party/flatbuffers/include
@@ -293,13 +297,13 @@ FILE(GLOB MNN_CV_SRC ${CMAKE_CURRENT_LIST_DIR}/source/cv/*)
 add_library(MNNCV OBJECT ${MNN_CV_SRC})
 list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNCV>)
 list(APPEND MNN_TARGETS MNNCV)
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)|(i686)")
-if(WIN32 OR MSVC)
-target_compile_options(MNNCV PRIVATE /arch:AVX)
-else()
+if (MNN_USE_SSE)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)|(i686)")
+if (NOT MSVC)
 target_compile_options(MNNCV PRIVATE -msse3)
 target_compile_options(MNNCV PRIVATE -mavx)
 endif()
+endif()
 endif()
 
 # Math
@@ -308,11 +312,19 @@ add_library(MNNMath OBJECT ${MNN_Math_SRC})
 list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNMath>)
 list(APPEND MNN_TARGETS MNNMath)
 
-# Shape
-FILE(GLOB MNN_Shape_SRC ${CMAKE_CURRENT_LIST_DIR}/source/shape/*)
-add_library(MNNShape OBJECT ${MNN_Shape_SRC})
-list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNShape>)
-list(APPEND MNN_TARGETS MNNShape)
+# Transform
+FILE(GLOB MNN_Transform_SRC ${CMAKE_CURRENT_LIST_DIR}/source/shape/* ${CMAKE_CURRENT_LIST_DIR}/source/geometry/*)
+add_library(MNNTransform OBJECT ${MNN_Transform_SRC})
+IF (NOT MNN_BUILD_MINI)
+list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNTransform>)
+ENDIF()
+list(APPEND MNN_TARGETS MNNTransform)
 
+# Utils
+FILE(GLOB MNN_Utils_SRC ${CMAKE_CURRENT_LIST_DIR}/source/utils/*)
+add_library(MNNUtils OBJECT ${MNN_Utils_SRC})
+list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNUtils>)
+list(APPEND MNN_TARGETS MNNUtils)
 
 # Compute
 FILE(GLOB MNN_Compute_SRC ${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/compute/*)
@@ -327,7 +339,9 @@ list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNCPU>)
 list(APPEND MNN_TARGETS MNNCPU)
 
 # X86_64 AVX/SSE
+if (MNN_USE_SSE)
 include(${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/x86_x64/CMakeLists.txt)
+endif()
 
 # AArch32/64 Assemblies
 include(${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/arm/CMakeLists.txt)
@@ -377,7 +391,7 @@ if (NOT APPLE)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS}")
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
-if (WIN32)
+if (MSVC)
 set(OpenMP_C_FLAGS "/openmp ${OpenMP_C_FLAGS}")
 set(OpenMP_CXX_FLAGS "/openmp ${OpenMP_CXX_FLAGS}")
 endif()
@@ -387,20 +401,22 @@ endif()
 
 set(CMAKE_CXX_FLAGS_ORIGIN ${CMAKE_CXX_FLAGS})
 set(CMAKE_C_FLAGS_ORIGIN ${CMAKE_C_FLAGS})
-if ((NOT (MSVC OR WIN32)) AND MNN_HIDDEN)
+if ((NOT MSVC) AND MNN_HIDDEN)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility-inlines-hidden -fvisibility=hidden")
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
-if (NOT APPLE)
+# Omit frame pointer may cause difficult debug
+if ((NOT APPLE) AND (NOT WIN32))
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fomit-frame-pointer")
 endif()
 endif()
-if (NOT (MSVC OR WIN32))
+if (NOT MSVC)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math -fno-rtti -fno-exceptions ")
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math")
 endif()
 
 # Metal
-include(${CMAKE_CURRENT_LIST_DIR}/source/backend/metal/CMakeLists.txt)
+set(MNN_DEPS "")
+set(MNN_EXTRA_DEPENDS "")
 list(APPEND MNN_DEPS MNN)
 
 # Plugin
@@ -409,6 +425,14 @@ if(MNN_WITH_PLUGIN)
 include(${CMAKE_CURRENT_LIST_DIR}/source/plugin/CMakeLists.txt)
 endif()
 
+# Metal
+if(MNN_METAL AND APPLE)
+add_definitions(-DMNN_METAL_ENABLED=1)
+include(${CMAKE_CURRENT_LIST_DIR}/source/backend/metal/CMakeLists.txt)
+list(APPEND MNN_TARGETS MNNMetal)
+list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNMetal>)
+endif()
+
 # Vulkan
 IF(MNN_VULKAN)
 add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/vulkan/)
@@ -446,22 +470,34 @@ IF(MNN_OPENGL)
 ENDIF()
 ENDIF()
 
+# CUDA
+IF(MNN_CUDA)
+add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/cuda/)
+list(APPEND MNN_TARGETS MNN_CUDA)
+list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_CUDA>)
+list(APPEND MNN_EXTRA_DEPENDS ${MNN_CUDA_LIBS})
+ENDIF()
+
 IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR IOS_ARCH STREQUAL "arm64")
 # ARM82 Assemblies
 IF(MNN_ARM82)
 add_definitions(-DENABLE_ARMV82)
 add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/arm82/)
-IF(MNN_SEP_BUILD)
-list(APPEND MNN_DEPS MNN_Arm82)
-ELSE()
 list(APPEND MNN_TARGETS MNN_Arm82)
 list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_Arm82>)
 ENDIF()
-ENDIF()
 ENDIF()
 # Express
 add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/express/)
 
+# TensorRT
+IF(MNN_TENSORRT)
+add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/tensorrt/)
+list(APPEND MNN_TARGETS MNN_TRT)
+list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_TRT>)
+list(APPEND MNN_EXTRA_DEPENDS ${MNN_TRT_LIBS})
+ENDIF()
+
 IF(MNN_SEP_BUILD)
 add_library(MNN SHARED ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS})
 target_link_libraries(MNN PUBLIC ${MNN_EXTRA_DEPENDS})
@@ -471,7 +507,7 @@ ELSE()
 list(APPEND MNN_TARGETS MNNExpress)
 IF(MNN_BUILD_SHARED_LIBS)
 add_library(MNN SHARED ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS})
-if (MSVC OR WIN32)
+if (WIN32)
 foreach(TARGET ${MNN_TARGETS})
 target_compile_definitions(${TARGET} PRIVATE "-DBUILDING_MNN_DLL")
 target_compile_definitions(${TARGET} INTERFACE "-DUSING_MNN_DLL")
@@ -484,7 +520,7 @@ ELSE()
 ENDIF()
 target_link_libraries(MNN PUBLIC ${MNN_EXTRA_DEPENDS})
 ENDIF()
-if (MSVC OR WIN32)
+if (MSVC)
 target_link_options(MNN PRIVATE "/IGNORE:4049,4217")
 endif()
 
@@ -504,9 +540,11 @@ if(APPLE)
 target_link_libraries(MNN PUBLIC ${FOUNDATION})
 find_library(METAL Metal REQUIRED)
 target_link_libraries(MNN PUBLIC ${METAL})
+find_library(GRAPHIC CoreGraphics)
+target_link_libraries(MNN PUBLIC ${GRAPHIC})
 ENDIF()
 endif()
-add_dependencies(MNN MNNCore MNNCV MNNShape MNNMath MNNCompute MNNCPU GenVCSHDR)
+add_dependencies(MNN MNNCore MNNCV MNNTransform MNNMath MNNCompute MNNCPU GenVCSHDR)
 add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tools/converter)
 
 if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
@@ -532,12 +570,6 @@ if (NOT MNN_BUILD_SHARED_LIBS)
 endif()
 endif()
 list(APPEND MNN_TARGETS MNN)
-FOREACH(TARGET ${MNN_TARGETS})
-IF((NOT MSVC) AND (NOT WIN32))
-else()
-target_compile_definitions(${TARGET} PRIVATE _CRT_SECURE_NO_WARNINGS)
-endif()
-ENDFOREACH()
 list(REMOVE_ITEM MNN_TARGETS MNN)
 IF(MNN_BUILD_DEMO)
 include(${CMAKE_CURRENT_LIST_DIR}/demo/exec/CMakeLists.txt)
@@ -46,6 +46,7 @@ Pod::Spec.new do |s|
 'schema/current/*.{h}',\
 '3rd_party/flatbuffers/include/flatbuffers/*.{h}',\
 'source/core/**/*.{h,c,m,mm,cc,hpp,cpp}',\
+'source/geometry/**/*.{h,c,m,mm,cc,hpp,cpp}',\
 'source/cv/**/*.{h,c,m,mm,cc,hpp,cpp}',\
 'source/math/**/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
 'source/shape/*.{h,c,m,mm,cc,hpp,cpp}',\
@@ -66,7 +66,7 @@ Interpreter由Engine和Backends构成。前者负责模型的加载、计算图
 
 三群：
 
-<img src="doc/DingTalkQR3.png" height="256"/>
+<img src="doc/DingTalkQR23.png" height="256"/>
 
 ## License
 Apache 2.0
@@ -0,0 +1,89 @@
//
// CPUBatchMatMul.cpp
// MNN
//
// Created by MNN on 2019/03/25.
// Copyright © 2018, Alibaba Group Holding Limited
//

#include "backend/cpu/CPUBatchMatMul.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "math/Matrix.hpp"

namespace MNN {

CPUBatchMatMul::CPUBatchMatMul(Backend* backend, bool adjX, bool adjY) : Execution(backend) {
    mMatMul.reset(new CPUMatMul(backend, adjX, adjY, true));
}

ErrorCode CPUBatchMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto input0 = inputs[0];
    auto input1 = inputs[1];
    auto output = outputs[0];
    // Fill output by zero if one of inputs is empty.
    if (input0->elementSize() == 0 || input1->elementSize() == 0) {
        return NO_ERROR;
    }
    auto dimensions = input0->dimensions();
    mMatrixA.reset(Tensor::createDevice<float>({input0->length(input0->dimensions()-2), input0->length(input0->dimensions()-1)}));
    mMatrixB.reset(Tensor::createDevice<float>({input1->length(input1->dimensions()-2), input1->length(input0->dimensions()-1)}));
    mMatrixC.reset(Tensor::createDevice<float>({output->length(output->dimensions()-2), output->length(output->dimensions()-1)}));
    mTempInputs = {mMatrixA.get(), mMatrixB.get()};
    mTempOutputs = {mMatrixC.get()};
    auto res = backend()->onAcquireBuffer(mMatrixA.get(), Backend::DYNAMIC);
    res = res && backend()->onAcquireBuffer(mMatrixB.get(), Backend::DYNAMIC);
    res = res && backend()->onAcquireBuffer(mMatrixC.get(), Backend::DYNAMIC);

    if (!res) {
        return OUT_OF_MEMORY;
    }
    int batch = 1;
    for (int i = 0; i < dimensions - 2; ++i) {
        batch *= input0->length(i);
    }
    mBatch = batch;
    auto code = mMatMul->onResize(mTempInputs, mTempOutputs);
    backend()->onReleaseBuffer(mMatrixA.get(), Backend::DYNAMIC);
    backend()->onReleaseBuffer(mMatrixB.get(), Backend::DYNAMIC);
    backend()->onReleaseBuffer(mMatrixC.get(), Backend::DYNAMIC);
    return code;
}

ErrorCode CPUBatchMatMul::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
    auto input0 = inputs[0];
    auto input1 = inputs[1];
    auto output = outputs[0];
    // Fill output by zero if one of inputs is empty.
    if (input0->elementSize() == 0 || input1->elementSize() == 0) {
        ::memset(output->host<float>(), 0, output->size());
        return NO_ERROR;
    }
    const int dimensions = input0->dimensions();
    MNN_ASSERT(dimensions >= 3);
    const int input0Stride = input0->length(dimensions - 1) * input0->length(dimensions - 2);
    const int input1Stride = input1->length(dimensions - 1) * input1->length(dimensions - 2);
    const int outputStride = output->length(dimensions - 1) * output->length(dimensions - 2);
    const auto input0Ptr = input0->host<float>();
    const auto input1Ptr = input1->host<float>();
    float* const outputPtr = output->host<float>();

    for (int i = 0; i < mBatch; ++i) {
        ::memcpy(mMatrixA->host<float>(), input0Ptr + i * input0Stride, input0Stride * sizeof(float));
        ::memcpy(mMatrixB->host<float>(), input1Ptr + i * input1Stride, input1Stride * sizeof(float));
        mMatMul->onExecute(mTempInputs, mTempOutputs);
        ::memcpy(outputPtr + i * outputStride, mMatrixC->host<float>(), outputStride * sizeof(float));
    }
    return NO_ERROR;
}

class CPUBatchMatMulCreator : public CPUBackend::Creator {
public:
    virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                                const MNN::Op* op, Backend* backend) const override {
        return new CPUBatchMatMul(backend, op->main_as_BatchMatMulParam()->adjX(), op->main_as_BatchMatMulParam()->adjY());
    }
};

REGISTER_CPU_OP_CREATOR(CPUBatchMatMulCreator, OpType_BatchMatMul);

} // namespace MNN
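The new CPUBatchMatMul above copies each batch slice into 2-D scratch tensors and reuses a single CPUMatMul for every slice. As a plain-C++ sketch of that decomposition (illustrative names only, no MNN types, row-major layout assumed):

#include <cstddef>

// Computes C[b] = A[b] * B[b] for `batch` independent slices, mirroring the
// copy-then-2D-matmul structure of CPUBatchMatMul::onExecute.
// A is batch x m x k, B is batch x k x n, C is batch x m x n, row-major.
void batchMatMulReference(const float* A, const float* B, float* C,
                          int batch, int m, int k, int n) {
    const size_t strideA = (size_t)m * k;
    const size_t strideB = (size_t)k * n;
    const size_t strideC = (size_t)m * n;
    for (int b = 0; b < batch; ++b) {
        const float* a = A + b * strideA;
        const float* bMat = B + b * strideB;
        float* c = C + b * strideC;
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < n; ++j) {
                float acc = 0.0f;
                for (int p = 0; p < k; ++p) {
                    acc += a[i * k + p] * bMat[p * n + j];
                }
                c[i * n + j] = acc;
            }
        }
    }
}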
@@ -0,0 +1,35 @@
//
// CPUBatchMatMul.hpp
// MNN
//
// Created by MNN on 2019/03/25.
// Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef CPUBatchMatMul_hpp
#define CPUBatchMatMul_hpp

#include "backend/cpu/CPUMatMul.hpp"

namespace MNN {

class CPUBatchMatMul : public Execution {
public:
    CPUBatchMatMul(Backend *backend, bool adjX, bool adjY);
    virtual ~CPUBatchMatMul() = default;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    int mBatch;
    std::shared_ptr<Execution> mMatMul;
    std::vector<Tensor*> mTempInputs;
    std::vector<Tensor*> mTempOutputs;
    std::shared_ptr<Tensor> mMatrixA;
    std::shared_ptr<Tensor> mMatrixB;
    std::shared_ptr<Tensor> mMatrixC;
};

} // namespace MNN

#endif /* CPUBatchMatMul_hpp */
@@ -18,7 +18,6 @@
 #include "backend/cpu/compute/ConvOpt.h"
 #include "backend/cpu/CPUBackend.hpp"
 #include "backend/cpu/compute/ConvolutionFloatFactory.h"
-#include "math/Vec4.hpp"
 
 #define MIN_CON_PLANESIZE 256
 
@@ -10,7 +10,9 @@
 #include <math.h>
 #include "backend/cpu/CPUBackend.hpp"
 #include "core/Macro.h"
-#include "math/Vec4.hpp"
+#include "math/Vec.hpp"
 
+using Vec4 = MNN::Math::Vec<float, 4>;
+
 namespace MNN {
 
@@ -39,12 +41,12 @@ ErrorCode CPUCosineSimilarity::onExecute(const std::vector<Tensor*>& inputs, con
 const auto x1ChannelPtr = x1DataBatchPtr + j;
 const auto x2ChannelPtr = x2DataBatchPtr + j;
 
-Math::Vec4 innerProduct(.0f);
-Math::Vec4 x1Square(.0f);
-Math::Vec4 x2Square(.0f);
+Vec4 innerProduct(.0f);
+Vec4 x1Square(.0f);
+Vec4 x2Square(.0f);
 for (int c = 0; c < channel; ++c) {
-Math::Vec4 x1Data = Math::Vec4::load(x1ChannelPtr + c * channleStride);
-Math::Vec4 x2Data = Math::Vec4::load(x2ChannelPtr + c * channleStride);
+Vec4 x1Data = Vec4::load(x1ChannelPtr + c * channleStride);
+Vec4 x2Data = Vec4::load(x2ChannelPtr + c * channleStride);
 auto x1Xx2 = x1Data * x2Data;
 innerProduct = innerProduct + x1Xx2;
 x1Square = x1Square + x1Data * x1Data;
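For reference, the per-position quantity this loop accumulates with 4-lane vectors (the dot product plus the two squared norms) is the usual cosine similarity; in LaTeX:

\cos(x_1, x_2) = \frac{\sum_{c} x_{1,c}\, x_{2,c}}{\sqrt{\sum_{c} x_{1,c}^2}\,\sqrt{\sum_{c} x_{2,c}^2}}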
@@ -12,8 +12,8 @@
 #include "core/Concurrency.h"
 #include "core/Macro.h"
 
-#include "math/Vec4.hpp"
-using MNN::Math::Vec4;
+#include "math/Vec.hpp"
+using Vec4 = MNN::Math::Vec<float, 4>;
 
 namespace MNN {
 
@@ -21,7 +21,7 @@ public:
 auto parameter = op->main_as_InnerProduct();
 int outputCount = parameter->outputCount();
 int srcCount = parameter->weight()->size() / outputCount;
-mWeight.reset(CPUConvolution::reorderWeightSize(srcCount, outputCount, 1, 4));
+mWeight.reset(CPUConvolution::reorderWeightSize(srcCount, outputCount, 1, 4, 4));
 if (mWeight.get() == nullptr) {
 mValid = false;
 return;
@@ -180,6 +180,14 @@ ErrorCode CPULSTM::onResize(const std::vector<Tensor *> &inputs, const std::vect
 ::memcpy(mBiasC->host<float>(), mLSTM->bias()->float32s()->data(), mBiasC->size());
 ::memcpy(mWeightH->host<float>(), mLSTM->weightH()->float32s()->data(), mWeightH->size());
 }
+if (mGateHaveBias) {
+// Merge bias
+auto biasPtr = mBiasC->host<float>();
+auto biasPtr2 = biasPtr + 4 * numUnits;
+for (int i=0; i<4*numUnits; ++i) {
+biasPtr[i] = biasPtr[i] + biasPtr2[i];
+}
+}
 }
 
 if (inputs.size() > 1) {
@@ -260,16 +268,8 @@ ErrorCode CPULSTM::onExecute(const std::vector<Tensor *> &inputs, const std::vec
 MNN_CONCURRENCY_END();
 
 float* biasStartPtr = mBiasC->host<float>();
-if(!mGateHaveBias){
-biasStartPtr = nullptr;
-}
 mRetriveOutputFunction(mGates.host<float>(), biasStartPtr);
 
-float* recurrenceBiasStartPtr = mBiasC->host<float>();
-if(mGateHaveBias){
-recurrenceBiasStartPtr += 4 * numUnits;
-}
-
 // tranform
 const float *contData = nullptr;
 if (inputs.size() > 1) {
@@ -330,14 +330,11 @@ ErrorCode CPULSTM::onExecute(const std::vector<Tensor *> &inputs, const std::vec
 }
 
 // add bias
-auto biasPtr = recurrenceBiasStartPtr + oc;
-I = sigmoid(*biasPtr + I);
-biasPtr = biasPtr + numUnits;
-F = sigmoid(*biasPtr + F);
-biasPtr = biasPtr + numUnits;
-O = sigmoid(*biasPtr + O);
-biasPtr = biasPtr + numUnits;
-G = tanhf(*biasPtr + G);
+//MNN_PRINT("%f, %f, %f, %f\n", I, O, F, G);
+I = sigmoid(I);
+F = sigmoid(F);
+O = sigmoid(O);
+G = tanhf(G);
 
 auto newCell = F * cellData[oc] + I * G;
 cellData[oc] = newCell;
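With the bias merge now done once in onResize, onExecute applies plain activations. The step the code above implements is the standard LSTM cell update, where the pre-activation gate sums already include the merged bias; in LaTeX:

i = \sigma(\hat{i}), \quad f = \sigma(\hat{f}), \quad o = \sigma(\hat{o}), \quad g = \tanh(\hat{g}), \qquad c_t = f \odot c_{t-1} + i \odot g

Here \hat{i}, \hat{f}, \hat{o}, \hat{g} are the gate pre-activations (biases folded in beforehand), matching I, F, O, G in the hunk above and newCell = F * cellData[oc] + I * G.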
@ -0,0 +1,311 @@
|
||||||
|
//
|
||||||
|
// CPUSoftmax.cpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2018/07/16.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#include "backend/cpu/CPUSoftmax.hpp"
|
||||||
|
#include <math.h>
|
||||||
|
#include "backend/cpu/CPUBackend.hpp"
|
||||||
|
#include "backend/cpu/compute/CommonOptFunction.h"
|
||||||
|
#include "core/Concurrency.h"
|
||||||
|
#include "core/Macro.h"
|
||||||
|
#include "core/TensorUtils.hpp"
|
||||||
|
#ifdef MNN_USE_NEON
|
||||||
|
#include <arm_neon.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace MNN {
|
||||||
|
|
||||||
|
int CPUSoftmax::_softmax1(const float *srcData, float *dstData, int outside, int channel, int threadNum) {
|
||||||
|
// Max and sub
|
||||||
|
MNN_CONCURRENCY_BEGIN(tId, threadNum)
|
||||||
|
{
|
||||||
|
const float *srcY = srcData + tId * channel;
|
||||||
|
float *dstY = dstData + tId * channel;
|
||||||
|
for (int y = (int)tId; y < outside; y += threadNum, srcY += channel * threadNum, dstY += channel * threadNum) {
|
||||||
|
float maxValue = srcY[0];
|
||||||
|
{
|
||||||
|
int c = 1;
|
||||||
|
#ifdef MNN_USE_NEON
|
||||||
|
#if !(defined(__ARM_FEATURE_FMA) && defined(__aarch64__))
|
||||||
|
#define vmaxvq_f32(v) \
|
||||||
|
({ \
|
||||||
|
float __m = v[0]; \
|
||||||
|
for (int i = 1; i < 4; i++) { \
|
||||||
|
if (v[i] > __m) \
|
||||||
|
__m = v[i]; \
|
||||||
|
} \
|
||||||
|
__m; \
|
||||||
|
})
|
||||||
|
#endif
|
||||||
|
if (c + 3 < channel) {
|
||||||
|
float32x4_t maxx4 = vld1q_f32(srcY + c);
|
||||||
|
c += 4;
|
||||||
|
for (; c + 3 < channel; c += 4) {
|
||||||
|
maxx4 = vmaxq_f32(maxx4, vld1q_f32(srcY + c));
|
||||||
|
}
|
||||||
|
float value = vmaxvq_f32(maxx4);
|
||||||
|
if (value > maxValue)
|
||||||
|
maxValue = value;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
for (; c < channel; ++c) {
|
||||||
|
float value = srcY[c];
|
||||||
|
if (value > maxValue)
|
||||||
|
maxValue = value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int c = 0; c < channel; ++c) {
|
||||||
|
dstY[c] = -srcY[c] + maxValue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
MNN_CONCURRENCY_END();
|
||||||
|
|
||||||
|
//Exp
|
||||||
|
auto schedule = ((CPUBackend*)backend())->multiThreadDivide(channel * outside);
|
||||||
|
int sizeDivide = schedule.first;
|
||||||
|
int scheduleNumber = schedule.second;
|
||||||
|
|
||||||
|
MNN_CONCURRENCY_BEGIN(tId, scheduleNumber) {
|
||||||
|
int start = sizeDivide * (int)tId;
|
||||||
|
int realSize = sizeDivide;
|
||||||
|
if (tId == scheduleNumber -1 ) {
|
||||||
|
realSize = channel * outside - start;
|
||||||
|
}
|
||||||
|
if (realSize > 0) {
|
||||||
|
MNNExp(dstData + start, dstData + start, realSize);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
MNN_CONCURRENCY_END();
|
||||||
|
|
||||||
|
// Sum and div
|
||||||
|
MNN_CONCURRENCY_BEGIN(tId, threadNum);
|
||||||
|
{
|
||||||
|
float *dstY = dstData + tId * channel;
|
||||||
|
for (int y = (int)tId; y < outside; y += threadNum, dstY += channel * threadNum) {
|
||||||
|
// sum
|
||||||
|
float sumValue = 0;
|
||||||
|
|
||||||
|
for (int c = 0; c < channel; ++c) {
|
||||||
|
sumValue += dstY[c];
|
||||||
|
}
|
||||||
|
|
||||||
|
// div
|
||||||
|
{
|
||||||
|
int c = 0;
|
||||||
|
#ifdef MNN_USE_NEON
|
||||||
|
float div = 1.f / sumValue;
|
||||||
|
for (; c + 3 < channel; c += 4) {
|
||||||
|
vst1q_f32(dstY + c, vmulq_n_f32(vld1q_f32(dstY + c), div));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
for (; c < channel; ++c) {
|
||||||
|
dstY[c] /= sumValue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
MNN_CONCURRENCY_END();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
int CPUSoftmax::_softmaxCommon(const float *srcData, float *dstData, int inside, int outside, int channel,
|
||||||
|
float *maxValue, float *sumValue, int threadNum) {
|
||||||
|
if (inside == 1)
|
||||||
|
return _softmax1(srcData, dstData, outside, channel, threadNum);
|
||||||
|
|
||||||
|
const int stepY = inside * channel;
|
||||||
|
MNN_CONCURRENCY_BEGIN(tId, threadNum);
|
||||||
|
{
|
||||||
|
const float *srcY = srcData + tId * stepY;
|
||||||
|
float *dstY = dstData + tId * stepY;
|
||||||
|
float *maxValueSub = maxValue + tId * inside;
|
||||||
|
|
||||||
|
for (int y = (int)tId; y < outside; y += threadNum, srcY += stepY * threadNum, dstY += stepY * threadNum) {
|
||||||
|
memcpy(maxValueSub, srcY, sizeof(float) * inside);
|
||||||
|
const float *src = srcY + inside;
|
||||||
|
for (int c = 1; c < channel; ++c, src += inside) {
|
||||||
|
for (int x = 0; x < inside; ++x) {
|
||||||
|
if (src[x] > maxValueSub[x])
|
||||||
|
maxValueSub[x] = src[x];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
src = srcY;
|
||||||
|
float *dst = dstY;
|
||||||
|
for (int c = 0; c < channel; ++c, src += inside, dst += inside) {
|
||||||
|
for (int x = 0; x < inside; ++x) {
|
||||||
|
dst[x] = -src[x] + maxValueSub[x];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
MNN_CONCURRENCY_END();
|
||||||
|
|
||||||
|
auto totalSize = channel * inside * outside;
|
||||||
|
//Exp
|
||||||
|
auto schedule = ((CPUBackend*)backend())->multiThreadDivide(totalSize);
|
||||||
|
int sizeDivide = schedule.first;
|
||||||
|
int scheduleNumber = schedule.second;
|
||||||
|
|
||||||
|
MNN_CONCURRENCY_BEGIN(tId, scheduleNumber) {
|
||||||
|
int start = sizeDivide * (int)tId;
|
||||||
|
int realSize = sizeDivide;
|
||||||
|
if (tId == scheduleNumber -1 ) {
|
||||||
|
realSize = totalSize - start;
|
||||||
|
}
|
||||||
|
if (realSize > 0) {
|
||||||
|
MNNExp(dstData + start, dstData + start, realSize);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
MNN_CONCURRENCY_END();
|
||||||
|
|
||||||
|
MNN_CONCURRENCY_BEGIN(tId, threadNum);
|
||||||
|
{
|
||||||
|
const float *srcY = srcData + tId * stepY;
|
||||||
|
float *dstY = dstData + tId * stepY;
|
||||||
|
float *sumValueSub = sumValue + tId * inside;
|
||||||
|
for (int y = (int)tId; y < outside; y += threadNum, srcY += stepY * threadNum, dstY += stepY * threadNum) {
|
||||||
|
memset(sumValueSub, 0, sizeof(float) * inside);
|
||||||
|
float *dst = dstY;
|
||||||
|
for (int c = 0; c < channel; ++c, dst += inside) {
|
||||||
|
for (int x = 0; x < inside; ++x) {
|
||||||
|
sumValueSub[x] += dst[x];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dst = dstY;
|
||||||
|
for (int c = 0; c < channel; ++c, dst += inside) {
|
||||||
|
for (int x = 0; x < inside; ++x) {
|
||||||
|
dst[x] /= sumValueSub[x];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
MNN_CONCURRENCY_END();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
ErrorCode CPUSoftmax::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||||
|
auto input = inputs[0];
|
||||||
|
const int dimensions = input->buffer().dimensions;
|
||||||
|
|
||||||
|
const auto layout = TensorUtils::getDescribe(input)->dimensionFormat;
|
||||||
|
mNeedUnpackC4 = layout == MNN_DATA_FORMAT_NC4HW4;
|
||||||
|
|
||||||
|
if (mNeedUnpackC4) {
|
||||||
|
int totalSize = 1;
|
||||||
|
for (int i = 1; i < dimensions; ++i) {
|
||||||
|
totalSize *= input->length(i);
|
||||||
|
}
|
||||||
|
mStorage.buffer().dim[0].extent = input->length(0);
|
||||||
|
mStorage.buffer().dim[1].extent = totalSize;
|
||||||
|
TensorUtils::getDescribe(&mStorage)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
|
||||||
|
mStorage.buffer().dimensions = 2;
|
||||||
|
mStorage.buffer().type = input->getType();
|
||||||
|
backend()->onAcquireBuffer(&mStorage, Backend::DYNAMIC);
|
||||||
|
}
|
||||||
|
|
||||||
|
int inside = 1;
|
||||||
|
int dims = input->buffer().dimensions;
|
||||||
|
for (int i = mAxis + 1; i < dims; ++i) {
|
||||||
|
inside *= input->length(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inside != 1) { // not run _softmax1, we need maxValue Tensor and sumValue Tensor.
|
||||||
|
int threadNum = ((CPUBackend *)backend())->threadNumber();
|
||||||
|
|
||||||
|
mMaxValue.buffer().dim[0].extent = inside * threadNum;
|
||||||
|
mMaxValue.buffer().dimensions = 1;
|
||||||
|
mMaxValue.setType(DataType_DT_FLOAT);
|
||||||
|
backend()->onAcquireBuffer(&mMaxValue, Backend::DYNAMIC);
|
||||||
|
|
||||||
|
mSumValue.buffer().dim[0].extent = inside * threadNum;
|
||||||
|
mSumValue.buffer().dimensions = 1;
|
||||||
|
mSumValue.setType(DataType_DT_FLOAT);
|
||||||
|
backend()->onAcquireBuffer(&mSumValue, Backend::DYNAMIC);
|
||||||
|
|
||||||
|
backend()->onReleaseBuffer(&mMaxValue, Backend::DYNAMIC);
|
||||||
|
backend()->onReleaseBuffer(&mSumValue, Backend::DYNAMIC);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mNeedUnpackC4) {
|
||||||
|
backend()->onReleaseBuffer(&mStorage, Backend::DYNAMIC);
|
||||||
|
}
|
||||||
|
|
||||||
|
return NO_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
ErrorCode CPUSoftmax::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
|
||||||
|
MNN_ASSERT(1 == inputs.size());
|
||||||
|
MNN_ASSERT(1 == outputs.size());
|
||||||
|
auto inputTensor = inputs[0];
|
||||||
|
auto outputTensor = outputs[0];
|
||||||
|
const auto inputDataPtr = inputTensor->host<float>();
|
||||||
|
auto outputDataPtr = outputTensor->host<float>();
|
||||||
|
const int batch = inputTensor->batch();
|
||||||
|
const auto dims = inputTensor->buffer().dimensions;
|
||||||
|
|
||||||
|
float *tempData = nullptr;
|
||||||
|
if (mNeedUnpackC4) {
|
||||||
|
tempData = mStorage.host<float>();
|
||||||
|
}
|
||||||
|
|
||||||
|
int areaInput = 1;
|
||||||
|
for (int i = 2; i < dims; ++i) {
|
||||||
|
areaInput *= inputTensor->length(i);
|
||||||
|
}
|
||||||
|
int inside = 1;
|
||||||
|
int outside = 1;
|
||||||
|
int channel = 1;
|
||||||
|
for (int i = 0; i < mAxis; ++i) {
|
||||||
|
outside *= inputTensor->length(i);
|
||||||
|
}
|
||||||
|
channel = inputTensor->length(mAxis);
|
||||||
|
for (int i = mAxis + 1; i < dims; ++i) {
|
||||||
|
inside *= inputTensor->length(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
int threadNum = ((CPUBackend *)backend())->threadNumber();
|
||||||
|
if (!mNeedUnpackC4) {
|
||||||
|
_softmaxCommon(inputDataPtr, outputDataPtr, inside, outside, channel, mMaxValue.host<float>(),
|
||||||
|
mSumValue.host<float>(), threadNum);
|
||||||
|
return NO_ERROR;
|
||||||
|
}
|
||||||
|
auto outputSize = outputTensor->elementSize();
|
||||||
|
int batchSize = outputSize / batch;
|
||||||
|
for (int batchIndex = 0; batchIndex < batch; ++batchIndex) {
|
||||||
|
auto inputData = inputDataPtr + batchIndex * batchSize;
|
||||||
|
MNNUnpackC4(outputDataPtr + batchIndex * mStorage.length(1), inputData, areaInput, inputTensor->channel());
|
||||||
|
}
|
||||||
|
_softmaxCommon(outputDataPtr, tempData, inside, outside, channel, mMaxValue.host<float>(), mSumValue.host<float>(), threadNum);
|
||||||
|
for (int batchIndex = 0; batchIndex < batch; ++batchIndex) {
|
||||||
|
auto outputData = outputDataPtr + batchIndex * batchSize;
|
||||||
|
auto tempPtr = tempData + batchIndex * mStorage.length(1);
|
||||||
|
MNNPackC4(outputData, tempPtr, areaInput, outputTensor->channel());
|
||||||
|
}
|
||||||
|
return NO_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
CPUSoftmax::CPUSoftmax(Backend *b, int axis) : MNN::Execution(b), mAxis(axis), mStorage(2), mNeedUnpackC4(false) {
|
||||||
|
// nothing to do
|
||||||
|
}
|
||||||
|
|
||||||
|
class CPUSoftmaxCreator : public CPUBackend::Creator {
|
||||||
|
public:
|
||||||
|
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
|
||||||
|
const MNN::Op *op, Backend *backend) const override {
|
||||||
|
auto axis = op->main_as_Axis()->axis();
|
||||||
|
if (axis < 0) {
|
||||||
|
axis = inputs[0]->dimensions() + axis;
|
||||||
|
}
|
||||||
|
return new CPUSoftmax(backend, axis);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
REGISTER_CPU_OP_CREATOR(CPUSoftmaxCreator, OpType_Softmax);
|
||||||
|
|
||||||
|
} // namespace MNN
|
|
@@ -0,0 +1,35 @@
//
// CPUSoftmax.hpp
// MNN
//
// Created by MNN on 2018/07/16.
// Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef CPUSoftmax_hpp
#define CPUSoftmax_hpp

#include "core/Execution.hpp"

namespace MNN {
class CPUSoftmax : public Execution {
public:
    CPUSoftmax(Backend *b, int axis);
    virtual ~CPUSoftmax() = default;
    virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
    virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;

private:
    int _softmaxCommon(const float *srcData, float *dstData, int inside, int outside, int channel, float *maxValue,
                       float *sumValue, int threadNum);
    int _softmax1(const float *srcData, float *dstData, int outside, int channel, int threadNum);

    int mAxis;
    Tensor mStorage;
    Tensor mMaxValue;
    Tensor mSumValue;
    bool mNeedUnpackC4;
};
} // namespace MNN

#endif /* CPUSoftmax_hpp */
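The header above declares _softmaxCommon over an (inside, outside, channel) decomposition of the input. The sketch below is a single-threaded reference of that computation assuming a dense [outside][channel][inside] layout; it is illustrative only and deliberately omits the NC4HW4 repacking and the threading the real CPUSoftmax.cpp handles.

#include <algorithm>
#include <cmath>

// Reference softmax along the channel axis for data laid out as
// [outside][channel][inside]. Subtracting the per-position maximum
// keeps std::exp numerically stable.
void softmaxReference(const float* src, float* dst,
                      int inside, int outside, int channel) {
    for (int o = 0; o < outside; ++o) {
        for (int i = 0; i < inside; ++i) {
            const float* s = src + o * channel * inside + i;
            float* d = dst + o * channel * inside + i;
            float maxValue = s[0];
            for (int c = 1; c < channel; ++c) {
                maxValue = std::max(maxValue, s[c * inside]);
            }
            float sum = 0.0f;
            for (int c = 0; c < channel; ++c) {
                d[c * inside] = std::exp(s[c * inside] - maxValue);
                sum += d[c * inside];
            }
            for (int c = 0; c < channel; ++c) {
                d[c * inside] /= sum;
            }
        }
    }
}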
@@ -13,10 +13,8 @@
 #include "backend/cpu/compute/ConvOpt.h"
 #include "core/Macro.h"
 #include "core/TensorUtils.hpp"
-#include "math/Vec4.hpp"
-using namespace MNN::Math;
+#include "math/Vec.hpp"
+using Vec4 = MNN::Math::Vec<float, 4>;
 
-typedef Vec4 float4;
-
 #define SOURCE_BLOCK 64
 #define WEIGHT_BLOCK 256
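Several hunks in this commit swap the concrete math/Vec4.hpp type for the templated math/Vec.hpp and restore the old name with using Vec4 = MNN::Math::Vec<float, 4>. The stand-in below is not MNN's implementation; it only sketches the Vec<T, N> + alias pattern and a call site in the style of the cosine-similarity hunk above.

// Minimal stand-in for the templated vector type this commit migrates to.
// MNN's real template lives in math/Vec.hpp; this scalar version only
// illustrates the interface the updated kernels rely on.
template <typename T, int N>
struct Vec {
    T value[N];
    explicit Vec(T v) { for (int i = 0; i < N; ++i) value[i] = v; }
    static Vec load(const T* addr) {
        Vec r(T(0));
        for (int i = 0; i < N; ++i) r.value[i] = addr[i];
        return r;
    }
    static void save(T* addr, const Vec& v) {
        for (int i = 0; i < N; ++i) addr[i] = v.value[i];
    }
    Vec operator+(const Vec& o) const {
        Vec r(*this);
        for (int i = 0; i < N; ++i) r.value[i] += o.value[i];
        return r;
    }
    Vec operator*(const Vec& o) const {
        Vec r(*this);
        for (int i = 0; i < N; ++i) r.value[i] *= o.value[i];
        return r;
    }
};

using Vec4 = Vec<float, 4>;  // same alias the updated files introduce

// Example call site: accumulate a 4-lane dot-product partial sum.
inline Vec4 dotAccumulate(Vec4 acc, const float* a, const float* b) {
    return acc + Vec4::load(a) * Vec4::load(b);
}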
@ -0,0 +1,128 @@
|
||||||
|
//
|
||||||
|
// GeometryCropAndResize.cpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2020/08/5.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#include "geometry/GeometryComputer.hpp"
|
||||||
|
#include "core/OpCommonUtils.hpp"
|
||||||
|
#include "geometry/GeometryComputerUtils.hpp"
|
||||||
|
#include "ConvertUtils.hpp"
|
||||||
|
|
||||||
|
namespace MNN {
|
||||||
|
class GeometryCropAndResize : public GeometryComputer {
|
||||||
|
public:
|
||||||
|
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, Context& context, CommandBuffer& res) const override {
|
||||||
|
MNN_ASSERT(4 == inputs.size());
|
||||||
|
MNN_ASSERT(1 == outputs.size());
|
||||||
|
auto img = inputs[0];
|
||||||
|
auto boxes = inputs[1];
|
||||||
|
auto box_ind = inputs[2];
|
||||||
|
auto crop_size = inputs[3];
|
||||||
|
auto output = outputs[0];
|
||||||
|
auto extrapolation = op->main_as_CropAndResize()->extrapolationValue();
|
||||||
|
auto method = op->main_as_CropAndResize()->method();
|
||||||
|
// resizeType of Interp : 1-NEAREST, 2-BILINEAR
|
||||||
|
const int resizeType = method == CropAndResizeMethod_BILINEAR ? 2 : 1;
|
||||||
|
|
||||||
|
int batch = img->length(0), ih = img->length(1), iw = img->length(2),
|
||||||
|
depth = img->length(3), boxNum = boxes->length(0);
|
||||||
|
const int cropHeight = crop_size->host<uint32_t>()[0],
|
||||||
|
cropWidth = crop_size->host<uint32_t>()[1];
|
||||||
|
|
||||||
|
auto des = TensorUtils::getDescribe(output);
|
||||||
|
des->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
|
||||||
|
des->dimensionFormat = MNN_DATA_FORMAT_NHWC;
|
||||||
|
des->regions.clear();
|
||||||
|
des->regions.reserve(boxNum);
|
||||||
|
for (int i = 0; i < boxNum; i++) {
|
||||||
|
const float y1 = boxes->host<float>()[i*4];
|
||||||
|
const float x1 = boxes->host<float>()[i*4+1];
|
||||||
|
const float y2 = boxes->host<float>()[i*4+2];
|
||||||
|
const float x2 = boxes->host<float>()[i*4+3];
|
||||||
|
const int ind = box_ind->host<uint32_t>()[i];
|
||||||
|
const float ch = (y2 - y1) * (ih - 1), cw = (x2 - x1) * (iw - 1);
|
||||||
|
const float yScale = ch / static_cast<float>(cropHeight - 1);
|
||||||
|
const float xScale = cw / static_cast<float>(cropWidth - 1);
|
||||||
|
const float yOffset = y1 * (ih - 1), xOffset = x1 * (iw - 1);
|
||||||
|
// select croped image from images, convert it's format from NHWC to NC4HW4
|
||||||
|
std::shared_ptr<Tensor> cropValue(new Tensor);
|
||||||
|
{
|
||||||
|
cropValue->buffer().type = halide_type_of<float>();
|
||||||
|
cropValue->buffer().dimensions = 4;
|
||||||
|
cropValue->setLength(0, 1);
|
||||||
|
cropValue->setLength(1, depth);
|
||||||
|
cropValue->setLength(2, ih);
|
||||||
|
cropValue->setLength(3, iw);
|
||||||
|
auto des = TensorUtils::getDescribe(cropValue.get());
|
||||||
|
des->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
|
||||||
|
des->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;
|
||||||
|
des->regions.clear();
|
||||||
|
Tensor::InsideDescribe::Region region;
|
||||||
|
region.origin = img;
|
||||||
|
region.size[1] = depth;
|
||||||
|
region.size[2] = ih * iw;
|
||||||
|
region.src.offset = ind * ih * iw * depth;
|
||||||
|
region.dst.offset = 0;
|
||||||
|
region.src.stride[1] = 1;
|
||||||
|
region.src.stride[2] = depth;
|
||||||
|
region.dst.stride[1] = ih * iw;
|
||||||
|
region.dst.stride[2] = 1;
|
||||||
|
des->regions.emplace_back(std::move(region));
|
||||||
|
res.extras.emplace_back(cropValue);
|
||||||
|
}
|
||||||
|
// using Interp Op deal with crop and resize for selected image
|
||||||
|
std::shared_ptr<Tensor> resizeValue;
|
||||||
|
{
|
||||||
|
resizeValue.reset(Tensor::createDevice<float>({1, depth, cropHeight, cropWidth}));
|
||||||
|
auto des = TensorUtils::getDescribe(resizeValue.get());
|
||||||
|
des->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;
|
||||||
|
std::unique_ptr<OpT> interp(new OpT);
|
||||||
|
interp->type = OpType_Interp;
|
||||||
|
interp->main.type = OpParameter_Interp;
|
||||||
|
interp->main.value = new InterpT;
|
||||||
|
interp->main.AsInterp()->widthScale = xScale;
|
||||||
|
interp->main.AsInterp()->heightScale = yScale;
|
||||||
|
interp->main.AsInterp()->widthOffset = xOffset;
|
||||||
|
interp->main.AsInterp()->heightOffset = yOffset;
|
||||||
|
interp->main.AsInterp()->alignCorners = false;
|
||||||
|
interp->main.AsInterp()->resizeType = resizeType;
|
||||||
|
auto cmd = GeometryComputerUtils::makeCommand(interp.get(), {cropValue.get()}, {resizeValue.get()});
|
||||||
|
res.extras.emplace_back(resizeValue);
|
||||||
|
res.command.emplace_back(cmd);
|
||||||
|
}
|
||||||
|
// convert resize image's format from NC4HW4 to NHWC, add it to output's batch
|
||||||
|
{
|
||||||
|
Tensor::InsideDescribe::Region region;
|
||||||
|
region.origin = resizeValue.get();
|
||||||
|
region.size[1] = cropHeight * cropWidth;
|
||||||
|
region.size[2] = depth;
|
||||||
|
region.src.offset = 0;
|
||||||
|
region.dst.offset = i * cropHeight * cropWidth * depth;
|
||||||
|
region.src.stride[1] = 1;
|
||||||
|
region.src.stride[2] = cropHeight * cropWidth;
|
||||||
|
region.dst.stride[1] = depth;
|
||||||
|
region.dst.stride[2] = 1;
|
||||||
|
des->regions.emplace_back(std::move(region));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
virtual std::vector<bool> onGetOutputVirtual(const Op* op, const std::vector<Tensor*>& inputs,
|
||||||
|
const std::vector<Tensor*>& outputs) const override {
|
||||||
|
//return {false};
|
||||||
|
return {true};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static void _create() {
|
||||||
|
std::shared_ptr<GeometryComputer> comp(new GeometryCropAndResize);
|
||||||
|
// GeometryComputer::registerGeometryComputer(comp, {OpType_CropAndResize});
|
||||||
|
}
|
||||||
|
|
||||||
|
REGISTER_GEOMETRY(GeometryCropAndResize, _create);
|
||||||
|
|
||||||
|
} // namespace MNN
|
|
@ -0,0 +1,304 @@
|
||||||
|
//
|
||||||
|
// GeometryGather.cpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2020/06/09.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#include "geometry/GeometryComputer.hpp"
|
||||||
|
#include "core/OpCommonUtils.hpp"
|
||||||
|
namespace MNN {
|
||||||
|
|
||||||
|
class GeometryGather : public DefaultGeometryComputer {
|
||||||
|
public:
|
||||||
|
virtual std::vector<bool> onGetOutputVirtual(const Op* op, const std::vector<Tensor*>& inputs,
|
||||||
|
const std::vector<Tensor*>& outputs) const override {
|
||||||
|
MNN_ASSERT(inputs.size() == 2);
|
||||||
|
MNN_ASSERT(1 == outputs.size());
|
||||||
|
auto embedding = inputs[0];
|
||||||
|
auto indices = inputs[1];
|
||||||
|
auto output = outputs[0];
|
||||||
|
|
||||||
|
const int firstDimStride = embedding->buffer().dim[0].stride;
|
||||||
|
if (TensorUtils::getDescribe(indices)->usage == MNN::Tensor::InsideDescribe::CONSTANT && firstDimStride != 0) {
|
||||||
|
std::vector<bool> res(outputs.size(), true);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
return std::vector<bool>(outputs.size(), false);
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||||
|
Context& context, CommandBuffer& res) const override {
|
||||||
|
MNN_ASSERT(2 == inputs.size());
|
||||||
|
MNN_ASSERT(1 == outputs.size());
|
||||||
|
auto embedding = inputs[0];
|
||||||
|
auto indices = inputs[1];
|
||||||
|
auto output = outputs[0];
|
||||||
|
|
||||||
|
const int firstDimStride = embedding->buffer().dim[0].stride;
|
||||||
|
if (TensorUtils::getDescribe(indices)->usage != MNN::Tensor::InsideDescribe::CONSTANT || firstDimStride == 0) {
|
||||||
|
Command cmd;
|
||||||
|
cmd.op = op;
|
||||||
|
cmd.inputs = std::move(inputs);
|
||||||
|
cmd.outputs = std::move(outputs);
|
||||||
|
res.command.emplace_back(std::move(cmd));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto bytes = embedding->buffer().type.bytes();
|
||||||
|
|
||||||
|
const size_t indicesCount = indices->elementSize();
|
||||||
|
const auto limit = embedding->length(0);
|
||||||
|
const int* indicesData = indices->host<int32_t>();
|
||||||
|
|
||||||
|
auto outputDes = TensorUtils::getDescribe(output);
|
||||||
|
outputDes->regions.clear();
|
||||||
|
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
|
||||||
|
for (int i = 0; i < indicesCount; i++) {
|
||||||
|
if (indicesData[i] < 0 || indicesData[i] >= limit) {
MNN_PRINT("Gather index out of range\n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
Tensor::InsideDescribe::Region slice;
|
||||||
|
slice.origin = embedding;
|
||||||
|
slice.size[0] = 1;
|
||||||
|
slice.size[1] = 1;
|
||||||
|
slice.size[2] = firstDimStride;
|
||||||
|
slice.src.offset = firstDimStride * indicesData[i];
|
||||||
|
slice.dst.offset = i * firstDimStride;
|
||||||
|
slice.src.stride[0] = 1;
|
||||||
|
slice.src.stride[1] = 1;
|
||||||
|
slice.src.stride[2] = 1;
|
||||||
|
slice.dst.stride[0] = 1;
|
||||||
|
slice.dst.stride[1] = 1;
|
||||||
|
slice.dst.stride[2] = 1;
|
||||||
|
outputDes->regions.emplace_back(std::move(slice));
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
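// GatherND with constant indices: each output slice i copies mSliceSize contiguous
// elements starting at the flat offset computed from the i-th index tuple, again
// expressed as one Region per slice instead of a runtime kernel.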
class GeometryGatherND : public DefaultGeometryComputer {
|
||||||
|
public:
|
||||||
|
virtual std::vector<bool> onGetOutputVirtual(const Op* op, const std::vector<Tensor*>& inputs,
|
||||||
|
const std::vector<Tensor*>& outputs) const override {
|
||||||
|
MNN_ASSERT(inputs.size() == 2);
|
||||||
|
MNN_ASSERT(1 == outputs.size());
|
||||||
|
auto params = inputs[0];
|
||||||
|
auto indices = inputs[1];
|
||||||
|
auto output = outputs[0];
|
||||||
|
|
||||||
|
int mSliceN = 1;
|
||||||
|
int mSliceSize = 1;
|
||||||
|
for (int i = 0; i < indices->dimensions() - 1; ++i) {
|
||||||
|
mSliceN *= indices->length(i);
|
||||||
|
}
|
||||||
|
auto indiceNd = indices->length(indices->dimensions() - 1);
|
||||||
|
std::vector<int> mDimsToCount;
|
||||||
|
mDimsToCount.resize(indiceNd);
|
||||||
|
for (int i = indiceNd; i < params->dimensions(); ++i) {
|
||||||
|
mSliceSize *= params->length(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (TensorUtils::getDescribe(indices)->usage == MNN::Tensor::InsideDescribe::CONSTANT && mSliceSize != 0) {
|
||||||
|
std::vector<bool> res(outputs.size(), true);
|
||||||
|
return res;
|
||||||
|
} else {
|
||||||
|
std::vector<bool> res(outputs.size(), false);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||||
|
Context& context, CommandBuffer& res) const override {
|
||||||
|
MNN_ASSERT(2 == inputs.size());
|
||||||
|
MNN_ASSERT(1 == outputs.size());
|
||||||
|
auto params = inputs[0];
|
||||||
|
auto indice = inputs[1];
|
||||||
|
auto output = outputs[0];
|
||||||
|
|
||||||
|
int mSliceN = 1;
|
||||||
|
int mSliceSize = 1;
|
||||||
|
for (int i = 0; i < indice->dimensions() - 1; ++i) {
|
||||||
|
mSliceN *= indice->length(i);
|
||||||
|
}
|
||||||
|
auto indiceNd = indice->length(indice->dimensions() - 1);
|
||||||
|
std::vector<int> mDimsToCount;
|
||||||
|
mDimsToCount.resize(indiceNd);
|
||||||
|
for (int i = indiceNd; i < params->dimensions(); ++i) {
|
||||||
|
mSliceSize *= params->length(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (TensorUtils::getDescribe(indice)->usage != MNN::Tensor::InsideDescribe::CONSTANT || mSliceSize == 0) {
|
||||||
|
Command cmd;
|
||||||
|
cmd.op = op;
|
||||||
|
cmd.inputs = std::move(inputs);
|
||||||
|
cmd.outputs = std::move(outputs);
|
||||||
|
res.command.emplace_back(std::move(cmd));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto paramSize = params->elementSize();
|
||||||
|
for (int i = 0; i < indiceNd; ++i) {
|
||||||
|
mDimsToCount[i] = paramSize / params->length(i);
|
||||||
|
paramSize = mDimsToCount[i];
|
||||||
|
}
|
||||||
|
mDimsToCount.resize(indiceNd);
|
||||||
|
auto indiceData = indice->host<int32_t>();
|
||||||
|
|
||||||
|
auto outputDes = TensorUtils::getDescribe(output);
|
||||||
|
outputDes->regions.clear();
|
||||||
|
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
|
||||||
|
for (int i = 0; i < mSliceN; i++) {
|
||||||
|
int fromPos = 0;
|
||||||
|
for (int j = 0; j < indiceNd; ++j) {
|
||||||
|
fromPos += mDimsToCount[j] * indiceData[i * indiceNd + j];
|
||||||
|
}
|
||||||
|
|
||||||
|
Tensor::InsideDescribe::Region slice;
|
||||||
|
slice.origin = params;
|
||||||
|
slice.size[0] = 1;
|
||||||
|
slice.size[1] = 1;
|
||||||
|
slice.size[2] = mSliceSize;
|
||||||
|
slice.src.offset = fromPos;
|
||||||
|
slice.dst.offset = i * mSliceSize;
|
||||||
|
slice.src.stride[0] = 1;
|
||||||
|
slice.src.stride[1] = 1;
|
||||||
|
slice.src.stride[2] = 1;
|
||||||
|
slice.dst.stride[0] = 1;
|
||||||
|
slice.dst.stride[1] = 1;
|
||||||
|
slice.dst.stride[2] = 1;
|
||||||
|
outputDes->regions.emplace_back(std::move(slice));
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
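// GatherV2 gathers along an arbitrary axis: for every (outside, index) pair it emits
// a Region of `inside` contiguous elements when the indices are constant; indices
// that fall out of range are skipped.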
class GeometryGatherV2 : public DefaultGeometryComputer {
|
||||||
|
public:
|
||||||
|
virtual std::vector<bool> onGetOutputVirtual(const Op* op, const std::vector<Tensor*>& inputs,
|
||||||
|
const std::vector<Tensor*>& outputs) const override {
|
||||||
|
MNN_ASSERT(inputs.size() >= 2);
|
||||||
|
MNN_ASSERT(1 == outputs.size());
|
||||||
|
auto params = inputs[0];
|
||||||
|
auto indices = inputs[1];
|
||||||
|
auto output = outputs[0];
|
||||||
|
|
||||||
|
int axis = 0;
|
||||||
|
if (inputs.size() == 3) {
|
||||||
|
const Tensor* axisTensor = inputs[2];
|
||||||
|
axis = axisTensor->host<int32_t>()[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
MNN_ASSERT(axis > -params->buffer().dimensions && axis < params->buffer().dimensions);
|
||||||
|
|
||||||
|
if (axis < 0) {
|
||||||
|
axis = params->buffer().dimensions + axis;
|
||||||
|
}
|
||||||
|
const int gatherDimSize = params->buffer().dim[axis].extent;
|
||||||
|
const int N = indices->elementSize();
|
||||||
|
MNN_ASSERT(gatherDimSize <= std::numeric_limits<int32_t>::max());
|
||||||
|
|
||||||
|
int inside = 1;
|
||||||
|
for (int i = axis + 1; i < params->dimensions(); ++i) {
|
||||||
|
inside *= params->length(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (TensorUtils::getDescribe(indices)->usage == MNN::Tensor::InsideDescribe::CONSTANT && inside != 0) {
|
||||||
|
std::vector<bool> res(outputs.size(), true);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
return std::vector<bool>(outputs.size(), false);
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
|
||||||
|
Context& context, CommandBuffer& res) const override {
|
||||||
|
MNN_ASSERT(inputs.size() >= 2);
|
||||||
|
MNN_ASSERT(1 == outputs.size());
|
||||||
|
auto params = inputs[0];
|
||||||
|
auto indices = inputs[1];
|
||||||
|
auto output = outputs[0];
|
||||||
|
|
||||||
|
int axis = 0;
|
||||||
|
if (inputs.size() == 3) {
|
||||||
|
const Tensor* axisTensor = inputs[2];
|
||||||
|
axis = axisTensor->host<int32_t>()[0];
|
||||||
|
}
|
||||||
|
MNN_ASSERT(axis > -params->buffer().dimensions && axis < params->buffer().dimensions);
|
||||||
|
|
||||||
|
if (axis < 0) {
|
||||||
|
axis = params->buffer().dimensions + axis;
|
||||||
|
}
|
||||||
|
const int gatherDimSize = params->buffer().dim[axis].extent;
|
||||||
|
const int N = indices->elementSize();
|
||||||
|
MNN_ASSERT(gatherDimSize <= std::numeric_limits<int32_t>::max());
|
||||||
|
|
||||||
|
int inside = 1;
|
||||||
|
int outside = 1;
|
||||||
|
for (int i = 0; i < axis; ++i) {
|
||||||
|
outside *= params->length(i);
|
||||||
|
}
|
||||||
|
for (int i = axis + 1; i < params->dimensions(); ++i) {
|
||||||
|
inside *= params->length(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (TensorUtils::getDescribe(indices)->usage != MNN::Tensor::InsideDescribe::CONSTANT || inside == 0) {
|
||||||
|
Command cmd;
|
||||||
|
cmd.op = op;
|
||||||
|
cmd.inputs = std::move(inputs);
|
||||||
|
cmd.outputs = std::move(outputs);
|
||||||
|
res.command.emplace_back(std::move(cmd));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int limit = params->length(axis);
|
||||||
|
auto bytes = output->buffer().type.bytes();
|
||||||
|
const int insideStride = inside;
|
||||||
|
const int outputOutsideStride = inside * N;
|
||||||
|
const int inputOutsideStride = inside * inputs[0]->length(axis);
|
||||||
|
const int* indicesPtr = indices->host<int32_t>();
|
||||||
|
|
||||||
|
auto outputDes = TensorUtils::getDescribe(output);
|
||||||
|
outputDes->regions.clear();
|
||||||
|
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
|
||||||
|
for (int o = 0; o < outside; ++o) {
|
||||||
|
for (int i = 0; i < N; i++) {
|
||||||
|
if (indicesPtr[i] < 0 || indicesPtr[i] >= limit) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Tensor::InsideDescribe::Region slice;
|
||||||
|
slice.origin = params;
|
||||||
|
slice.size[0] = 1;
|
||||||
|
slice.size[1] = 1;
|
||||||
|
slice.size[2] = insideStride;
|
||||||
|
slice.src.offset = inputOutsideStride * o + insideStride * indicesPtr[i];
|
||||||
|
slice.dst.offset = outputOutsideStride * o + i * insideStride;
|
||||||
|
slice.src.stride[0] = 1;
|
||||||
|
slice.src.stride[1] = 1;
|
||||||
|
slice.src.stride[2] = 1;
|
||||||
|
slice.dst.stride[0] = 1;
|
||||||
|
slice.dst.stride[1] = 1;
|
||||||
|
slice.dst.stride[2] = 1;
|
||||||
|
outputDes->regions.emplace_back(std::move(slice));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static void _create() {
|
||||||
|
// std::shared_ptr<GeometryComputer> comp(new GeometryGather);
|
||||||
|
// GeometryComputer::registerGeometryComputer(comp, {OpType_Gather});
|
||||||
|
//
|
||||||
|
// std::shared_ptr<GeometryComputer> comp2(new GeometryGatherND);
|
||||||
|
// GeometryComputer::registerGeometryComputer(comp2, {OpType_GatherND});
|
||||||
|
//
|
||||||
|
// std::shared_ptr<GeometryComputer> comp3(new GeometryGatherV2);
|
||||||
|
// GeometryComputer::registerGeometryComputer(comp3, {OpType_GatherV2});
|
||||||
|
}
|
||||||
|
|
||||||
|
REGISTER_GEOMETRY(GeometryGather, _create);
|
||||||
|
|
||||||
|
} // namespace MNN
|
|
@ -0,0 +1,214 @@
|
||||||
|
//
|
||||||
|
// GeometrySoftmax.cpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2020/06/28.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#include "geometry/GeometryComputer.hpp"
|
||||||
|
#include "core/OpCommonUtils.hpp"
|
||||||
|
#include "geometry/GeometryComputerUtils.hpp"
|
||||||
|
|
||||||
|
namespace MNN {
|
||||||
|
class GeometrySoftmax : public GeometryComputer {
|
||||||
|
public:
|
||||||
|
virtual std::vector<bool> onGetOutputVirtual(const Op* op, const std::vector<Tensor*>& inputs,
|
||||||
|
const std::vector<Tensor*>& outputs) const override {
|
||||||
|
auto axis = op->main_as_Axis()->axis();
|
||||||
|
if (axis < 0) {
|
||||||
|
axis = inputs[0]->dimensions() + axis;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (axis == 1) {
|
||||||
|
return std::vector<bool>(outputs.size(), false);
|
||||||
|
}
|
||||||
|
return std::vector<bool>(outputs.size(), true);
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs,
|
||||||
|
const std::vector<Tensor*>& outputs, Context& context, CommandBuffer& res) const override {
|
||||||
|
MNN_ASSERT(1 == inputs.size());
|
||||||
|
MNN_ASSERT(1 == outputs.size());
|
||||||
|
|
||||||
|
auto input = inputs[0];
|
||||||
|
auto output = outputs[0];
|
||||||
|
auto dims = input->buffer().dimensions;
|
||||||
|
|
||||||
|
auto axis = op->main_as_Axis()->axis();
|
||||||
|
if (axis < 0) {
|
||||||
|
axis = inputs[0]->dimensions() + axis;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (axis == 1) {
|
||||||
|
Command cmd;
|
||||||
|
cmd.op = op;
|
||||||
|
cmd.inputs = std::move(inputs);
|
||||||
|
cmd.outputs = std::move(outputs);
|
||||||
|
res.command.emplace_back(std::move(cmd));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
int inside = 1;
|
||||||
|
int outside = 1;
|
||||||
|
int channel = 1;
|
||||||
|
for (int i = 0; i < axis; ++i) {
|
||||||
|
outside *= input->length(i);
|
||||||
|
}
|
||||||
|
channel = input->length(axis);
|
||||||
|
for (int i = axis + 1; i < dims; ++i) {
|
||||||
|
inside *= input->length(i);
|
||||||
|
}
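
// The commands below implement a numerically stable softmax along `axis`:
//     y = exp(x - max(x)) / sum(exp(x - max(x)))
// decomposed into: reshape to (outside, channel, inside), reduce-max over the
// channel dimension, broadcast, subtract, exp, reduce-sum, broadcast and divide.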
|
||||||
|
|
||||||
|
//input transform to NCHW format
|
||||||
|
std::shared_ptr<Tensor> tmpInput;
|
||||||
|
{
|
||||||
|
tmpInput.reset(Tensor::createDevice<float>({outside, channel, inside}));
|
||||||
|
auto outputDes = TensorUtils::getDescribe(tmpInput.get());
|
||||||
|
outputDes->regions.clear();
|
||||||
|
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
|
||||||
|
|
||||||
|
Tensor::InsideDescribe::Region desReg;
|
||||||
|
desReg.size[0] = outside;
|
||||||
|
desReg.size[1] = channel;
|
||||||
|
desReg.size[2] = inside;
|
||||||
|
desReg.dst.offset = 0;
|
||||||
|
desReg.dst.stride[0] = channel*inside;
|
||||||
|
desReg.dst.stride[1] = inside;
|
||||||
|
desReg.dst.stride[2] = 1;
|
||||||
|
desReg.src.offset = 0;
|
||||||
|
desReg.src.stride[0] = channel*inside;
|
||||||
|
desReg.src.stride[1] = inside;
|
||||||
|
desReg.src.stride[2] = 1;
|
||||||
|
desReg.origin = input;
|
||||||
|
outputDes->regions.emplace_back(std::move(desReg));
|
||||||
|
|
||||||
|
res.extras.emplace_back(tmpInput);
|
||||||
|
}
|
||||||
|
|
||||||
|
//reduction max, axis=1
|
||||||
|
std::shared_ptr<Tensor> maxValue;
|
||||||
|
{
|
||||||
|
maxValue.reset(Tensor::createDevice<float>({outside, 1, inside}));
|
||||||
|
res.extras.emplace_back(maxValue);
|
||||||
|
res.command.emplace_back(GeometryComputerUtils::makeReduce(ReductionType_MAXIMUM, tmpInput.get(), maxValue.get()));
|
||||||
|
}
|
||||||
|
|
||||||
|
//broadcast reduction axis dim
|
||||||
|
std::shared_ptr<Tensor> maxBroadValue;
|
||||||
|
{
|
||||||
|
maxBroadValue.reset(Tensor::createDevice<float>({outside, channel, inside}));
|
||||||
|
auto outputDes = TensorUtils::getDescribe(maxBroadValue.get());
|
||||||
|
outputDes->regions.clear();
|
||||||
|
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
|
||||||
|
|
||||||
|
Tensor::InsideDescribe::Region desReg;
|
||||||
|
desReg.size[0] = outside;
|
||||||
|
desReg.size[1] = channel;
|
||||||
|
desReg.size[2] = inside;
|
||||||
|
desReg.dst.offset = 0;
|
||||||
|
desReg.dst.stride[0] = channel*inside;
|
||||||
|
desReg.dst.stride[1] = inside;
|
||||||
|
desReg.dst.stride[2] = 1;
|
||||||
|
desReg.src.offset = 0;
|
||||||
|
desReg.src.stride[0] = inside;
|
||||||
|
desReg.src.stride[1] = 0;
|
||||||
|
desReg.src.stride[2] = 1;
|
||||||
|
desReg.origin = maxValue.get();
|
||||||
|
outputDes->regions.emplace_back(std::move(desReg));
|
||||||
|
|
||||||
|
res.extras.emplace_back(maxBroadValue);
|
||||||
|
}
|
||||||
|
|
||||||
|
//sub
|
||||||
|
std::shared_ptr<Tensor> subMaxValue;
|
||||||
|
{
|
||||||
|
subMaxValue.reset(Tensor::createDevice<float>({outside, channel, inside}));
|
||||||
|
auto cmd = GeometryComputerUtils::makeBinary(BinaryOpOperation_SUB, tmpInput.get(), maxBroadValue.get(), subMaxValue.get());
|
||||||
|
res.extras.emplace_back(subMaxValue);
|
||||||
|
res.command.emplace_back(std::move(cmd));
|
||||||
|
}
|
||||||
|
//exp
|
||||||
|
std::shared_ptr<Tensor> expValue;
|
||||||
|
{
|
||||||
|
expValue.reset(Tensor::createDevice<float>({outside, channel, inside}));
|
||||||
|
auto cmd = GeometryComputerUtils::makeUnary(UnaryOpOperation_EXP, subMaxValue.get(), expValue.get());
|
||||||
|
res.extras.emplace_back(expValue);
|
||||||
|
res.command.emplace_back(std::move(cmd));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//reduction sum, axis=2, only support NCHW
|
||||||
|
std::shared_ptr<Tensor> sumValue;
|
||||||
|
{
|
||||||
|
sumValue.reset(Tensor::createDevice<float>({outside, 1, inside}));
|
||||||
|
res.extras.emplace_back(sumValue);
|
||||||
|
res.command.emplace_back(GeometryComputerUtils::makeReduce(ReductionType_SUM, expValue.get(), sumValue.get()));
|
||||||
|
}
|
||||||
|
|
||||||
|
//broadcast reduction axis dim
|
||||||
|
std::shared_ptr<Tensor> sumBroadValue;
|
||||||
|
{
|
||||||
|
sumBroadValue.reset(Tensor::createDevice<float>({outside, channel, inside}));
|
||||||
|
auto outputDes = TensorUtils::getDescribe(sumBroadValue.get());
|
||||||
|
outputDes->regions.clear();
|
||||||
|
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
|
||||||
|
|
||||||
|
Tensor::InsideDescribe::Region desReg;
|
||||||
|
desReg.size[0] = outside;
|
||||||
|
desReg.size[1] = channel;
|
||||||
|
desReg.size[2] = inside;
|
||||||
|
desReg.dst.offset = 0;
|
||||||
|
desReg.dst.stride[0] = channel*inside;
|
||||||
|
desReg.dst.stride[1] = inside;
|
||||||
|
desReg.dst.stride[2] = 1;
|
||||||
|
desReg.src.offset = 0;
|
||||||
|
desReg.src.stride[0] = inside;
|
||||||
|
desReg.src.stride[1] = 0;
|
||||||
|
desReg.src.stride[2] = 1;
|
||||||
|
desReg.origin = sumValue.get();
|
||||||
|
outputDes->regions.emplace_back(std::move(desReg));
|
||||||
|
|
||||||
|
res.extras.emplace_back(sumBroadValue);
|
||||||
|
}
|
||||||
|
|
||||||
|
//div
|
||||||
|
std::shared_ptr<Tensor> tmpOutput;
|
||||||
|
{
|
||||||
|
tmpOutput.reset(Tensor::createDevice<float>({outside, channel, inside}));
|
||||||
|
auto cmd = GeometryComputerUtils::makeBinary(BinaryOpOperation_REALDIV, expValue.get(), sumBroadValue.get(), tmpOutput.get());
|
||||||
|
res.extras.emplace_back(tmpOutput);
|
||||||
|
res.command.emplace_back(std::move(cmd));
|
||||||
|
}
|
||||||
|
|
||||||
|
//transform to output
|
||||||
|
{
|
||||||
|
auto outputDes = TensorUtils::getDescribe(output);
|
||||||
|
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
|
||||||
|
Tensor::InsideDescribe::Region desReg;
|
||||||
|
desReg.size[0] = outside;
|
||||||
|
desReg.size[1] = channel;
|
||||||
|
desReg.size[2] = inside;
|
||||||
|
desReg.dst.offset = 0;
|
||||||
|
desReg.dst.stride[0] = channel*inside;
|
||||||
|
desReg.dst.stride[1] = inside;
|
||||||
|
desReg.dst.stride[2] = 1;
|
||||||
|
desReg.src.offset = 0;
|
||||||
|
desReg.src.stride[0] = channel*inside;
|
||||||
|
desReg.src.stride[1] = inside;
|
||||||
|
desReg.src.stride[2] = 1;
|
||||||
|
desReg.origin = tmpOutput.get();
|
||||||
|
outputDes->regions.emplace_back(std::move(desReg));
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static void _create() {
|
||||||
|
// std::shared_ptr<GeometryComputer> comp(new GeometrySoftmax);
|
||||||
|
// GeometryComputer::registerGeometryComputer(comp, {OpType_Softmax});
|
||||||
|
}
|
||||||
|
|
||||||
|
REGISTER_GEOMETRY(GeometrySoftmax, _create);
|
||||||
|
|
||||||
|
} // namespace MNN
|
|
@ -7,7 +7,7 @@ add_executable(benchmarkExprModels.out ${CMAKE_CURRENT_LIST_DIR}/benchmarkExprMo
target_include_directories(benchmarkExprModels.out PRIVATE "${CMAKE_CURRENT_LIST_DIR}/exprModels" ${CMAKE_CURRENT_SOURCE_DIR}/)
target_link_libraries(benchmarkExprModels.out ${MNN_DEPS})

if ((MSVC OR WIN32) AND NOT MNN_BUILD_SHARED_LIBS)
if (MSVC AND NOT MNN_BUILD_SHARED_LIBS)
foreach (DEPEND ${MNN_DEPS})
target_link_options(benchmark.out PRIVATE /WHOLEARCHIVE:$<TARGET_FILE:${DEPEND}>)
target_link_options(benchmarkExprModels.out PRIVATE /WHOLEARCHIVE:$<TARGET_FILE:${DEPEND}>)
@ -124,6 +124,7 @@ std::vector<float> doBench(Model& model, int loop, int warmup = 10, int forward
|
||||||
const auto bufferSize = revertor->getBufferSize();
|
const auto bufferSize = revertor->getBufferSize();
|
||||||
auto net = std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromBuffer(modelBuffer, bufferSize));
|
auto net = std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromBuffer(modelBuffer, bufferSize));
|
||||||
revertor.reset();
|
revertor.reset();
|
||||||
|
net->setSessionMode(MNN::Interpreter::Session_Release);
|
||||||
MNN::ScheduleConfig config;
|
MNN::ScheduleConfig config;
|
||||||
config.numThread = numberThread;
|
config.numThread = numberThread;
|
||||||
config.type = static_cast<MNNForwardType>(forward);
|
config.type = static_cast<MNNForwardType>(forward);
|
||||||
|
|
|
@ -90,6 +90,7 @@ static std::vector<float> runNet(VARP netOutput, const ScheduleConfig& config, i
|
||||||
const void* buf = builder.GetBufferPointer();
|
const void* buf = builder.GetBufferPointer();
|
||||||
size_t size = builder.GetSize();
|
size_t size = builder.GetSize();
|
||||||
std::unique_ptr<Interpreter> net(Interpreter::createFromBuffer(buf, size));
|
std::unique_ptr<Interpreter> net(Interpreter::createFromBuffer(buf, size));
|
||||||
|
net->setSessionMode(MNN::Interpreter::Session_Release);
|
||||||
auto session = net->createSession(config);
|
auto session = net->createSession(config);
|
||||||
net->releaseModel();
|
net->releaseModel();
|
||||||
auto inputTensor = net->getSessionInput(session, NULL);
|
auto inputTensor = net->getSessionInput(session, NULL);
|
||||||
|
|
|
@ -1,84 +0,0 @@
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
major_py_ver = sys.version_info.major
|
|
||||||
|
|
||||||
def convert_string_to_hex_list(code_str):
|
|
||||||
hex_list = []
|
|
||||||
for i in range(len(code_str)):
|
|
||||||
hex_ = hex(ord(code_str[i]))
|
|
||||||
hex_list.append(hex_)
|
|
||||||
return hex_list
|
|
||||||
|
|
||||||
def opencl_codegen():
|
|
||||||
cl_kernel_dir = sys.argv[1]
|
|
||||||
output_path = sys.argv[2]
|
|
||||||
print("Generating OpenCL Kernels in "+cl_kernel_dir+" to "+output_path)
|
|
||||||
if not os.path.exists(cl_kernel_dir):
|
|
||||||
print(cl_kernel_dir + " doesn't exist!")
|
|
||||||
|
|
||||||
#common.h
|
|
||||||
common_header_code = ""
|
|
||||||
#quantized_common.h
|
|
||||||
quantized_common_header_code = ""
|
|
||||||
#activation_common.h
|
|
||||||
activation_common_header_code = ""
|
|
||||||
for file_name in os.listdir(cl_kernel_dir):
|
|
||||||
file_path = os.path.join(cl_kernel_dir, file_name)
|
|
||||||
if file_path[-2:] == ".h" and file_name[:-2] == "quantized_common":
|
|
||||||
with open(file_path, "r") as f:
|
|
||||||
quantized_common_header_code += f.read()
|
|
||||||
elif file_path[-2:] == ".h" and file_name[:-2] == "activation_common":
|
|
||||||
with open(file_path, "r") as f:
|
|
||||||
activation_common_header_code += f.read()
|
|
||||||
|
|
||||||
opencl_code_maps = {}
|
|
||||||
for file_name in os.listdir(cl_kernel_dir):
|
|
||||||
file_path = os.path.join(cl_kernel_dir, file_name)
|
|
||||||
if file_path[-3:] == ".cl":
|
|
||||||
with open(file_path, "r") as f:
|
|
||||||
code_str = ""
|
|
||||||
for line in f.readlines():
|
|
||||||
if "#include <activation_common.h>" in line:
|
|
||||||
code_str += common_header_code
|
|
||||||
code_str += activation_common_header_code
|
|
||||||
elif "#include <quantized_common.h>" in line:
|
|
||||||
code_str += common_header_code
|
|
||||||
code_str += quantized_common_header_code
|
|
||||||
elif "#include <common.h>" in line:
|
|
||||||
code_str += common_header_code
|
|
||||||
else:
|
|
||||||
code_str += line
|
|
||||||
opencl_code_maps[file_name[:-3]] = convert_string_to_hex_list(code_str)
|
|
||||||
|
|
||||||
#source model
|
|
||||||
opencl_source_map = "#include <map> \n"
|
|
||||||
opencl_source_map += "#include <string> \n"
|
|
||||||
opencl_source_map += "#include <vector> \n"
|
|
||||||
opencl_source_map += "namespace MNN { \n"
|
|
||||||
opencl_source_map += "extern const std::map<std::string, std::vector<unsigned char>> OpenCLProgramMap = \n { \n"
|
|
||||||
|
|
||||||
if major_py_ver == 2:
|
|
||||||
items = opencl_code_maps.iteritems()
|
|
||||||
else:
|
|
||||||
items = opencl_code_maps.items()
|
|
||||||
for file_name, file_source in items:
|
|
||||||
opencl_source_map += "{\n \""
|
|
||||||
opencl_source_map += file_name
|
|
||||||
opencl_source_map += "\", \n"
|
|
||||||
opencl_source_map += " { "
|
|
||||||
for source_hex in file_source:
|
|
||||||
opencl_source_map += source_hex
|
|
||||||
opencl_source_map += ","
|
|
||||||
opencl_source_map += " } "
|
|
||||||
opencl_source_map += "\n }, \n"
|
|
||||||
|
|
||||||
opencl_source_map += " }; \n"
|
|
||||||
opencl_source_map += "} \n"
|
|
||||||
|
|
||||||
with open(output_path, "w") as w_file:
|
|
||||||
w_file.write(opencl_source_map)
|
|
||||||
|
|
||||||
print("Generate OpenCL Source done !!! \n")
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
opencl_codegen()
|
|
|
@ -0,0 +1,140 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
. ./parse_options.sh || exit 1;
|
||||||
|
|
||||||
|
CMAKE=cmake
|
||||||
|
MAKE=make
|
||||||
|
ANDROID_NDK=/home/android-ndk-r18b
|
||||||
|
|
||||||
|
BUILD_ROOT=`pwd`
|
||||||
|
|
||||||
|
# Clean the existing directory instead of removing it, to avoid the
# "Current working directory cannot be established" problem.
|
||||||
|
function make_or_clean_dir {
|
||||||
|
if [ -d $1 ]; then
|
||||||
|
rm -rf $1/*
|
||||||
|
else
|
||||||
|
mkdir $1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
function build_arm_android_32 {
|
||||||
|
make_or_clean_dir build_arm_android_32 && cd build_arm_android_32
|
||||||
|
$CMAKE ../.. \
|
||||||
|
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
|
-DANDROID_ABI="armeabi-v7a" \
|
||||||
|
-DANDROID_STL=c++_static \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
|
-DANDROID_NATIVE_API_LEVEL=android-21 \
|
||||||
|
-DANDROID_TOOLCHAIN=clang \
|
||||||
|
-DMNN_USE_LOGCAT=true \
|
||||||
|
-DMNN_BUILD_FOR_ANDROID_COMMAND=true \
|
||||||
|
-DNATIVE_LIBRARY_OUTPUT=. \
|
||||||
|
-DNATIVE_INCLUDE_OUTPUT=. \
|
||||||
|
-DMNN_VULKAN=$USE_VULKAN \
|
||||||
|
-DMNN_OPENCL=$USE_OPENCL \
|
||||||
|
-DMNN_OPENGL=$USE_OPENGL \
|
||||||
|
-DMNN_USE_THREAD_POOL=$USE_THREAD_POOL || exit 1;
|
||||||
|
$MAKE -j $build_threads || exit 1;
|
||||||
|
cd $BUILD_ROOT; true;
|
||||||
|
}
|
||||||
|
|
||||||
|
function build_arm_android_64 {
|
||||||
|
make_or_clean_dir build_arm_android_64 && cd build_arm_android_64
|
||||||
|
$CMAKE ../.. \
|
||||||
|
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
|
-DANDROID_ABI="arm64-v8a" \
|
||||||
|
-DANDROID_STL=c++_static \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
|
-DANDROID_NATIVE_API_LEVEL=android-21 \
|
||||||
|
-DANDROID_TOOLCHAIN=clang \
|
||||||
|
-DMNN_USE_LOGCAT=true \
|
||||||
|
-DMNN_BUILD_FOR_ANDROID_COMMAND=true \
|
||||||
|
-DNATIVE_LIBRARY_OUTPUT=. \
|
||||||
|
-DNATIVE_INCLUDE_OUTPUT=. \
|
||||||
|
-DMNN_ARM82=ON \
|
||||||
|
-DMNN_VULKAN=$USE_VULKAN \
|
||||||
|
-DMNN_OPENCL=$USE_OPENCL \
|
||||||
|
-DMNN_OPENGL=$USE_OPENGL \
|
||||||
|
-DMNN_USE_THREAD_POOL=$USE_THREAD_POOL || exit 1;
|
||||||
|
$MAKE -j $build_threads || exit 1;
|
||||||
|
cd $BUILD_ROOT; true;
|
||||||
|
}
|
||||||
|
|
||||||
|
function build_arm_linux_32 {
|
||||||
|
cd $BUILD_ROOT; true;
|
||||||
|
}
|
||||||
|
|
||||||
|
function build_arm_linux_64 {
|
||||||
|
cd $BUILD_ROOT; true;
|
||||||
|
}
|
||||||
|
|
||||||
|
function build_x86_linux {
|
||||||
|
make_or_clean_dir build_x86_linux && cd build_x86_linux
|
||||||
|
$CMAKE ../.. \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
|
-DMNN_BUILD_TRAIN=ON \
|
||||||
|
-DMNN_SEP_BUILD=OFF \
|
||||||
|
-DMNN_BUILD_DEMO=ON \
|
||||||
|
-DMNN_BUILD_QUANTOOLS=ON \
|
||||||
|
-DMNN_EVALUATION=ON \
|
||||||
|
-DMNN_BUILD_CONVERTER=ON \
|
||||||
|
-DMNN_SUPPORT_TFLITE_QUAN=ON \
|
||||||
|
-DMNN_BUILD_TEST=ON \
|
||||||
|
-DMNN_OPENCL=$USE_OPENCL \
|
||||||
|
-DMNN_VULKAN=$USE_VULKAN \
|
||||||
|
-DMNN_OPENMP=$USE_OPENMP \
|
||||||
|
-DMNN_USE_THREAD_POOL=OFF \
|
||||||
|
-DMNN_BUILD_BENCHMARK=ON || exit 1;
|
||||||
|
$MAKE -j $build_threads || exit 1;
|
||||||
|
cd $BUILD_ROOT; true;
|
||||||
|
}
|
||||||
|
|
||||||
|
function build_all {
|
||||||
|
build_arm_android_32 || exit 1;
|
||||||
|
build_arm_android_64 || exit 1;
|
||||||
|
build_arm_linux_32 || exit 1;
|
||||||
|
build_arm_linux_64 || exit 1;
|
||||||
|
build_x86_linux || exit 1;
|
||||||
|
true;
|
||||||
|
}
|
||||||
|
|
||||||
|
function clean {
|
||||||
|
rm -rf build_arm_android_32
|
||||||
|
rm -rf build_arm_android_64
|
||||||
|
rm -rf build_arm_linux_32
|
||||||
|
rm -rf build_arm_linux_64
|
||||||
|
rm -rf build_x86_linux
|
||||||
|
}
|
||||||
|
|
||||||
|
function build {
|
||||||
|
case $platform in
|
||||||
|
"arm_linux_32")
|
||||||
|
build_arm_linux_32 || exit 1;
|
||||||
|
;;
|
||||||
|
"arm_linux_64")
|
||||||
|
build_arm_linux_64 || exit 1;
|
||||||
|
;;
|
||||||
|
"x86_linux")
|
||||||
|
build_x86_linux || exit 1;
|
||||||
|
;;
|
||||||
|
"arm_android_32")
|
||||||
|
build_arm_android_32 || exit 1;
|
||||||
|
;;
|
||||||
|
"arm_android_64")
|
||||||
|
build_arm_android_64 || exit 1;
|
||||||
|
;;
|
||||||
|
"all")
|
||||||
|
build_all || exit 1;
|
||||||
|
;;
|
||||||
|
*) echo "Invalid platform: $platform" && exit 1;
|
||||||
|
esac
|
||||||
|
}
|
||||||
|
|
||||||
|
if [ $clean == 1 ]; then
|
||||||
|
clean
|
||||||
|
else
|
||||||
|
build $@
|
||||||
|
fi
|
||||||
|
true;
|
|
@ -0,0 +1,113 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
# Valid platform:
|
||||||
|
# - arm_android_32
|
||||||
|
# - arm_android_64
|
||||||
|
# - arm_linux_32
|
||||||
|
# - arm_linux_64
|
||||||
|
# - x86_linux
|
||||||
|
platform="all"
|
||||||
|
|
||||||
|
# Option to build with opencl.
|
||||||
|
use_opencl=0
|
||||||
|
|
||||||
|
# Option to build with opengl.
|
||||||
|
use_opengl=0
|
||||||
|
|
||||||
|
# Option to build with vulkan.
|
||||||
|
use_vulkan=0
|
||||||
|
|
||||||
|
# Option to build with openmp multithreads library.
|
||||||
|
use_openmp=0
|
||||||
|
|
||||||
|
build_threads=1
|
||||||
|
|
||||||
|
# Option to clear the build history.
|
||||||
|
clean=0
|
||||||
|
|
||||||
|
USE_OPENCL=OFF
|
||||||
|
USE_VULKAN=OFF
|
||||||
|
USE_OPENGL=OFF
|
||||||
|
USE_OPENMP=OFF
|
||||||
|
USE_THREAD_POOL=ON
|
||||||
|
|
||||||
|
function print_usage {
|
||||||
|
echo -e "Usgae: ./build.sh"
|
||||||
|
echo -e " --platform=x: Specify build platform x. "
|
||||||
|
echo -e " All valid platforms are \"arm_android_32\", \"arm_android_64\",
|
||||||
|
\"arm_linux_32\", \"arm_linux_64\", \"x86_linux\", \"all\"."
|
||||||
|
echo -e " The default is \"all\"."
|
||||||
|
echo -e " --use_openmp=true|false: Build with openmp or not."
|
||||||
|
echo -e " The default is false."
|
||||||
|
echo -e " --use_opencl=true|false: Build with opencl or not."
|
||||||
|
echo -e " The default is false."
|
||||||
|
echo -e " --use_opengl=true|false: Build with opengl or not."
|
||||||
|
echo -e " The default is false."
|
||||||
|
echo -e " --use_vulkan=true|false: Build with vulkan or not."
|
||||||
|
echo -e " The default is false."
|
||||||
|
echo -e " --job=n: Build with n threads. Default is 1."
|
||||||
|
}
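
# Example invocation (illustrative; these flags are parsed by the loop below and
# consumed by build.sh, which sources this script):
#   ./build.sh --platform=arm_android_64 --use_vulkan=true --job=8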
|
||||||
|
|
||||||
|
function parse_platform {
|
||||||
|
platform=`echo "$1" | awk -F '=' '{print $2}'`
|
||||||
|
}
|
||||||
|
|
||||||
|
function parse_nthreads {
|
||||||
|
build_threads=`echo "$1" | awk -F '=' '{print $2}'`
|
||||||
|
}
|
||||||
|
|
||||||
|
function parse_bool {
|
||||||
|
val=`echo "$1" | awk -F '=' '{print $2}'`
|
||||||
|
if [ $val == "true" ] || [ $val == "1" ]; then
|
||||||
|
return 1;
|
||||||
|
else
|
||||||
|
return 0;
|
||||||
|
fi
|
||||||
|
}
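
# Note: parse_bool reports its result through the exit status, so "1" means true
# here (callers capture it with use_xxx=$?), which is the reverse of the usual
# shell success/failure convention.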
|
||||||
|
|
||||||
|
[ -z "${1:-}" ] && print_usage && exit 1;
|
||||||
|
|
||||||
|
while true; do
|
||||||
|
[ -z "${1:-}" ] && break;
|
||||||
|
case "$1" in
|
||||||
|
--platform=*) parse_platform "$1"; shift 1;
|
||||||
|
;;
|
||||||
|
--use_openmp=*) parse_bool "$1"; use_openmp=$?; shift 1;
|
||||||
|
;;
|
||||||
|
--use_openmp) use_openmp=1; shift 1;
|
||||||
|
;;
|
||||||
|
--use_opencl=*) parse_bool "$1"; use_opencl=$?; shift 1;
|
||||||
|
;;
|
||||||
|
--use_opencl) use_opencl=1; shift 1;
|
||||||
|
;;
|
||||||
|
--use_opengl=*) parse_bool "$1"; use_opengl=$?; shift 1;
|
||||||
|
;;
|
||||||
|
--use_opengl) use_opengl=1; shift 1;
|
||||||
|
;;
|
||||||
|
--use_vulkan=*) parse_bool "$1"; use_vulkan=$?; shift 1;
|
||||||
|
;;
|
||||||
|
--use_vulkan) use_vulkan=1; shift 1;
|
||||||
|
;;
|
||||||
|
--job=*) parse_nthreads "$1"; shift 1;
|
||||||
|
;;
|
||||||
|
clean) clean=1; shift 1;
|
||||||
|
;;
|
||||||
|
*) break;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ $use_opencl == 1 ]; then
|
||||||
|
USE_OPENCL=ON
|
||||||
|
fi
|
||||||
|
if [ $use_opengl == 1 ]; then
|
||||||
|
USE_OPENGL=ON
|
||||||
|
fi
|
||||||
|
if [ $use_vulkan == 1 ]; then
|
||||||
|
USE_VULKAN=ON
|
||||||
|
fi
|
||||||
|
if [ $use_openmp == 1 ]; then
|
||||||
|
USE_OPENMP=ON
|
||||||
|
USE_THREAD_POOL=OFF
|
||||||
|
fi
|
||||||
|
|
||||||
|
true;
|
|
@ -0,0 +1,3 @@
call "C:/Program Files (x86)/Microsoft Visual Studio/2017/BuildTools/VC/Auxiliary/Build/vcvars64.bat"
cmake -G "Ninja" -DCMAKE_BUILD_TYPE=Release ..
ninja

@ -0,0 +1,3 @@
call "C:/Program Files (x86)/Microsoft Visual Studio/2017/BuildTools/VC/Auxiliary/Build/vcvars32.bat"
cmake -G "Ninja" -DCMAKE_BUILD_TYPE=Release ..
ninja
@ -12,3 +12,9 @@ target_link_libraries(segment.out ${MNN_DEPS})

add_executable(expressDemo.out ${CMAKE_CURRENT_LIST_DIR}/expressDemo.cpp)
target_link_libraries(expressDemo.out ${MNN_DEPS})

add_executable(transformerDemo.out ${CMAKE_CURRENT_LIST_DIR}/transformerDemo.cpp)
target_link_libraries(transformerDemo.out ${MNN_DEPS})

add_executable(rasterDemo.out ${CMAKE_CURRENT_LIST_DIR}/rasterDemo.cpp)
target_link_libraries(rasterDemo.out ${MNN_DEPS})
@ -53,7 +53,6 @@ int main(int argc, const char* argv[]) {
|
||||||
MNN_ERROR("Output Not valid\n");
|
MNN_ERROR("Output Not valid\n");
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
auto size = outputInfo->size;
|
|
||||||
//Test Speed
|
//Test Speed
|
||||||
if (testTime > 0){
|
if (testTime > 0){
|
||||||
//Let the frequence up
|
//Let the frequence up
|
||||||
|
@ -82,6 +81,7 @@ int main(int argc, const char* argv[]) {
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
auto size = outputInfo->size;
|
||||||
auto outputPtr = output->readMap<float>();
|
auto outputPtr = output->readMap<float>();
|
||||||
if (nullptr == outputPtr) {
|
if (nullptr == outputPtr) {
|
||||||
MNN_ERROR("Output Not valid read error\n");
|
MNN_ERROR("Output Not valid read error\n");
|
||||||
|
|
|
@ -0,0 +1,251 @@
|
||||||
|
//
|
||||||
|
// rasterDemo.cpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2020/10/14.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <iostream>
|
||||||
|
#include <chrono>
|
||||||
|
#include <MNN/MNNDefine.h>
|
||||||
|
#include <MNN/Tensor.hpp>
|
||||||
|
#include <MNN/Interpreter.hpp>
|
||||||
|
#include "MNN_generated.h"
|
||||||
|
#include "core/TensorUtils.hpp"
|
||||||
|
#include "core/Execution.hpp"
|
||||||
|
#include "core/Backend.hpp"
|
||||||
|
#include "rapidjson/document.h"
|
||||||
|
#include "rapidjson/stringbuffer.h"
|
||||||
|
#include "rapidjson/writer.h"
|
||||||
|
using namespace MNN;
|
||||||
|
/*
|
||||||
|
1. Raster performs the index mapping below:
|
||||||
|
|
||||||
|
for (region : regions)
|
||||||
|
src = region.src, dst = region.dst;
|
||||||
|
for (i = 0 -> size[0])
|
||||||
|
for (j = 0 -> size[1])
|
||||||
|
for (k = 0 -> size[2])
|
||||||
|
output[dst.offset + i * dst.stride[0] + j * dst.stride[1] + k * dst.stride[2]] =
|
||||||
|
region.origin[src.offset + i * src.stride[0] + j * src.stride[1] + k * src.stride[2]];
|
||||||
|
|
||||||
|
2. A Raster Op has one input and one output, but the input is not a real input tensor: it is a
middle tensor of VIRTUAL type whose regions point to the real input tensors, as below.
|
||||||
|
|
||||||
|
input_0 --> region_0 --\
|
||||||
|
\
|
||||||
|
input_1 --> region_1 ---- middle ----> output
|
||||||
|
/
|
||||||
|
input_2 --> region_2 --/
|
||||||
|
|
||||||
|
3. This example reads a JSON file, constructs the corresponding Raster executions and runs them.
The input JSON file format is as below:
|
||||||
|
{
|
||||||
|
"inputs" : [
|
||||||
|
{
|
||||||
|
"id" : int,
|
||||||
|
"type" : "type_name", // float or int
|
||||||
|
"dims" : [int],
|
||||||
|
"data" : [int/float] // if null, fill with random number
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"outputs" : [
|
||||||
|
// same with inputs
|
||||||
|
],
|
||||||
|
"regions" : [
|
||||||
|
{
|
||||||
|
"id" : int, // points to outputs
|
||||||
|
"size" : [int],
|
||||||
|
"src" : {
|
||||||
|
"offset" : int,
|
||||||
|
"stride" : [int]
|
||||||
|
},
|
||||||
|
"dst" : { // same with src },
|
||||||
|
"origin" : int // point to inputs
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
*/
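//
// As an illustration only (not taken from the repository's test data), a minimal
// JSON in the format above could look like this; the single region transposes a
// 2x2 float input into the output via strides:
//
//   {
//     "inputs"  : [ { "id" : 0, "type" : "float", "dims" : [1, 1, 2, 2],
//                     "data" : [1.0, 2.0, 3.0, 4.0] } ],
//     "outputs" : [ { "id" : 0, "type" : "float", "dims" : [1, 1, 2, 2], "data" : [] } ],
//     "regions" : [ { "id" : 0, "size" : [1, 2, 2],
//                     "src" : { "offset" : 0, "stride" : [4, 1, 2] },
//                     "dst" : { "offset" : 0, "stride" : [4, 2, 1] },
//                     "origin" : 0 } ]
//   }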
|
||||||
|
|
||||||
|
static std::string runRaster(std::string jsonString, int runNum) {
|
||||||
|
srand(0);
|
||||||
|
rapidjson::Document document;
|
||||||
|
document.Parse(jsonString.c_str());
|
||||||
|
if (document.HasParseError()) {
|
||||||
|
MNN_ERROR("Invalid Json Format!\n");
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
// prepare CPU backend
|
||||||
|
ScheduleConfig config;
|
||||||
|
config.type = MNN_FORWARD_CPU;
|
||||||
|
BackendConfig backendConfig;
|
||||||
|
backendConfig.precision = BackendConfig::Precision_High;
|
||||||
|
config.backendConfig = &backendConfig;
|
||||||
|
Backend::Info compute;
|
||||||
|
compute.type = config.type;
|
||||||
|
compute.numThread = config.numThread;
|
||||||
|
compute.user = config.backendConfig;
|
||||||
|
const RuntimeCreator* runtimeCreator(MNNGetExtraRuntimeCreator(compute.type));
|
||||||
|
std::unique_ptr<Runtime> runtime(runtimeCreator->onCreate(compute));
|
||||||
|
std::unique_ptr<Backend> backend(runtime->onCreate());
|
||||||
|
|
||||||
|
// build Op
|
||||||
|
std::unique_ptr<OpT> opt(new OpT);
|
||||||
|
opt->type = OpType_Raster;
|
||||||
|
flatbuffers::FlatBufferBuilder builder(1024);
|
||||||
|
builder.ForceDefaults(true);
|
||||||
|
auto len = Op::Pack(builder, opt.get());
|
||||||
|
builder.Finish(len);
|
||||||
|
auto buffer = builder.GetBufferPointer();
|
||||||
|
const Op* op = flatbuffers::GetMutableRoot<Op>(buffer);
|
||||||
|
// build tensors (NCHW) from json
|
||||||
|
std::vector<std::unique_ptr<Tensor>> inputs;
|
||||||
|
std::vector<std::unique_ptr<Tensor>> outputs;
|
||||||
|
auto readTensors = [&document, &backend](std::vector<std::unique_ptr<Tensor>>& tensors, const char* type) {
|
||||||
|
if (document.HasMember(type)) {
|
||||||
|
auto info = document[type].GetArray();
|
||||||
|
tensors.resize(info.Size());
|
||||||
|
for (auto iter = info.begin(); iter != info.end(); iter++) {
|
||||||
|
auto obj = iter->GetObject();
|
||||||
|
int id = obj["id"].GetInt();
|
||||||
|
tensors[id].reset(new Tensor(4));
|
||||||
|
auto tensor = tensors[id].get();
|
||||||
|
auto dataType = obj["type"].GetString();
|
||||||
|
bool isFloat = !strcmp(dataType, "float");
|
||||||
|
tensor->setType(isFloat ? DataType_DT_FLOAT : DataType_DT_INT32);
|
||||||
|
auto dims = obj["dims"].GetArray();
|
||||||
|
for (auto d = dims.begin(); d != dims.end(); d++) {
|
||||||
|
tensor->setLength(d - dims.begin(), d->GetInt());
|
||||||
|
}
|
||||||
|
TensorUtils::setLinearLayout(tensor);
|
||||||
|
backend->onAcquireBuffer(tensor, Backend::STATIC);
|
||||||
|
TensorUtils::getDescribe(tensor)->backend = backend.get();
|
||||||
|
auto data = obj["data"].GetArray();
|
||||||
|
if (!strcmp(type, "inputs")) {
|
||||||
|
bool hasData = data.Size() == tensor->elementSize();
|
||||||
|
auto dataIter = data.begin();
|
||||||
|
for (int i = 0; i < tensor->elementSize(); i++, dataIter++) {
|
||||||
|
if (isFloat) {
|
||||||
|
tensor->host<float>()[i] = hasData ? dataIter->GetFloat() : rand() % 10 / 10.0;
|
||||||
|
} else {
|
||||||
|
tensor->host<int>()[i] = hasData ? dataIter->GetInt() : rand() % 10;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
readTensors(inputs, "inputs");
|
||||||
|
readTensors(outputs, "outputs");
|
||||||
|
|
||||||
|
// build middle tensors' region info from json
|
||||||
|
std::vector<std::unique_ptr<Tensor>> middles;
|
||||||
|
middles.resize(outputs.size());
|
||||||
|
if (document.HasMember("regions")) {
|
||||||
|
auto info = document["regions"].GetArray();
|
||||||
|
for (auto iter = info.begin(); iter != info.end(); iter++) {
|
||||||
|
auto obj = iter->GetObject();
|
||||||
|
int id = obj["id"].GetInt();
|
||||||
|
if (middles[id] == nullptr) {
|
||||||
|
middles[id].reset(new Tensor(4));
|
||||||
|
}
|
||||||
|
auto des = TensorUtils::getDescribe(middles[id].get());
|
||||||
|
des->memoryType = MNN::Tensor::InsideDescribe::MEMORY_VIRTUAL;
|
||||||
|
Tensor::InsideDescribe::Region region;
|
||||||
|
int origin = obj["origin"].GetInt();
|
||||||
|
region.origin = inputs[origin].get();
|
||||||
|
auto size = obj["size"].GetArray();
|
||||||
|
auto src = obj["src"].GetObject();
|
||||||
|
auto dst = obj["dst"].GetObject();
|
||||||
|
auto srcStride = src["stride"].GetArray();
|
||||||
|
auto dstStride = dst["stride"].GetArray();
|
||||||
|
for (int i = 0; i < 3; i++) {
|
||||||
|
region.size[i] = size[i].GetInt();
|
||||||
|
region.src.stride[i] = srcStride[i].GetInt();
|
||||||
|
region.dst.stride[i] = dstStride[i].GetInt();
|
||||||
|
}
|
||||||
|
region.src.offset = src["offset"].GetInt();
|
||||||
|
region.dst.offset = dst["offset"].GetInt();
|
||||||
|
des->regions.push_back(region);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// build execution of Raster and run them
|
||||||
|
for (int i = 0; i < outputs.size(); i++) {
|
||||||
|
std::vector<Tensor*> ins = {middles[i].get()}, outs = {outputs[i].get()};
|
||||||
|
std::unique_ptr<Execution> exe(backend->onCreate(ins, outs, op));
|
||||||
|
exe->onResize(ins, outs);
|
||||||
|
auto t1 = std::chrono::high_resolution_clock::now();
|
||||||
|
for (int j = 0; j < runNum; j++) {
|
||||||
|
exe->onExecute(ins, outs);
|
||||||
|
}
|
||||||
|
auto t2 = std::chrono::high_resolution_clock::now();
|
||||||
|
std::chrono::duration<double> time_span = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
|
||||||
|
double time = time_span.count() * 1000.0 / runNum;
|
||||||
|
printf("For output_id = %d, run %d times, the average time is %f ms.\n", i, runNum, time);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto writeTensors = [&document](std::vector<std::unique_ptr<Tensor>>& tensors, const char* type) {
|
||||||
|
auto info = document[type].GetArray();
|
||||||
|
for (auto iter = info.begin(); iter != info.end(); iter++) {
|
||||||
|
auto obj = iter->GetObject();
|
||||||
|
int id = obj["id"].GetInt();
|
||||||
|
auto data = obj["data"].GetArray();
|
||||||
|
if (data.Size() == tensors[id]->elementSize()) {
|
||||||
|
// already has data, don't overwrite
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
bool isFloat = !strcmp(obj["type"].GetString(), "float");
|
||||||
|
data.Reserve(tensors[id]->elementSize(), document.GetAllocator());
|
||||||
|
for (int i = 0; i < tensors[id]->elementSize(); i++) {
|
||||||
|
if (isFloat) {
|
||||||
|
data.PushBack(tensors[id]->host<float>()[i], document.GetAllocator());
|
||||||
|
} else {
|
||||||
|
data.PushBack(tensors[id]->host<int>()[i], document.GetAllocator());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
writeTensors(inputs, "inputs");
|
||||||
|
writeTensors(outputs, "outputs");
|
||||||
|
rapidjson::StringBuffer stringBuffer;
|
||||||
|
rapidjson::Writer<rapidjson::StringBuffer> writer(stringBuffer);
|
||||||
|
document.Accept(writer);
|
||||||
|
return stringBuffer.GetString();
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, const char* argv[]) {
|
||||||
|
if (argc < 2) {
|
||||||
|
printf("Usage: ./rasterDemo.out input.json [output.json] [runNum]\ndefault output is input, and default runNum is 100.\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
const char* inputFile = argv[1];
|
||||||
|
const char* outputFile = argv[1];
|
||||||
|
int runNum = 100;
|
||||||
|
if (argc >= 3) {
|
||||||
|
outputFile = argv[2];
|
||||||
|
}
|
||||||
|
if (argc >= 4) {
|
||||||
|
runNum = ::atoi(argv[3]);
|
||||||
|
}
|
||||||
|
std::ifstream in(inputFile);
|
||||||
|
if (in.fail()) {
|
||||||
|
printf("Invalid input Json File!\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
std::ofstream out(outputFile);
|
||||||
|
if (out.fail()) {
|
||||||
|
printf("Invalid output Json File!\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << in.rdbuf();
|
||||||
|
out << runRaster(ss.str(), runNum);
|
||||||
|
out.close();
|
||||||
|
printf("Run Raster Done!\n");
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -0,0 +1,60 @@
|
||||||
|
#include <MNN/expr/Module.hpp>
|
||||||
|
#define MNN_OPEN_TIME_TRACE
|
||||||
|
#include <MNN/AutoTime.hpp>
|
||||||
|
#include <MNN/expr/ExprCreator.hpp>
|
||||||
|
#include <MNN/expr/Executor.hpp>
|
||||||
|
#include <fstream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
using namespace MNN::Express;
|
||||||
|
using namespace MNN;
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
int main(int argc, const char* argv[]) {
|
||||||
|
if (argc < 2) {
|
||||||
|
MNN_ERROR("Don't has model name\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
BackendConfig config;
|
||||||
|
//Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 4);
|
||||||
|
auto modelName = argv[1];
|
||||||
|
std::shared_ptr<Module> model;
|
||||||
|
model.reset(Module::load({"NmtModel/Placeholder", "NmtModel/Placeholder_1"}, {"NmtModel/transpose_2"}, modelName));
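// The input/output tensor names above are specific to the example NmtModel used by
// this demo; replace them when loading a different model.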
|
||||||
|
std::vector<int> input0 = {32,16,234,3215,61,135,29,10,24317,4661,4,0};
|
||||||
|
std::vector<int> input1 = {1,1,1,1,1,1,1,1,1,1,1,1};
|
||||||
|
auto first = _Input({1, (int)input0.size()}, NHWC, halide_type_of<int>());
|
||||||
|
::memcpy(first->writeMap<int>(), input0.data(), input0.size() * sizeof(int));
|
||||||
|
auto second = _Input({1, (int)input1.size()}, NHWC, halide_type_of<int>());
|
||||||
|
::memcpy(second->writeMap<int>(), input1.data(), input1.size() * sizeof(int));
|
||||||
|
std::vector<VARP> outputs;
|
||||||
|
for (int i = 0; i < 2; ++i) {
|
||||||
|
{
|
||||||
|
AUTOTIME;
|
||||||
|
Executor::getGlobalExecutor()->resetProfile();
|
||||||
|
outputs = model->onForward({first, second});
|
||||||
|
Executor::getGlobalExecutor()->dumpProfile();
|
||||||
|
}
|
||||||
|
std::ostringstream fileNameOs;
|
||||||
|
std::ostringstream dimInfo;
|
||||||
|
fileNameOs << i << "_output.txt";
|
||||||
|
auto info = outputs[0]->getInfo();
|
||||||
|
for (int d=0; d<info->dim.size(); ++d) {
|
||||||
|
dimInfo << info->dim[d] << "_";
|
||||||
|
}
|
||||||
|
auto fileName = fileNameOs.str();
|
||||||
|
MNN_PRINT("Output Name: %s, Dim: %s\n", fileName.c_str(), dimInfo.str().c_str());
|
||||||
|
auto ptr = outputs[0]->readMap<int>();
|
||||||
|
std::ofstream outputOs(fileName.c_str());
|
||||||
|
for (int i=0; i<info->size; ++i) {
|
||||||
|
outputOs << ptr[i] << "\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int i = 0; i < 10; ++i) {
|
||||||
|
AUTOTIME;
|
||||||
|
outputs = model->onForward({first, second});
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -53,27 +53,23 @@ static int CompareElements(const LabeledElement *a, const LabeledElement *b) {
|
||||||
if (!_net || !_session) {
|
if (!_net || !_session) {
|
||||||
return nil;
|
return nil;
|
||||||
}
|
}
|
||||||
|
MNN::Tensor *output = _net->getSessionOutput(_session, nullptr);
|
||||||
|
MNN::Tensor copy(output);
|
||||||
|
auto input = _net->getSessionInput(_session, nullptr);
|
||||||
|
MNN::Tensor tensorCache(input);
|
||||||
|
input->copyToHostTensor(&tensorCache);
|
||||||
|
|
||||||
// run
|
// run
|
||||||
NSTimeInterval begin = NSDate.timeIntervalSinceReferenceDate;
|
NSTimeInterval begin = NSDate.timeIntervalSinceReferenceDate;
|
||||||
// you should set input data for each inference
|
// you should set input data for each inference
|
||||||
if (cycles == 1) {
|
|
||||||
_net->runSession(_session);
|
|
||||||
} else {
|
|
||||||
auto input = _net->getSessionInput(_session, nullptr);
|
|
||||||
MNN::Tensor tensorCache(input);
|
|
||||||
input->copyToHostTensor(&tensorCache);
|
|
||||||
for (int i = 0; i < cycles; i++) {
|
for (int i = 0; i < cycles; i++) {
|
||||||
input->copyFromHostTensor(&tensorCache);
|
input->copyFromHostTensor(&tensorCache);
|
||||||
_net->runSession(_session);
|
_net->runSession(_session);
|
||||||
}
|
output->copyToHostTensor(©);
|
||||||
}
|
}
|
||||||
NSTimeInterval cost = NSDate.timeIntervalSinceReferenceDate - begin;
|
NSTimeInterval cost = NSDate.timeIntervalSinceReferenceDate - begin;
|
||||||
|
|
||||||
// result
|
// result
|
||||||
MNN::Tensor *output = _net->getSessionOutput(_session, nullptr);
|
|
||||||
MNN::Tensor copy(output);
|
|
||||||
output->copyToHostTensor(©);
|
|
||||||
float *data = copy.host<float>();
|
float *data = copy.host<float>();
|
||||||
LabeledElement objects[1000];
|
LabeledElement objects[1000];
|
||||||
for (int i = 0; i < 1000; i++) {
|
for (int i = 0; i < 1000; i++) {
|
||||||
|
|
|
@ -1,14 +1,21 @@
|
||||||
file(GLOB_RECURSE MNN_EXPR_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.cpp")
|
file(GLOB_RECURSE MNN_EXPR_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.*")
|
||||||
option(MNN_EXPR_ENABLE_PROFILER "Support profile Expr's op cost" OFF)
|
option(MNN_EXPR_ENABLE_PROFILER "Support profile Expr's op cost" OFF)
|
||||||
|
option(MNN_EXPR_SHAPE_EAGER "Force compute Expr's shape directly cost" OFF)
|
||||||
IF (MNN_EXPR_ENABLE_PROFILER)
|
IF (MNN_EXPR_ENABLE_PROFILER)
|
||||||
add_definitions(-DMNN_EXPR_ENABLE_PROFILER)
|
add_definitions(-DMNN_EXPR_ENABLE_PROFILER)
|
||||||
ENDIF()
|
ENDIF()
|
||||||
|
IF (MNN_EXPR_SHAPE_EAGER)
|
||||||
|
add_definitions(-DMNN_EXPR_SHAPE_EAGER)
|
||||||
|
ENDIF()
|
||||||
IF(MNN_SEP_BUILD)
|
IF(MNN_SEP_BUILD)
|
||||||
if (MNN_BUILD_FOR_ANDROID_COMMAND)
|
if (MNN_BUILD_FOR_ANDROID_COMMAND)
|
||||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../")
|
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../")
|
||||||
endif()
|
endif()
|
||||||
add_library(MNN_Express SHARED ${MNN_EXPR_SRCS})
|
add_library(MNN_Express SHARED ${MNN_EXPR_SRCS})
|
||||||
target_link_libraries(MNN_Express MNN)
|
target_link_libraries(MNN_Express MNN)
|
||||||
|
if (MNN_BUILD_MINI)
|
||||||
|
target_link_libraries(MNN_Express $<TARGET_OBJECTS:MNNTransform>)
|
||||||
|
endif()
|
||||||
ELSE()
|
ELSE()
|
||||||
add_library(MNNExpress OBJECT ${MNN_EXPR_SRCS})
|
add_library(MNNExpress OBJECT ${MNN_EXPR_SRCS})
|
||||||
ENDIF()
|
ENDIF()
|
||||||
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
//
|
||||||
|
// Distributions.cpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2019/11/28.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#include "Distributions.hpp"
|
||||||
|
#include <cmath>
|
||||||
|
|
||||||
|
namespace MNN {
|
||||||
|
namespace Express {
|
||||||
|
|
||||||
|
void Distributions::uniform(const int count, const float min, const float max, float *r, std::mt19937 gen) {
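// std::uniform_real_distribution samples from [a, b); nextafter(max, ...) makes the
// requested upper bound inclusive.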
|
||||||
|
std::uniform_real_distribution<float> dis(min, std::nextafter(max, std::numeric_limits<float>::max()));
|
||||||
|
for (int i = 0; i < count; i++) {
|
||||||
|
r[i] = dis(gen);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Distributions::gaussian(const int count, const float mu, const float sigma, float *r, std::mt19937 gen) {
|
||||||
|
std::normal_distribution<float> dis(mu, sigma);
|
||||||
|
for (int i = 0; i < count; i++) {
|
||||||
|
r[i] = dis(gen);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace Express
|
||||||
|
} // namespace MNN
|
|
@ -0,0 +1,27 @@
|
||||||
|
//
|
||||||
|
// Distributions.hpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2019/11/28.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#ifndef Distributions_hpp
|
||||||
|
#define Distributions_hpp
|
||||||
|
|
||||||
|
#include <MNN/MNNDefine.h>
|
||||||
|
#include <random>
|
||||||
|
|
||||||
|
namespace MNN {
|
||||||
|
namespace Express {
|
||||||
|
|
||||||
|
class Distributions {
|
||||||
|
public:
|
||||||
|
static void uniform(const int count, const float min, const float max, float* r, std::mt19937 gen);
|
||||||
|
static void gaussian(const int count, const float mu, const float sigma, float* r, std::mt19937 gen);
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace Express
|
||||||
|
} // namespace MNN
|
||||||
|
|
||||||
|
#endif // Distributions_hpp
|
File diff suppressed because it is too large
|
@ -0,0 +1,45 @@
|
||||||
|
//
|
||||||
|
// ExecutorScope.cpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2020/10/26.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#include <thread>
|
||||||
|
#include <MNN/expr/Executor.hpp>
|
||||||
|
#include <MNN/expr/Scope.hpp>
|
||||||
|
#include <MNN/expr/ExecutorScope.hpp>
|
||||||
|
|
||||||
|
namespace MNN {
|
||||||
|
namespace Express {
|
||||||
|
|
||||||
|
typedef std::shared_ptr<Express::Executor> ExecutorRef;
|
||||||
|
#if !defined(__APPLE__)
|
||||||
|
thread_local static Scope<ExecutorRef> g_executor_scope;
|
||||||
|
#else
|
||||||
|
static Scope<ExecutorRef> g_executor_scope;
|
||||||
|
#endif
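
// The executor scope is kept in thread-local storage where available; on Apple
// platforms a single process-wide scope object is used instead.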
|
||||||
|
|
||||||
|
ExecutorScope::ExecutorScope(const std::shared_ptr<Executor>& current) {
|
||||||
|
g_executor_scope.EnterScope(current);
|
||||||
|
}
|
||||||
|
|
||||||
|
ExecutorScope::ExecutorScope(const std::string& scope_name,
|
||||||
|
const std::shared_ptr<Executor>& current) {
|
||||||
|
g_executor_scope.EnterScope(scope_name, current);
|
||||||
|
}
|
||||||
|
|
||||||
|
ExecutorScope::~ExecutorScope() {
|
||||||
|
g_executor_scope.ExitScope();
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::shared_ptr<Executor> ExecutorScope::Current() {
|
||||||
|
if (g_executor_scope.ScopedLevel() > 0) {
|
||||||
|
return g_executor_scope.Current().content;
|
||||||
|
}
|
||||||
|
return Executor::getGlobalExecutor();
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace Express
|
||||||
|
} // namespace MNN
|
261
express/Expr.cpp
|
@ -8,23 +8,33 @@
#define FLATBUFFERS_PREFER_PRINTF
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include <map>
#include "core/MNNMemoryUtils.h"
#include "Utils.hpp"
#include <map>
#include "core/FileLoader.hpp"
#include <MNN/expr/Executor.hpp>
#include "core/TensorUtils.hpp"
#include "MNN_generated.h"
//#define MNN_OPEN_TIME_TRACE
#include "MNN/AutoTime.hpp"
#include "MNN/expr/ExecutorScope.hpp"

//#define MNN_EXPRESS_ERROR_REPORT
static inline std::string numberToString(int index) {
    char s[10];
    snprintf(s, 10, "%d", index);
    return std::string(s);
}

static bool HasUnknownDim(const std::vector<int>& dims) {
    for (const int& dim : dims) {
        if (dim < 0) {
            return true;
        }
    }
    return false;
}

namespace MNN {
namespace Express {
void Variable::Info::syncSize() {
@ -87,8 +97,7 @@ bool VARP::fix(VARP::InputType type) const {
}

Expr::Expr(int outputSize) {
    mInside.reset(new Inside);
    mInside.reset(new Inside(outputSize));
    mInside->mOutputInfos.resize(outputSize);
    mOutputNames.resize(outputSize);
}

@ -117,27 +126,46 @@ void Expr::_addLinkForInputs(EXPRP expr) {
        }
    }
}
EXPRP Expr::create(Variable::Info&& info) {
EXPRP Expr::create(Variable::Info&& info, const void* ptr, VARP::InputType type, bool copy) {
    EXPRP expr(new Expr(1));
    expr->mOp = nullptr;
    auto originPtr = info.ptr;
    auto originPtr = ptr;
    expr->mInside->mOutputInfos[0] = std::move(info);
    auto& dstInfo = expr->mInside->mOutputInfos[0];
    dstInfo.syncSize();
    if (dstInfo.size > 0) {
        expr->mExtraBuffer.reset(new char[dstInfo.size * dstInfo.type.bytes()], std::default_delete<char[]>());
        expr->mInside->mOutputInfos[0].ptr = expr->mExtraBuffer.get();
        expr->mInside->mInfoDirty = false;
    dstInfo.syncSize();
    Utils::copyInfoToTensor(expr->mInside->mOutputTensors[0], expr->mInside->mOutputInfos.data());
    expr->mType = type;
    if (type == VARP::CONSTANT) {
        TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->usage = Tensor::InsideDescribe::CONSTANT;
    } else if (type == VARP::INPUT) {
        TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->usage = Tensor::InsideDescribe::INPUT;
    } else {
        expr->mInside->mOutputInfos[0].ptr = nullptr;
        // VARP::TRAINABLE
        expr->mInside->mInfoDirty = true;
        TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->usage = Tensor::InsideDescribe::TRAINABLE;
    }
    if (dstInfo.size > 0 && copy) {
        auto res = Utils::allocMemoryForHostTensor(expr->mInside->mOutputTensors[0]);
        if (!res) {
            MNN_ASSERT(false);
            return nullptr;
        }
    } else {
        expr->mInside->mOutputTensors[0]->buffer().host = nullptr;
    }
    if (nullptr == originPtr) {
        expr->mType = VARP::INPUT;
        if (type == VARP::INPUT && dstInfo.size > 0) {
            expr->mInside->mContentDirty = true;
        }
        return expr;
    }
    expr->mType = VARP::CONSTANT;
    expr->mInside->mContentDirty = false;
    ::memcpy(expr->mInside->mOutputInfos[0].ptr, originPtr, dstInfo.size * dstInfo.type.bytes());
    if (copy) {
        ::memcpy(expr->mInside->mOutputTensors[0]->buffer().host, originPtr, dstInfo.size * dstInfo.type.bytes());
    } else {
        TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->memoryType = Tensor::InsideDescribe::MEMORY_OUTSIDE;
        expr->mInside->mOutputTensors[0]->buffer().host = (uint8_t*)originPtr;
    }
    return expr;
}
EXPRP Expr::create(std::pair<std::shared_ptr<char>, int> extra, std::vector<VARP>&& inputs, int outputSize) {
@ -147,8 +175,7 @@ EXPRP Expr::create(std::pair<std::shared_ptr<char>, int> extra, std::vector<VARP
    expr->mOp = flatbuffers::GetMutableRoot<Op>(extra.first.get());
    expr->mOpBufferSize = extra.second;
    expr->mInputs = std::move(inputs);
    expr->mInside->mInputInfos.resize(expr->mInputs.size());
    expr->mInside->mReq = ExecutorScope::Current()->getRequirement(expr.get());
    expr->mInside->mReq = Executor::getGlobalExecutor()->getRequirement(expr.get());
    _addLinkForInputs(expr);
    return expr;
}
@ -161,34 +188,34 @@ EXPRP Expr::create(const OpT* op, std::vector<VARP> inputs, int outputSize) {
            info.dim[0] = 1;
        }
        info.order = Utils::revertFormat(op->main.AsInput()->dformat);
        info.ptr = nullptr;
        info.type = Utils::revertDataType(op->main.AsInput()->dtype);
        return create(std::move(info));
        return create(std::move(info), nullptr, VARP::INPUT);
    }
    if (OpType_Const == op->type || OpType_TrainableParam == op->type) {
        Variable::Info info;
        info.dim = op->main.AsBlob()->dims;
        info.order = Utils::revertFormat(op->main.AsBlob()->dataFormat);
        info.ptr = nullptr;
        void* ptr = nullptr;
        info.type = Utils::revertDataType(op->main.AsBlob()->dataType);
        switch (op->main.AsBlob()->dataType) {
            case DataType_DT_INT8:
                info.ptr = (void*)op->main.AsBlob()->int8s.data();
                ptr = (void*)op->main.AsBlob()->int8s.data();
                break;
            case DataType_DT_INT32:
                info.ptr = (void*)op->main.AsBlob()->int32s.data();
                ptr = (void*)op->main.AsBlob()->int32s.data();
                break;
            case DataType_DT_UINT8:
                info.ptr = (void*)op->main.AsBlob()->uint8s.data();
                ptr = (void*)op->main.AsBlob()->uint8s.data();
                break;
            case DataType_DT_FLOAT:
                info.ptr = (void*)op->main.AsBlob()->float32s.data();
                ptr = (void*)op->main.AsBlob()->float32s.data();
                break;
            default:
                break;
        }
        auto expr = create(std::move(info));
        //MNN_ASSERT(nullptr != ptr);
        if (OpType_TrainableParam == op->type) {
        auto expr = create(std::move(info), ptr, VARP::CONSTANT);
        if (OpType_TrainableParam == op->type && nullptr != ptr) {
            expr->mType = VARP::TRAINABLE;
        }
        return expr;
@ -213,7 +240,7 @@ bool Expr::requireInfo() {
        return false;
    }
    if (nullptr == mOp) {
        return mInside->mOutputInfos[0].size > 0;
        return !HasUnknownDim(mInside->mOutputInfos[0].dim);
    }
    bool ready = true;
    for (int i = 0; i < mInputs.size(); ++i) {
@ -221,8 +248,8 @@ bool Expr::requireInfo() {
            // The Variable is set nullptr by api
            return false;
        }
        mInside->mInputInfos[i] = mInputs[i]->getInfo();
        auto inputInfo = mInputs[i]->getInfo();
        if (nullptr == mInside->mInputInfos[i] && (!mInside->mReq.supportError[i])) {
        if (nullptr == inputInfo) {
#ifdef MNN_EXPRESS_ERROR_REPORT
            MNN_ERROR("%s, %d input not ready\n", mName.c_str(), i);
#endif
@ -233,15 +260,19 @@ bool Expr::requireInfo() {
    for (int i = 0; i < mInputs.size(); ++i) {
        auto& v = mInputs[i];
        if (mInside->mReq.shapeNeedContent[i]) {
            // `readInternal` maybe return nullptr if element count is 0.
            // For shape need content, the content must not be nullptr
            v->readInternal(true);
            auto ptr = v->readInternal(true);
            if (nullptr == ptr) {
                ready = false;
                break;
            }
        }
    }
    if (!ready) {
        return false;
    }
    //MNN_PRINT("Info %s, %p Start\n", mName.c_str(), this);
    auto res = Executor::getGlobalExecutor()->computeInfo(this);
    auto res = ExecutorScope::Current()->computeInfo(this);
    //MNN_PRINT("Info Compute %s\n", mName.c_str());

    if (NO_ERROR == res) {
@ -261,6 +292,14 @@ const std::vector<WeakEXPRP>& Variable::toExprs() const {

VARP Variable::create(EXPRP expr, int index) {
    VARP res(new Variable(expr, index));
#ifdef MNN_EXPR_SHAPE_EAGER
    auto info = expr->requireInfo();
    if (!info) {
#ifdef MNN_EXPRESS_ERROR_REPORT
        MNN_ERROR("Can't compute shape\n");
#endif
    }
#endif
    return res;
}
void Expr::replace(EXPRP old, EXPRP from) {
@ -307,16 +346,22 @@ void Expr::replace(EXPRP old, EXPRP from) {
    old->mValid = from->mValid;
    old->mInside = from->mInside;
    old->mInputs = from->mInputs;
    std::vector<Expr*> visited;
    old->visitOutputs([&](EXPRP expr, int index) {
        if (expr->mInside->mInfoDirty && expr->mValid && !expr->mInside->mLinkCache) {
        if (expr->visited()) {
            return false;
        }
        visited.emplace_back(expr.get());
        expr->setVisited(true);
        expr->mInside->mCache.reset();
        expr->mInside->mCacheOffset = 0;
        expr->mValid = true;
        expr->mInside->mInfoDirty = true;
        return true;
    });
    for (auto e : visited) {
        e->setVisited(false);
    }
}

void Variable::setName(const std::string& name) {
@ -351,7 +396,7 @@ bool Variable::input(VARP src) {
        info = tempInfo.get();
    }
    auto dstInfo = getInfo();
    bool needChange = nullptr == dstInfo || info->order != dstInfo->order || info->dim.size() != dstInfo->dim.size();
    bool needChange = nullptr == dstInfo || info->order != dstInfo->order || info->dim.size() != dstInfo->dim.size() || info->type != dstInfo->type;
    if (!needChange) {
        for (int i=0; i<info->dim.size(); ++i) {
            if (dstInfo->dim[i] != info->dim[i]) {
@ -362,22 +407,19 @@ bool Variable::input(VARP src) {
    }

    if (!mFrom->mInside->mCache) {
        Executor::getGlobalExecutor()->makeCache({mFrom}, false);
        ExecutorScope::Current()->makeCache({mFrom}, false);
    }
    if (needChange) {
        bool needAlloc = info->size * info->type.bytes() > mFrom->mInside->mOutputInfos[0].size * mFrom->mInside->mOutputInfos[0].type.bytes();
        mFrom->mInside->mOutputInfos[0] = *info;
        if (needAlloc) {
        Utils::releaseMemoryForHostTensor(mFrom->inside()->mOutputTensors[0]);
        mFrom->mExtraBuffer.reset(new char[info->size * info->type.bytes()], std::default_delete<char[]>());
        Utils::copyInfoToTensor(mFrom->inside()->mOutputTensors[0], mFrom->inside()->mOutputInfos.data());
        }
        Utils::allocMemoryForHostTensor(mFrom->inside()->mOutputTensors[0]);
        mFrom->mInside->mOutputInfos[0].ptr = mFrom->mExtraBuffer.get();
        mFrom->mInside->mCache->setShapeDirty(0, mFrom->outputInfo(0));
    }
    if (info->size) {
        auto dstPtr = writeInternal(false);
        auto srcPtr = src->readMap<void>();
        if (nullptr == dstPtr || nullptr == srcPtr) {
            MNN_ERROR("Alloc memory error or compute src error in Variable::Input\n");
            //MNN_ERROR("Alloc memory error or compute src error in Variable::Input\n");
            return false;
        }
        ::memcpy(dstPtr, srcPtr, info->size * info->type.bytes());
@ -387,7 +429,7 @@ bool Variable::input(VARP src) {
    } else {
        informDirty();
    }
    mFrom->mInside->mCache->setContentReady();
    mFrom->mInside->mContentDirty = false;
    return true;
}

@ -396,23 +438,44 @@ void Variable::replace(VARP dst, VARP src) {
        dst->setExpr(nullptr, 0);
        return;
    }
    if (nullptr == dst) {
        dst.mContent = src.mContent;
        return;
    }
    if (src->mFrom.get() == dst->mFrom.get()) {
        dst->mFromIndex = src->mFromIndex;
        return;
    }
    if (src->mFrom->outputSize() != dst->mFrom->outputSize()) {
        // Can't replace Expr, Just replace VARP
        dst->mFrom->visitOutputs([src, dst](EXPRP expr, int index) {
        std::vector<Expr*> visited;
            src->mFrom->mTo.emplace_back(expr);
        dst->mFrom->visitOutputs([src, dst, &visited](EXPRP expr, int index) {
            if (expr->visited()) {
                return false;
            }
        });
            }
        dst->mFrom->visitOutputs([src, dst](EXPRP expr, int index) {
            expr->setVisited(true);
            visited.emplace_back(expr.get());
            expr->mInside->mCache.reset();
            expr->mInside->mCacheOffset = 0;
            expr->mValid = true;
            expr->mInside->mInfoDirty = true;
            expr->mInside->mContentDirty = true;
            return true;
        });
        for (auto v : visited) {
            v->setVisited(false);
        }
        dst->mFrom->visitOutputs([src, dst](EXPRP expr, int index) {
            for (int i =0; i< expr->inputs().size(); ++i) {
                auto input = expr->inputs()[i];
                if (input == dst) {
                    expr->mInputs[i] = src;
                }
            }
            src->mFrom->mTo.emplace_back(expr);
            return false;
        });

        dst->mFrom = src->mFrom;
        dst->mFromIndex = src->mFromIndex;
        return;
@ -452,15 +515,19 @@ bool Variable::resize(INTS dims) {
    }
    info.dim = dims;
    info.syncSize();
    mFrom->mExtraBuffer.reset(new char[info.size * info.type.bytes()], std::default_delete<char[]>());
    Utils::copyInfoToTensor(mFrom->inside()->mOutputTensors[0], mFrom->inside()->mOutputInfos.data());
    info.ptr = mFrom->mExtraBuffer.get();
    Utils::releaseMemoryForHostTensor(mFrom->inside()->mOutputTensors[0]);
    if (0 >= info.size) {
        return false;
    }
    bool res = Utils::allocMemoryForHostTensor(mFrom->inside()->mOutputTensors[0]);
    if (!res) {
        return false;
    }

    mFrom->mValid = true;
    mFrom->mInside->mInputInfos.clear();
    mFrom->inside()->mInfoDirty = false;
    auto cache = mFrom->mInside->mCache;
    mFrom->inside()->mContentDirty = true;
    if (nullptr != cache) {
        cache->setShapeDirty(0, mFrom->outputInfo(0));
    }
    mFrom->visitOutputs([](EXPRP expr, int index) { return expr->setInfoDirty(); });
    return true;
}
@ -478,11 +545,12 @@ void Expr::visit(EXPRP expr, const std::function<bool(EXPRP)>& before, const std
void* Variable::readInternal(bool forShape) {
    if (nullptr == mFrom->get()) {
        if (VARP::INPUT == mFrom->mType) {
            if (nullptr == mFrom->mInside->mCache) {
            if (mFrom->mInside->mContentDirty) {
                return nullptr;
            }
        }
        return mFrom->outputInfo(mFromIndex)->ptr;
        //MNN_ASSERT(nullptr != mFrom->inside()->mOutputTensors[0]->buffer().host);
        return mFrom->inside()->mOutputTensors[0]->buffer().host;
    }
    auto res = mFrom->requireInfo();
    if (false == res) {
@ -490,21 +558,26 @@ void* Variable::readInternal(bool forShape) {
    }
    auto cache = mFrom->inside()->mCache;
    if (nullptr == cache) {
        Executor::getGlobalExecutor()->makeCache({mFrom}, forShape);
        ExecutorScope::Current()->makeCache({mFrom}, forShape);
        cache = mFrom->inside()->mCache;
    }
    if (nullptr == cache) {
        return nullptr;
    }
    if (NO_ERROR != Executor::getGlobalExecutor()->runCache(cache)) {
    if (NO_ERROR != ExecutorScope::Current()->runCache(cache)) {
        return nullptr;
    }
    cache->syncOutput(mFrom->mInside->mCacheOffset + mFromIndex, mFrom->outputInfo(mFromIndex));
    return Executor::mapOutput(cache.get(), mFrom->mInside->mCacheOffset + mFromIndex, mFrom->mInside->mOutputTensors[mFromIndex]);
    return mFrom->outputInfo(mFromIndex)->ptr;
}

void Variable::informDirty() {
    mFrom->visitOutputs([](EXPRP expr, int index) {
    std::vector<Expr*> visited;
    mFrom->visitOutputs([&visited](EXPRP expr, int index) {
        if (expr->visited()) {
            return false;
        }
        visited.emplace_back(expr.get());
        expr->setVisited(true);
        if (expr->inside()->mReq.shapeNeedContent.empty()) {
            // Not init
            return false;
@ -514,28 +587,32 @@ void Variable::informDirty() {
            expr->visitOutputs([](EXPRP e, int index) { return e->setInfoDirty(); });
            return false;
        }
        if (expr->inside()->mContentDirty) {
            return false;
        }
        expr->inside()->mContentDirty = true;
        if (expr->inside()->mReq.contentNeedContent[index]) {
            if (expr->inside()->mCache != nullptr) {
                expr->inside()->mCache->setContentDirty();
                Executor::setContentDirty(expr->inside()->mCache.get());
            }
            return true;
        }
        return false;
    });
    for (auto e : visited) {
        e->setVisited(false);
    }
}
void Variable::prepareCompute(const std::vector<VARP>& vars, bool forceCpu) {
    std::vector<EXPRP> exprs;
    for (auto v : vars) {
        if (v->expr().first->inside()->mCache == nullptr) {
        if (!v->expr().first->visited()) {
            v->expr().first->inside()->mCache = nullptr;
            v->expr().first->requireInfo();
            v->expr().first->setVisited(true);
            exprs.emplace_back(v->expr().first);
        }
    }
    Executor::getGlobalExecutor()->makeCache(std::move(exprs), forceCpu);
    for (auto v : vars) {
        v->expr().first->setVisited(false);
    }
    ExecutorScope::Current()->makeCache(std::move(exprs), forceCpu);
}

void* Variable::writeInternal(bool inform) {
@ -545,16 +622,8 @@ void* Variable::writeInternal(bool inform) {
    if (inform) {
        informDirty();
    }
    auto cache = mFrom->mInside->mCache;
    mFrom->mInside->mContentDirty = false;
    if (nullptr == cache) {
    return mFrom->inside()->mOutputTensors[0]->host<void>();
        Executor::getGlobalExecutor()->makeCache({mFrom});
        cache = mFrom->mInside->mCache;
    }
    if (nullptr == cache) {
        return nullptr;
    }
    mFrom->mInside->mCache->setContentReady();
    return mFrom->mInside->mOutputInfos[0].ptr;
}

void Variable::unMap() {
@ -591,12 +660,17 @@ bool Expr::setInfoDirty() {
    mInside->mContentDirty = true;
    mValid = true;
    if (mInside->mCache != nullptr) {
        mInside->mCache->setShapeDirty(0, nullptr);
        Executor::setShapeDirty(mInside->mCache.get());
    }
    for (auto o : mInside->mOutputTensors) {
        Utils::releaseMemoryForHostTensor(o);
    }
    return true;
}

std::vector<VARP> Variable::load(const char* fileName) {
    AutoStorage<uint8_t> buffer;
    {
        FileLoader loader(fileName);
        if (!loader.valid()) {
            MNN_ERROR("Error for open %s\n", fileName);
@ -606,11 +680,11 @@ std::vector<VARP> Variable::load(const char* fileName) {
        if (!loader.valid()) {
            return {};
        }
        AutoStorage<uint8_t> buffer;
        loader.merge(buffer);
        if (buffer.get() == nullptr) {
            return {};
        }
    }
    return load(buffer.get(), buffer.size());
}
std::vector<VARP> Variable::load(const uint8_t* buffer, size_t length) {
@ -722,6 +796,7 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
        } else {
            MNN_ASSERT(1 == expr->outputSize());
            auto& info = expr->mInside->mOutputInfos[0];
            auto ptr = expr->mInside->mOutputTensors[0]->host<void>();
            op.reset(new OpT);
            if (expr->mType != VARP::INPUT) {
                auto blob = new BlobT;
@ -730,16 +805,20 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
                if (info.type.code == halide_type_float) {
                    blob->dataType = DataType_DT_FLOAT;
                    blob->float32s.resize(info.size);
                    ::memcpy(blob->float32s.data(), info.ptr, info.size * sizeof(float));
                    ::memcpy(blob->float32s.data(), ptr, info.size * sizeof(float));
                } else if (info.type.code == halide_type_int) {
                } else if (info.type.code == halide_type_int && info.type.bits == 32) {
                    blob->dataType = DataType_DT_INT32;
                    blob->int32s.resize(info.size);
                    ::memcpy(blob->int32s.data(), info.ptr, info.size * sizeof(int));
                    ::memcpy(blob->int32s.data(), ptr, info.size * sizeof(int));
                }
                } else if (info.type.code == halide_type_int && info.type.bits == 8) {
                else if (info.type.code == halide_type_uint && info.type.bits == 8) {
                    blob->dataType = DataType_DT_INT8;
                    blob->int8s.resize(info.size);
                    auto pptr = (int8_t *)ptr;
                    ::memcpy(blob->int8s.data(), ptr, info.size * sizeof(int8_t));
                } else if (info.type.code == halide_type_uint && info.type.bits == 8) {
                    blob->dataType = DataType_DT_UINT8;
                    blob->uint8s.resize(info.size);
                    ::memcpy(blob->uint8s.data(), info.ptr, info.size * sizeof(uint8_t));
                    ::memcpy(blob->uint8s.data(), ptr, info.size * sizeof(uint8_t));
                }
                op->type = OpType_Const;
                if (expr->mType == VARP::TRAINABLE) {
@ -781,12 +860,12 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
        auto op = dest->oplists[index].get();
        auto tensorIndexOffset = varIndexInfo[expr];
        for (int v=0; v<expr->outputSize(); ++v) {
            auto const tensorIndex = tensorIndexOffset + v;
            auto subindex = tensorIndexOffset + v;
            if (dest->tensorName[tensorIndex].empty()) {
            if (dest->tensorName[subindex].empty()) {
                if (v == 0) {
                    dest->tensorName[tensorIndex] = op->name;
                    dest->tensorName[subindex] = op->name;
                } else {
                    dest->tensorName[tensorIndex] = op->name + numberToString(v);
                    dest->tensorName[subindex] = op->name + numberToString(v);
                }
            }
        }
@ -0,0 +1,210 @@
//
// Initializer.cpp
// MNN
//
// Created by MNN on 2019/11/28.
// Copyright © 2018, Alibaba Group Holding Limited
//

#include "Initializer.hpp"
#include <MNN/expr/ExprCreator.hpp>
#include <cmath>
#include <vector>
#include "Distributions.hpp"
#include "RandomGenerator.hpp"

namespace MNN {
namespace Express {

Express::VARP Initializer::createConstVar(Express::INTS dim, Express::Dimensionformat format) {
    auto res = Express::_Input(dim, format, halide_type_of<float>());
    this->onExecute(res);
    res.fix(Express::VARP::CONSTANT);
    return res;
}

class ConstantInitializer : public Initializer {
public:
    ConstantInitializer(float value) : mConstant(value) {
    }

    virtual void onExecute(Express::VARP p) override {
        const int count = p->getInfo()->size;
        MNN_ASSERT(count > 0);
        auto ptr = p->writeMap<float>();
        for (int i = 0; i < count; i++) {
            ptr[i] = mConstant;
        }
    }

private:
    float mConstant;
};
Initializer* Initializer::constValue(float value) {
    return new ConstantInitializer(value);
}

class UniformInitializer : public Initializer {
public:
    UniformInitializer(float min = 0, float max = 1) {
        mMin = min;
        mMax = max;
    }

    virtual void onExecute(Express::VARP p) override {
        const int count = p->getInfo()->size;
        MNN_ASSERT(count > 0);
        Distributions::uniform(count, mMin, mMax, p->writeMap<float>(), RandomGenerator::generator());
    }

private:
    float mMin;
    float mMax;
};

Initializer* Initializer::uniform(float minValue, float maxValue) {
    return new UniformInitializer(minValue, maxValue);
}

class XavierInitializer : public Initializer {
public:
    XavierInitializer(VarianceNorm norm = FANIN) {
        mNorm = norm;
    }

    virtual void onExecute(Express::VARP p) override {
        const int count = p->getInfo()->size;
        MNN_ASSERT(count > 0);
        const std::vector<int> dims = p->getInfo()->dim;
        // referenced from Caffe
        // https://github.com/BVLC/caffe/blob/master/include/caffe/filler.hpp
        int fanIn = count / dims[0];
        int fanOut = dims.size() > 1 ? count / dims[1] : count;
        float n = fanIn; // default: FANIN
        if (mNorm == VarianceNorm::AVERAGE) {
            n = (fanIn + fanOut) / 2.0f;
        } else if (mNorm == VarianceNorm::FANOUT) {
            n = fanOut;
        }
        float scale = sqrtf(3.0f / n);

        Distributions::uniform(count, -scale, scale, p->writeMap<float>(), RandomGenerator::generator());
    }

private:
    VarianceNorm mNorm;
};
Initializer* Initializer::xavier(VarianceNorm norm) {
    return new XavierInitializer(norm);
}

class GaussianInitializer : public Initializer {
public:
    GaussianInitializer(float mean = 0, float std = 1) {
        mMean = mean;
        mStd = std;
    }

    virtual void onExecute(Express::VARP p) override {
        const int count = p->getInfo()->size;
        MNN_ASSERT(count > 0);
        Distributions::gaussian(count, mMean, mStd, p->writeMap<float>(), RandomGenerator::generator());
    }

private:
    float mMean;
    float mStd;
};
Initializer* Initializer::gauss(float mean, float std) {
    return new GaussianInitializer(mean, std);
}

class MSRAInitializer : public Initializer {
public:
    MSRAInitializer(VarianceNorm norm = FANIN) {
        mNorm = norm;
    }

    virtual void onExecute(Express::VARP p) override {
        const int count = p->getInfo()->size;
        MNN_ASSERT(count > 0);
        const std::vector<int> dims = p->getInfo()->dim;
        // referenced from Caffe
        // https://github.com/BVLC/caffe/blob/master/include/caffe/filler.hpp
        int fanIn = count / dims[0];
        int fanOut = dims.size() > 1 ? count / dims[1] : count;
        float n = fanIn; // default: FANIN
        if (mNorm == VarianceNorm::AVERAGE) {
            n = (fanIn + fanOut) / 2.0f;
        } else if (mNorm == VarianceNorm::FANOUT) {
            n = fanOut;
        }
        float std = sqrtf(2.0f / n);

        Distributions::gaussian(count, 0.0f, std, p->writeMap<float>(), RandomGenerator::generator());
    }

private:
    VarianceNorm mNorm;
};
Initializer* Initializer::MSRA(VarianceNorm norm) {
    return new MSRAInitializer(norm);
}

class BilinearInitializer : public Initializer {
public:
    BilinearInitializer() = default;

    virtual void onExecute(Express::VARP p) override {
        const int count = p->getInfo()->size;
        MNN_ASSERT(count > 0);
        const std::vector<int> dims = p->getInfo()->dim;
        MNN_ASSERT(dims.size() == 4);
        MNN_ASSERT(dims[2] == dims[3]); // NCHW, H == W
        // referenced from Caffe
        // https://github.com/BVLC/caffe/blob/master/include/caffe/filler.hpp
        int f = ceilf(dims[3] / 2.0f);
        float c = (dims[3] - 1) / (2.0f * f);
        auto ptr = p->writeMap<float>();

        for (int i = 0; i < count; i++) {
            float x = i % dims[3];
            float y = (i / dims[3]) % dims[2];
            ptr[i] = (1 - std::fabs(x / f - c)) * (1 - std::fabs(y / f - c));
        }
    }
};
Initializer* Initializer::bilinear() {
    return new BilinearInitializer();
}

class PositiveUnitball : public Initializer {
public:
    PositiveUnitball() = default;

    virtual void onExecute(Express::VARP p) override {
        const int count = p->getInfo()->size;
        MNN_ASSERT(count > 0);
        const std::vector<int> dims = p->getInfo()->dim;
        auto ptr = p->writeMap<float>();

        Distributions::uniform(count, 0, 1, ptr, RandomGenerator::generator());

        int dim = count / dims[0];
        for (int i = 0; i < dims[0]; i++) {
            float sum = 0;
            for (int j = 0; j < dim; j++) {
                sum += ptr[i * dim + j];
            }
            for (int j = 0; j < dim; j++) {
                ptr[i * dim + j] = ptr[i * dim + j] / sum;
            }
        }
    }
};
Initializer* Initializer::positiveUnitball() {
    return new PositiveUnitball();
}

} // namespace Express
} // namespace MNN
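A quick numeric check of the fan computation implemented above, using an assumed 64x32x3x3 NCHW convolution weight (the shape is illustrative, not from the commit):

#include <cmath>
#include <cstdio>

int main() {
    const int dims[4] = {64, 32, 3, 3};
    const int count   = 64 * 32 * 3 * 3;     // 18432 elements
    const int fanIn   = count / dims[0];     // 288  = 32 * 3 * 3
    const int fanOut  = count / dims[1];     // 576  = 64 * 3 * 3
    // xavier(FANIN): uniform in [-scale, scale]; MSRA(FANIN): gaussian(0, std)
    std::printf("xavier FANIN scale = %f\n", std::sqrt(3.0f / fanIn));
    std::printf("MSRA   FANIN std   = %f\n", std::sqrt(2.0f / fanIn));
    (void)fanOut;
    return 0;
}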
@ -0,0 +1,43 @@
//
// Initializer.hpp
// MNN
//
// Created by MNN on 2019/11/28.
// Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef Initializer_hpp
#define Initializer_hpp

#include <MNN/expr/Expr.hpp>

namespace MNN {
namespace Express {
class RandomGenerator;
class MNN_PUBLIC Initializer {
public:
    Initializer() = default;
    virtual ~Initializer() = default;
    Express::VARP createConstVar(Express::INTS dim, Express::Dimensionformat format = Express::NCHW);
    virtual void onExecute(Express::VARP p) = 0;

    static Initializer* constValue(float value);
    static Initializer* uniform(float minValue = 0.0f, float maxValue = 1.0f);

    enum VarianceNorm {
        FANIN,
        FANOUT,
        AVERAGE,
    };

    static Initializer* xavier(VarianceNorm norm = FANIN);
    static Initializer* gauss(float mean = 0.0f, float std = 1.0f);
    static Initializer* MSRA(VarianceNorm norm = FANIN);
    static Initializer* bilinear();
    static Initializer* positiveUnitball();
};

} // namespace Express
} // namespace MNN

#endif // Initializer_hpp
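For orientation, a minimal sketch of the factory API declared above: the factories return a raw Initializer*, and createConstVar builds, fills, and fixes a variable as CONSTANT. The unique_ptr wrapper and the 64x32x3x3 shape are this sketch's own assumptions, not part of the commit.

#include <memory>
#include "Initializer.hpp"

using namespace MNN::Express;

VARP makeXavierWeight() {
    // Wrap the raw pointer so the sketch cannot leak it.
    std::unique_ptr<Initializer> init(Initializer::xavier(Initializer::FANIN));
    // createConstVar() calls onExecute() and then fixes the variable as CONSTANT.
    return init->createConstVar({64, 32, 3, 3}, NCHW);
}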
@ -30,7 +30,18 @@ static DataType _convertDataType(halide_type_t type) {
    }
    return DataType_DT_INVALID;
}
static VARP _checkNC4HW4(VARP x) {
#ifdef MNN_EXPR_SHAPE_EAGER
    auto info = x->getInfo();
    if (nullptr != info && info->order == NC4HW4) {
        return _Convert(x, NCHW);
    }
#endif
    return x;
}
static VARP _Binary(VARP x, VARP y, BinaryOpOperation operation) {
    x = _checkNC4HW4(x);
    y = _checkNC4HW4(y);
    std::unique_ptr<OpT> op(new OpT);
    op->main.type = OpParameter_BinaryOp;
    op->type = OpType_BinaryOp;
@ -49,6 +60,7 @@ static VARP _Unary(VARP x, UnaryOpOperation operation) {
    return (Variable::create(Expr::create(op.get(), {x})));
}
static VARP _Reduce(VARP x, INTS dim, ReductionType type, bool keepDim) {
    x = _checkNC4HW4(x);
    std::unique_ptr<OpT> op(new OpT);
    op->main.type = OpParameter_ReductionParam;
    op->type = OpType_Reduction;
@ -60,6 +72,7 @@ static VARP _Reduce(VARP x, INTS dim, ReductionType type, bool keepDim) {
    return (Variable::create(Expr::create(op.get(), {x})));
}
static VARP _ReduceMutable(VARP x, VARP dim, ReductionType type, bool keepDim) {
    x = _checkNC4HW4(x);
    std::unique_ptr<OpT> op(new OpT);
    op->main.type = OpParameter_ReductionParam;
    op->type = OpType_Reduction;
@ -955,6 +968,7 @@ Returns:
A variable of type int.
*/
VARP _ArgMax(VARP input, int axis) {
    input = _checkNC4HW4(input);
    std::unique_ptr<OpT> op(new OpT);
    op->main.type = OpParameter_ArgMax;
    op->type = OpType_ArgMax;
@ -976,6 +990,7 @@ Returns:
A variable of type int.
*/
VARP _ArgMin(VARP input, int axis) {
    input = _checkNC4HW4(input);
    std::unique_ptr<OpT> op(new OpT);
    op->main.type = OpParameter_ArgMax;
    op->type = OpType_ArgMin;
@ -5,6 +5,7 @@
// Created by MNN on 2019/08/20.
// Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef MergeOptimizer_hpp
#define MergeOptimizer_hpp

@ -54,16 +54,14 @@ VARP _Input(INTS shape, Dimensionformat data_format, halide_type_t dtype) {
    info.dim = std::move(shape);
    info.order = data_format;
    info.type = dtype;
    info.ptr = nullptr;
    return (Variable::create(Expr::create(std::move(info), nullptr, VARP::INPUT)));
    return (Variable::create(Expr::create(std::move(info))));
}
VARP _Scalar(const void* ptr, halide_type_t type) {
    Variable::Info info;
    info.dim = {};
    info.order = NHWC;
    info.type = type;
    info.ptr = (void*)ptr;
    return (Variable::create(Expr::create(std::move(info), ptr, VARP::CONSTANT)));
    return (Variable::create(Expr::create(std::move(info))));
}
/*create a constant variable.
Args:
@ -79,8 +77,7 @@ VARP _Const(const void* ptr, INTS shape, Dimensionformat format, halide_type_t t
    info.dim = std::move(shape);
    info.order = format;
    info.type = type;
    info.ptr = (void*)ptr;
    return (Variable::create(Expr::create(std::move(info), ptr, VARP::CONSTANT)));
    return (Variable::create(Expr::create(std::move(info))));
}

VARP _Const(float value, INTS shape, Dimensionformat format) {
@ -93,8 +90,8 @@ VARP _Const(float value, INTS shape, Dimensionformat format) {
    for (int i = 0; i < info.size; ++i) {
        values[i] = value;
    }
    info.ptr = (void*)values.data();
    auto ptr = (void*)values.data();
    return (Variable::create(Expr::create(std::move(info))));
    return (Variable::create(Expr::create(std::move(info), ptr, VARP::CONSTANT)));
}

VARP _TrainableParam(const void* ptr, INTS dims, Dimensionformat format, halide_type_t type) {
@ -107,6 +104,23 @@ VARP _TrainableParam(float value, INTS dims, Dimensionformat format) {
    v.fix(VARP::TRAINABLE);
    return v;
}
VARP _InnerProduct(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS outputShape) {
    std::unique_ptr<OpT> ipOp(new OpT);
    ipOp->type = OpType_InnerProduct;
    ipOp->main.type = OpParameter_InnerProduct;
    ipOp->main.value = new InnerProductT;
    auto ipParam = ipOp->main.AsInnerProduct();

    ipParam->outputCount = outputShape[1];
    if(!bias.empty()) {
        ipParam->biasTerm = 1;
    }
    ipParam->weightSize = weight.size();

    ipParam->weight = std::move(weight);
    ipParam->bias = std::move(bias);
    return (Variable::create(Expr::create(ipOp.get(), {x})));
}

VARP _Conv(VARP weight, VARP bias, VARP x, PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads) {
    std::unique_ptr<OpT> convOp(new OpT);
@ -183,7 +197,7 @@ VARP _Conv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS
    return (Variable::create(Expr::create(convOp.get(), {x})));
}
VARP _Conv(std::vector<int8_t>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
           PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, bool relu6) {
           PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, bool relu6, int nbits) {
    std::unique_ptr<OpT> convOp(new OpT);
    convOp->type = OpType_Convolution;
    if (channel[0] == channel[1] && channel[0] == group) {
@ -285,6 +299,42 @@ VARP _Deconv(VARP weight, VARP bias, VARP x, PaddingMode pad, INTS stride, INTS
    return (Variable::create(Expr::create(std::move(convOp), {x, weight})));
}

VARP _Deconv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
             PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, bool relu6) {
    std::unique_ptr<OpT> convOp(new OpT);
    convOp->type = OpType_Deconvolution;
    if (channel[0] == channel[1] && channel[0] == group) {
        convOp->type = OpType_DeconvolutionDepthwise;
    }
    convOp->main.type = OpParameter_Convolution2D;
    convOp->main.value = new Convolution2DT;
    auto conv2D = convOp->main.AsConvolution2D();
    conv2D->common.reset(new Convolution2DCommonT);
    conv2D->common->padMode = _convertPadMode(pad);
    if (pads.size() == 2) {
        conv2D->common->padX = pads[0];
        conv2D->common->padY = pads[1];
    } else {
        conv2D->common->pads = std::move(pads);
    }
    conv2D->common->strideX = stride[0];
    conv2D->common->strideY = stride[1];
    conv2D->common->group = group;
    conv2D->common->outputCount = channel[1];
    conv2D->common->inputCount = channel[0];
    conv2D->common->dilateX = dilate[0];
    conv2D->common->dilateY = dilate[1];
    conv2D->common->kernelX = kernelSize[0];
    conv2D->common->kernelY = kernelSize[1];
    conv2D->common->relu6 = relu6;
    conv2D->common->relu = relu;
    MNN_ASSERT(weight.size() == channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]);
    conv2D->weight = std::move(weight);
    MNN_ASSERT(bias.size() == channel[1]);
    conv2D->bias = std::move(bias);
    return (Variable::create(Expr::create(convOp.get(), {x})));
}

static VARP _Pool(VARP x, INTS kernel, INTS stride, PoolType type, PaddingMode pad, INTS pads) {
    std::unique_ptr<OpT> pool(new OpT);
    pool->type = OpType_Pooling;
@ -381,9 +431,13 @@ x: A variable.
Returns:
output: A variable with the same type as `x`.
*/
VARP _Relu6(VARP x) {
VARP _Relu6(VARP x, float minValue, float maxValue) {
    std::unique_ptr<OpT> relu(new OpT);
    relu->type = OpType_ReLU6;
    relu->main.value = new Relu6T;
    relu->main.type = OpParameter_Relu6;
    relu->main.AsRelu6()->maxValue = maxValue;
    relu->main.AsRelu6()->minValue = minValue;
    return (Variable::create(Expr::create(relu.get(), {x})));
}
/*Given an input value x, it computes the output as x if x > 0 and slopes * x if x <= 0.
@ -746,9 +800,12 @@ input: A variable.
Returns:
A variable of Halide_Type_Int.
*/
VARP _Shape(VARP input) {
VARP _Shape(VARP input, bool nchw) {
    std::unique_ptr<OpT> shape(new OpT);
    shape->type = OpType_Shape;
    if (nchw) {
        shape->defaultDimentionFormat = MNN_DATA_FORMAT_NCHW;
    }
    return (Variable::create(Expr::create(std::move(shape), {input})));
}
/*Stacks a list of rank-R variables into one rank-(R+1) variable.
@ -906,6 +963,21 @@ VARP _Elu(VARP features, float alpha) {
    op->main.value = eluParam;
    return (Variable::create(Expr::create(std::move(op), {features})));
}
/*Given an input value x, it computes the output as 1.0 if x > threshold and 0.0 if x <= threshold.
features: A variable of type Halide_Type_Float
threshold: threshold value
Returns:
A variable. Has the same type as features.
*/
VARP _Threshold(VARP features, float threshold) {
    std::unique_ptr<OpT> op(new OpT);
    op->type = OpType_Threshold;
    auto eluParam = new ELUT;
    op->main.type = OpParameter_ELU;
    eluParam->alpha = threshold;
    op->main.value = eluParam;
    return (Variable::create(Expr::create(std::move(op), {features})));
}
/*Computes the size of the variable
Args:
input: A variable of type Halide_Type_Float or Halide_Type_Int
@ -1049,7 +1121,6 @@ std::vector<VARP> _Moments(VARP x, INTS axis, VARP shift, bool keepDims) {
    op->main.type = OpParameter_MomentsParam;
    momentsParam->dim = axis;
    momentsParam->keepDims = keepDims;
    momentsParam->dType = (MNN::DataType)Utils::convertDataType(x->getInfo()->type);
    op->main.value = momentsParam;
    EXPRP expr = Expr::create(std::move(op), {x}, 2);
    std::vector<VARP> res;
@ -1405,7 +1476,7 @@ VARP _ZeroGrad(VARP x) {
}

VARP _Conv(std::vector<int8_t>&& weight, std::vector<int>&& bias, std::vector<float>&& scale, VARP x, INTS channel, INTS kernelSize,
           PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu) {
           PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, int nbits) {
    std::unique_ptr<OpT> convOp(new OpT);
    convOp->type = OpType_ConvInt8;
    if (channel[0] == channel[1] && channel[0] == group) {
@ -1433,9 +1504,16 @@ VARP _Conv(std::vector<int8_t>&& weight, std::vector<int>&& bias, std::vector<fl
    conv2D->symmetricQuan->bias = std::move(bias);
    conv2D->symmetricQuan->scale = std::move(scale);
    conv2D->symmetricQuan->weight = std::move(weight);
    conv2D->symmetricQuan->nbits = nbits;
    return (Variable::create(Expr::create(convOp.get(), {x})));
}

VARP _CosineSimilarity(VARP input0, VARP input1, VARP inputDim) {
    std::unique_ptr<MNN::OpT> cosineSimilarityOp(new MNN::OpT);
    cosineSimilarityOp->type = MNN::OpType_CosineSimilarity;
    return (Variable::create(Expr::create(std::move(cosineSimilarityOp), {input0, input1, inputDim})));
}

VARP _FloatToInt8(VARP x, VARP scale, char minValue/*For future*/, char maxValue/*For future*/) {
    auto xInfo = x->getInfo();
    auto scaleInfo = scale->getInfo();
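The signatures changed or added above can be exercised as in the sketch below; the 1x4 input and the explicit clamp/threshold values are arbitrary choices, and any default arguments are not visible in this diff, so everything is passed explicitly.

#include <MNN/expr/ExprCreator.hpp>

using namespace MNN::Express;

void opSketch() {
    auto x       = _Input({1, 4}, NCHW, halide_type_of<float>());
    auto clamped = _Relu6(x, 0.0f, 6.0f);   // new min/max overload
    auto gate    = _Threshold(x, 0.5f);     // 1.0 if x > 0.5, else 0.0
    auto shape   = _Shape(x, true);         // report the shape in NCHW order
    (void)clamped; (void)gate; (void)shape;
}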
@ -22,28 +22,7 @@ Optimizer::Parameters::~Parameters() {
    }
}
std::shared_ptr<Optimizer> Optimizer::create(Config config) {
    const int numThread = config.numThread;
    // Do nothing
    auto forwardType = config.forwardType;
    if (forwardType != MNN_FORWARD_ALL) {
        if (MNNGetExtraBackendCreator(forwardType) == nullptr) {
            return nullptr;
        }
        return std::shared_ptr<Optimizer>(new MergeOptimizer(config.forwardType, numThread, nullptr));
    }

    auto device = config.device;
    if (CPU == device) {
        return std::shared_ptr<Optimizer>(new MergeOptimizer(MNN_FORWARD_CPU, numThread, nullptr));
    }
    if (GPU == device) {
        std::vector<MNNForwardType> types {MNN_FORWARD_METAL, MNN_FORWARD_OPENCL, MNN_FORWARD_VULKAN, MNN_FORWARD_OPENGL};
        for (auto type : types) {
            auto creator = MNNGetExtraBackendCreator(type);
            if (nullptr != creator) {
                return std::shared_ptr<Optimizer>(new MergeOptimizer(type, numThread, nullptr));
            }
        }
    }
    return nullptr;
}

@ -0,0 +1,45 @@
//
// RandomGenerator.hpp
// MNN
//
// Created by MNN on 2019/11/28.
// Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef RandomGenerator_hpp
#define RandomGenerator_hpp

#include <MNN/MNNDefine.h>
#include <random>

namespace MNN {
namespace Express {

class MNN_PUBLIC RandomGenerator {
private:
    RandomGenerator(int seed = std::random_device()()) {
        mSeed = seed;
        mGenerator.seed(mSeed);
    }

    ~RandomGenerator() = default;

    RandomGenerator(RandomGenerator &);

    RandomGenerator &operator=(const RandomGenerator &);

private:
    int mSeed;
    std::mt19937 mGenerator;

public:
    static std::mt19937 &generator(int seed = std::random_device()()) {
        static RandomGenerator rng(seed);
        return rng.mGenerator;
    }
};

} // namespace Express
} // namespace MNN

#endif // RandomGenerator_hpp
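Because the static local in generator() above only consumes the seed on its first construction, reproducible initialization requires calling the accessor with the desired seed before any initializer runs. A small sketch; the seed value and shapes are arbitrary.

#include <memory>
#include "RandomGenerator.hpp"
#include "Initializer.hpp"

void reproducibleInit() {
    // First call fixes the seed; seeds passed on later calls are ignored.
    MNN::Express::RandomGenerator::generator(1234);
    std::unique_ptr<MNN::Express::Initializer> g(MNN::Express::Initializer::gauss(0.0f, 0.02f));
    auto w = g->createConstVar({128, 64}, MNN::Express::NCHW);
    (void)w;
}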
@@ -10,8 +10,24 @@
 #include <map>
 #include "MNN_generated.h"
 #include "core/TensorUtils.hpp"
+#include "core/MNNMemoryUtils.h"
 namespace MNN {
 namespace Express {
+Expr::Inside::Inside(int outputSize) {
+    mOutputInfos.resize(outputSize);
+    mOutputTensors.resize(outputSize);
+    for (int i=0; i<outputSize; ++i) {
+        mOutputTensors[i] = new Tensor;
+        TensorUtils::getDescribe(mOutputTensors[i])->memoryType = Tensor::InsideDescribe::MEMORY_HOST;
+    }
+}
+Expr::Inside::~Inside() {
+    for (auto t : mOutputTensors) {
+        delete t;
+    }
+}
+
 #define CONVERT(src, dst, f)\
 if (f == src) return dst;

@@ -61,7 +77,6 @@ void Utils::copyInfoToTensor(Tensor* dest, const Variable::Info* source) {
     }
     dest->buffer().dimensions = (int)source->dim.size();
     dest->buffer().type = source->type;
-    dest->buffer().host = (uint8_t*)source->ptr;
     TensorUtils::getDescribe(dest)->dimensionFormat = (MNN_DATA_FORMAT)Utils::convertFormat(source->order);
     TensorUtils::setLinearLayout(dest);
 }

@@ -70,7 +85,31 @@ void Utils::copyTensorToInfo(Variable::Info* shape, const Tensor* tensor) {
     shape->dim = tensor->shape();
     shape->size = tensor->elementSize();
     shape->order = Utils::revertFormat(TensorUtils::getDescribe(tensor)->dimensionFormat);
-    shape->ptr = tensor->host<float>();
 }
+bool Utils::allocMemoryForHostTensor(Tensor* dest) {
+    if (nullptr != dest->buffer().host) {
+        return true;
+    }
+    if (TensorUtils::getDescribe(dest)->memoryType != Tensor::InsideDescribe::MEMORY_HOST) {
+        return false;
+    }
+    auto size = dest->size();
+    if (0 >= size) {
+        return false;
+    }
+    dest->buffer().host = (uint8_t*)MNNMemoryAllocAlign(size, MNN_MEMORY_ALIGN_DEFAULT);
+    return dest->buffer().host != nullptr;
+}
+bool Utils::releaseMemoryForHostTensor(Tensor* dest) {
+    if (nullptr == dest->buffer().host) {
+        return true;
+    }
+    if (TensorUtils::getDescribe(dest)->memoryType != Tensor::InsideDescribe::MEMORY_HOST) {
+        return false;
+    }
+    MNNMemoryFreeAlign(dest->buffer().host);
+    dest->buffer().host = nullptr;
+    return true;
+}

 } // namespace Express
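A hedged sketch of how the two new helpers pair up for a host-backed tensor; the tensor setup is illustrative and `info` is a hypothetical Variable::Info*:

    MNN::Tensor* t = new MNN::Tensor;
    MNN::TensorUtils::getDescribe(t)->memoryType = MNN::Tensor::InsideDescribe::MEMORY_HOST;
    MNN::Express::Utils::copyInfoToTensor(t, info);          // note: no longer copies source->ptr
    if (MNN::Express::Utils::allocMemoryForHostTensor(t)) {  // aligned host allocation
        // ... write through t->host<float>() ...
        MNN::Express::Utils::releaseMemoryForHostTensor(t);  // frees only the aligned buffer
    }
    delete t;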
@@ -15,15 +15,16 @@
 namespace MNN {
 namespace Express {
 struct Expr::Inside {
-    std::vector<const Variable::Info*> mInputInfos;
+    Inside(int outputSize);
+    ~ Inside();
     std::vector<Variable::Info> mOutputInfos;
+    std::vector<Tensor*> mOutputTensors;
     Executor::Requirement mReq;
-    std::shared_ptr<Executor::ComputeCache::Unit> mUnit;
+    std::shared_ptr<Executor::Unit> mUnit;
     std::shared_ptr<Executor::ComputeCache> mCache;
     int mCacheOffset = 0;
     bool mInfoDirty = true;
     bool mContentDirty = true;
-    bool mLinkCache = false;
 };
 class Utils {
 public:

@@ -33,6 +34,8 @@ public:
     static int convertFormat(Dimensionformat format);
     static Express::Dimensionformat revertFormat(int format);
     static halide_type_t revertDataType(DataType dataType);
+    static bool allocMemoryForHostTensor(Tensor* dest);
+    static bool releaseMemoryForHostTensor(Tensor* dest);
 };
 } // namespace Express
 } // namespace MNN
@@ -10,7 +10,7 @@
 #include <MNN/expr/ExprCreator.hpp>
 using namespace MNN::Express;
 namespace MNN {
-namespace Train {
+namespace Express {
 FixModule::FixModule(std::vector<Express::VARP> output, std::vector<Express::VARP> parameters,
                      std::vector<std::pair<Express::VARP, Express::Dimensionformat>> inputs) {
     for (auto p : parameters) {

@@ -34,5 +34,19 @@ std::vector<Express::VARP> FixModule::onForward(const std::vector<Express::VARP>
     }
     return mOutput;
 }
-} // namespace Train
+
+Module* FixModule::clone(CloneContext* ctx) const {
+    FixModule* module(new FixModule);
+    for (auto& it : mInputs) {
+        VARP v = ctx->getOrClone(it.first);
+        module->mInputs.push_back(std::make_pair(v, it.second));
+    }
+    for (auto& it : mOutput) {
+        VARP v = ctx->getOrClone(it);
+        module->mOutput.push_back(v);
+    }
+    return this->cloneBaseTo(ctx, module);
+}
+
+} // namespace Express
 } // namespace MNN
@@ -8,9 +8,9 @@

 #ifndef FixModule_hpp
 #define FixModule_hpp
-#include "Module.hpp"
+#include <MNN/expr/Module.hpp>
 namespace MNN {
-namespace Train {
+namespace Express {

 class FixModule : public Module {
 public:

@@ -20,10 +20,14 @@ public:
     virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
     virtual void onClearCache() override;
 private:
+    FixModule() = default;
+
+    Module* clone(CloneContext* ctx) const override;
+
     std::vector<std::pair<Express::VARP, Express::Dimensionformat>> mInputs;
     std::vector<Express::VARP> mOutput;
 };
-} // namespace Train
+} // namespace Express
 } // namespace MNN

 #endif
@@ -0,0 +1,112 @@
//
//  IfModule.cpp
//  MNN
//
//  Created by MNN on 2020/09/01.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "IfModule.hpp"
#include "MNN_generated.h"
namespace MNN {
namespace Express {
static int _findPos(const std::vector<std::string>& names, const std::string& key) {
    for (int i=0; i<names.size(); ++i) {
        if (names[i] == key) {
            return i;
        }
    }
    return -1;
}
std::vector<Express::VARP> IfModule::onForward(const std::vector<Express::VARP>& inputs) {
    std::vector<Express::VARP> outputs(mOutputFromElse.size());
    MNN_ASSERT(mOutputFromThen.size() == mOutputFromElse.size());
    if (inputs[0]->readMap<int>()[0] > 0) {
        std::vector<Express::VARP> subInputs(mInputForThen.size());
        for (auto& p : mInputForThen) {
            subInputs[p.first] = inputs[p.second];
        }
        auto subOutputs = mThen->onForward(subInputs);
        for (int i=0; i<mOutputFromThen.size(); ++i) {
            outputs[i] = subOutputs[mOutputFromThen[i]];
        }
    } else {
        std::vector<Express::VARP> subInputs(mInputForElse.size());
        for (auto& p : mInputForElse) {
            subInputs[p.first] = inputs[p.second];
        }
        auto subOutputs = mElse->onForward(subInputs);
        for (int i=0; i<mOutputFromElse.size(); ++i) {
            outputs[i] = subOutputs[mOutputFromElse[i]];
        }
    }
    return outputs;
}
IfModule* IfModule::create(const Op* op, const std::map<std::string, SubGraph>& subGraph) {
    auto module = new IfModule;
    auto ifParam = op->main_as_IfParam();
    auto& thenG = subGraph.find(ifParam->then_graph()->str())->second;
    auto& elseG = subGraph.find(ifParam->else_graph()->str())->second;
    module->mElse = elseG.m;
    module->mThen = thenG.m;
    if (nullptr != op->name()) {
        module->setName(op->name()->str());
    }
    /** Compute map index
     std::vector<std::pair<int, int>> mInputForThen;

     // First mElse' index, Second: inputs's index
     std::vector<std::pair<int, int>> mInputForElse;

     std::vector<int> mOutputFromThen;
     std::vector<int> mOutputFromElse;
     */
    // Map Inputs
    for (int i=0; i<ifParam->aliases_inputs()->size(); ++i) {
        auto index = i;
        auto data = ifParam->aliases_inputs()->GetAs<StringVec>(i);
        if (nullptr == data->data()) {
            continue;
        }
        for (int s=0; s<data->data()->size(); ++s) {
            auto name = data->data()->GetAsString(s)->str();
            auto thenPos = _findPos(thenG.inputs, name);
            if (thenPos >= 0) {
                module->mInputForThen.emplace_back(std::make_pair(thenPos, i));
            }
            auto elsePos = _findPos(elseG.inputs, name);
            if (elsePos >= 0) {
                module->mInputForElse.emplace_back(std::make_pair(elsePos, i));
            }
        }
    }
    // Map outputs
    auto output = ifParam->aliases_outputs();
    module->mOutputFromThen.resize(output->size());
    module->mOutputFromElse.resize(output->size());
    for (int i=0; i<output->size(); ++i) {
        auto data = output->GetAs<StringVec>(i);
        MNN_ASSERT(data->data()->size() == 2);

        auto thenPos = _findPos(thenG.outputs, data->data()->GetAsString(0)->str());
        MNN_ASSERT(thenPos >= 0);
        auto elsePos = _findPos(elseG.outputs, data->data()->GetAsString(1)->str());
        module->mOutputFromThen[i] = thenPos;
        module->mOutputFromElse[i] = elsePos;
    }
    return module;
}

Module* IfModule::clone(CloneContext* ctx) const {
    IfModule* module(new IfModule);
    module->mInputForThen = mInputForThen;
    module->mInputForElse = mInputForElse;
    module->mOutputFromThen = mOutputFromThen;
    module->mOutputFromElse = mOutputFromElse;
    module->mThen.reset(mThen->clone(ctx));
    module->mElse.reset(mElse->clone(ctx));
    return this->cloneBaseTo(ctx, module);
}

} // namespace Express
} // namespace MNN
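For orientation, a rough call pattern, assuming `net` is a hypothetical std::shared_ptr<Module> already loaded from a model whose graph contains an If op; the tensor layout and names are assumptions. The condition is read from inputs[0] as an int, and a positive value selects the then-branch:

    auto cond = MNN::Express::_Scalar<int>(1);   // > 0 -> then-branch, otherwise else-branch
    auto x    = MNN::Express::_Input({1, 4}, MNN::Express::NCHW);
    auto outs = net->onForward({cond, x});       // inputs are remapped via the alias index tables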
@@ -0,0 +1,43 @@
//
//  IfModule.hpp
//  MNN
//
//  Created by MNN on 2020/09/01.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifndef IfModule_hpp
#define IfModule_hpp

#include <MNN/expr/Module.hpp>
namespace MNN {
namespace Express {
class IfModule : public Module {
public:
    virtual ~ IfModule() {
        // Do nothing
    }
    virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
    static IfModule* create(const Op* op, const std::map<std::string, SubGraph>& subGraph);

private:
    IfModule(){}

    Module* clone(CloneContext* ctx) const override;

    // First mThen' index, Second: inputs's index
    std::vector<std::pair<int, int>> mInputForThen;

    // First mElse' index, Second: inputs's index
    std::vector<std::pair<int, int>> mInputForElse;

    std::vector<int> mOutputFromThen;
    std::vector<int> mOutputFromElse;

    std::shared_ptr<Module> mThen;
    std::shared_ptr<Module> mElse;
};
}
}

#endif /* IfModule_hpp */
@@ -0,0 +1,182 @@
//
//  Module.cpp
//  MNN
//
//  Created by MNN on 2019/11/25.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include <MNN/expr/Module.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include "FixModule.hpp"
#include "PipelineModule.hpp"
#include "core/FileLoader.hpp"

namespace MNN {
namespace Express {

class EmptyModule : public Module {
public:
    EmptyModule(const std::vector<Express::VARP>& parameters) {
        for (auto p : parameters) {
            addParameter(p);
        }
    }
    virtual ~EmptyModule() {
        // Do nothing
    }
    virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override {
        return {};
    }

protected:
    EmptyModule() = default;

    Module* clone(Module::CloneContext* ctx) const override {
        EmptyModule* module(new EmptyModule);
        return this->cloneBaseTo(ctx, module);
    }
};

Module* Module::createEmpty(const std::vector<Express::VARP>& parameters) {
    return new EmptyModule(parameters);
}

Express::VARP Module::forward(Express::VARP input) {
    return this->onForward({input})[0];
}
std::vector<Express::VARP> Module::parameters() const {
    std::vector<Express::VARP> result;
    _collectParameters(result);
    return result;
}
bool Module::loadParameters(const std::vector<Express::VARP>& parameters) {
    std::vector<Express::VARP> result;
    _collectParameters(result);
    if (parameters.empty() || parameters.size() != result.size()) {
        MNN_ERROR("Error parameters, empty or parameter size not match \n");
        return false;
    }
    for (int i=0; i<parameters.size(); ++i) {
        if (nullptr != result[i].get()) {
            // Check Origin parameter's size
            auto dstInfo = result[i]->getInfo();
            auto srcInfo = parameters[i]->getInfo();
            if (dstInfo->dim.size() != srcInfo->dim.size() || dstInfo->order != srcInfo->order) {
                MNN_ERROR("Error parameters %d, dim size or order not match \n", i);
                return false;
            }
            if (dstInfo->size != srcInfo->size || dstInfo->type != srcInfo->type) {
                MNN_ERROR("Error parameters %d, size or type not match \n", i);
                return false;
            }
        }
        Variable::replace(result[i], parameters[i]);
    }
    return true;
}
void Module::setIsTraining(const bool isTraining) {
    mIsTraining = isTraining;
    for (auto c : mChildren) {
        c->setIsTraining(isTraining);
    }
}

bool Module::getIsTraining() {
    return mIsTraining;
}

void Module::registerModel(const std::vector<std::shared_ptr<Module>>& children) {
    mChildren.insert(mChildren.begin(), children.begin(), children.end());
}
int Module::addParameter(VARP parameter) {
    auto res = mParameters.size();
    mParameters.emplace_back(parameter);
    return (int)res;
}

void Module::setParameter(Express::VARP parameter, int index) {
    if (index < 0 || index >= mParameters.size()) {
        MNN_ERROR("Module error: index out of range: %d - %d:\n", index, (int)mParameters.size());
        return;
    }
    mParameters[index] = parameter;
}

void Module::_collectParameters(std::vector<Express::VARP>& result) const {
    for (auto p : mParameters) {
        result.push_back(p);
    }
    for (auto c : mChildren) {
        c->_collectParameters(result);
    }
}
void Module::clearCache() {
    for (auto c : mChildren) {
        c->clearCache();
    }
    this->onClearCache();
}

Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, bool dynamic) {
    AutoStorage<uint8_t> buffer;
    {
        FileLoader loader(fileName);
        if (!loader.valid()) {
            MNN_ERROR("Error for open %s\n", fileName);
            return {};
        }
        loader.read();
        if (!loader.valid()) {
            return {};
        }
        loader.merge(buffer);
        if (buffer.get() == nullptr) {
            return {};
        }
    }
    return load(inputs, outputs, buffer.get(), buffer.size(), dynamic);
}

Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic) {
    return PipelineModule::load(inputs, outputs, buffer, length, dynamic);
}

EXPRP Module::CloneContext::getOrClone(EXPRP expr) {
    auto it = mExprMap.find(expr.get());
    if (it == mExprMap.end()) {
        // EXPRP replica = expr->clone(shareParams);
        // TODO(hjchen2): Clone expr.
        EXPRP replica = expr;
        it = mExprMap.emplace(expr.get(), replica).first;
    }
    return it->second;
}

VARP Module::CloneContext::getOrClone(VARP var) {
    auto it = mVarMap.find(var.get());
    if (it != mVarMap.end()) {
        // TODO(hjchen2): Clone variable.
        VARP replica = var;
        it = mVarMap.emplace(var.get(), replica).first;
    }
    return it->second;
}

Module* Module::clone(const Module* module, const bool shareParams) {
    CloneContext context(shareParams);
    return module->clone(&context);
}

Module* Module::cloneBaseTo(CloneContext* ctx, Module* module) const {
    for (const Express::VARP& var : mParameters) {
        module->mParameters.push_back(ctx->getOrClone(var));
    }
    module->mIsTraining = mIsTraining;
    module->mName = mName;
    module->mType = mType;
    return module;
}

} // namespace Express
} // namespace MNN
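A hedged end-to-end sketch of the entry points defined above; the file name, tensor names, input shape and the shareParams choice are placeholders:

    using namespace MNN::Express;
    // Load a module mapping "input" to "output" from an .mnn file.
    std::shared_ptr<Module> net(Module::load({"input"}, {"output"}, "model.mnn", /*dynamic*/ false));
    net->setIsTraining(false);
    auto y = net->forward(_Input({1, 3, 224, 224}, NC4HW4));
    // Per the TODOs above, clone currently shares expressions/variables between replicas.
    std::unique_ptr<Module> replica(Module::clone(net.get(), /*shareParams*/ true));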
@@ -6,9 +6,11 @@
 // Copyright © 2018, Alibaba Group Holding Limited
 //

-#include "NN.hpp"
+#include <MNN/expr/NN.hpp>
 #include "Distributions.hpp"
 #include "FixModule.hpp"
+#include "WhileModule.hpp"
+#include "IfModule.hpp"
 #include "Initializer.hpp"
 #include "MNN_generated.h"
 #include "RandomGenerator.hpp"

@@ -17,7 +19,7 @@

 using namespace MNN::Express;
 namespace MNN {
-namespace Train {
+namespace Express {
 static VARP _activate(VARP x, NN::ActivationFunctionType type) {
     switch (type) {
         case NN::None:

@@ -58,6 +60,14 @@ public:
     }

 private:
+    DropoutModule() = default;
+
+    Module* clone(CloneContext* ctx) const override {
+        DropoutModule* module(new DropoutModule);
+        module->mDropRatio = mDropRatio;
+        return this->cloneBaseTo(ctx, module);
+    }
+
     float mDropRatio;
 };

@@ -80,8 +90,8 @@ public:
         mRunningVariance = _Const(bnPa->varData()->data(), {1, mChannels, 1, 1}, NCHW);
         addParameter(mScale);
         addParameter(mBias);
-        addParameter(mRunningVariance);
-        addParameter(mRunningMean);
+        mRunningVariancePos = addParameter(mRunningVariance);
+        mRunningMeanPos = addParameter(mRunningMean);
         mReductionDims = {0, 2, 3};
         setType("BatchNorm");
     }

@@ -110,8 +120,8 @@ public:

         addParameter(mScale);
         addParameter(mBias);
-        addParameter(mRunningVariance);
-        addParameter(mRunningMean);
+        mRunningVariancePos = addParameter(mRunningVariance);
+        mRunningMeanPos = addParameter(mRunningMean);
         setType("BatchNorm");
     }

@@ -156,9 +166,8 @@ public:
             mRunningVariance = _Const(mMomentum) * mRunningVariance + _Const(1 - mMomentum) * sampleVar;
             outputData->setName(name());
             outputData = _Convert(outputData, dimFormat);
-            Variable::prepareCompute({inputs[0], outputData, mRunningMean, mRunningVariance});
-            mRunningMean.fix(Express::VARP::CONSTANT);
-            mRunningVariance.fix(Express::VARP::CONSTANT);
+            setParameter(mRunningMean, mRunningMeanPos);
+            setParameter(mRunningVariance, mRunningVariancePos);
             return {outputData};
         }
         auto rStd = _Const(1.0f) / _Sqrt(mRunningVariance + _Const(mEps));

@@ -180,12 +189,31 @@ public:
     }

 private:
+    BatchNormModule() = default;
+
+    Module* clone(CloneContext* ctx) const override {
+        BatchNormModule* module(new BatchNormModule);
+        module->mMomentum = mMomentum;
+        module->mEps = mEps;
+        module->mScale = ctx->getOrClone(mScale);
+        module->mBias = ctx->getOrClone(mBias);
+        module->mRunningMean = ctx->getOrClone(mRunningMean);
+        module->mRunningVariance = ctx->getOrClone(mRunningVariance);
+        module->mRunningMeanPos = mRunningMeanPos;
+        module->mRunningVariancePos = mRunningVariancePos;
+        module->mChannels = mChannels;
+        module->mReductionDims = mReductionDims;
+        return this->cloneBaseTo(ctx, module);
+    }
+
     float mMomentum = 0.99;
     float mEps = 1e-5;
     VARP mScale = nullptr;
     VARP mBias = nullptr;
     VARP mRunningMean = nullptr;
     VARP mRunningVariance = nullptr;
+    int mRunningMeanPos = -1;
+    int mRunningVariancePos = -1;
     int mChannels;
     std::vector<int> mReductionDims;
 };

@@ -246,7 +274,18 @@ public:
         tempOutput->setName(name());
         return {tempOutput};
     }

 private:
+    ConvModule() = default;
+
+    Module* clone(CloneContext* ctx) const override {
+        ConvModule* module(new ConvModule);
+        module->mParameter = mParameter;
+        module->mParameter.weight = ctx->getOrClone(mParameter.weight);
+        module->mParameter.bias = ctx->getOrClone(mParameter.bias);
+        return this->cloneBaseTo(ctx, module);
+    }
+
     NN::ConvParameters mParameter;
 };
 static std::tuple<VARP, VARP, int> _initParameters(const NN::ConvOption& option, bool hasBias,

@@ -533,7 +572,23 @@ public:
     }

 private:
-    const NN::ConvOption mOption;
+    ConvOctaveModule() = default;
+
+    Module* clone(CloneContext* ctx) const override {
+        ConvOctaveModule* module(new ConvOctaveModule);
+        module->mOption = mOption;
+        module->mLLW = ctx->getOrClone(mLLW);
+        module->mLHW = ctx->getOrClone(mLHW);
+        module->mHLW = ctx->getOrClone(mHLW);
+        module->mHHW = ctx->getOrClone(mHHW);
+        module->mLBias = ctx->getOrClone(mLBias);
+        module->mHBias = ctx->getOrClone(mHBias);
+        module->mSplitInput = mSplitInput;
+        module->mGroup = mGroup;
+        return this->cloneBaseTo(ctx, module);
+    }
+
+    NN::ConvOption mOption;
     VARP mLLW;
     VARP mLHW;
     VARP mHLW;

@@ -555,7 +610,7 @@ Module* NN::ConvOctave(const ConvParameters& parameters,
     module->setName(parameters.name);
     return module;
 }
-Module* NN::Utils::ExtractNotRunableOp(Express::EXPRP expr) {
+Module* NN::Utils::ExtractNotRunableOp(Express::EXPRP expr, const std::map<std::string, SubGraph>& subgraphs) {
     if (nullptr == expr->get()) {
         return nullptr;
     }

@@ -565,6 +620,12 @@ Module* NN::Utils::ExtractNotRunableOp(Express::EXPRP expr) {
     if (expr->get()->type() == OpType_Dropout) {
         return new DropoutModule(0.3f);
     }
+    if (expr->get()->type() == OpType_While) {
+        return WhileModule::create(expr->get(), subgraphs);
+    }
+    if (expr->get()->type() == OpType_If) {
+        return IfModule::create(expr->get(), subgraphs);
+    }
     return nullptr;
 }

@@ -622,6 +683,9 @@ public:
         mLimitScale = _Scalar<float>(1.0f / limit);
         mClampValue = _Scalar<float>(limit);
+
+        mInputScalePos = addParameter(mInputScale);
+        mOutputScalePos = addParameter(mOutputScale);
         setType("ConvBNReluFused");
     }

@@ -632,31 +696,16 @@ public:
             tempX = _Convert(tempX, NCHW);
         }
         auto originX = tempX;
-        VARP scale;
-        if (mFeatureScaleStatMethod == NN::PerTensor) {
-            scale = _Maximum(_ReduceMax(_Abs(tempX)), _Scalar<float>(0.0001f)) * mLimitScale;
-        } else {
-            auto originSize = originX->getInfo()->size;
-            auto batch = originX->getInfo()->dim[0];
-            auto channel = originX->getInfo()->dim[1];
-            if (originSize / batch / channel < 10) {
-                // Too small data
-                //MNN_PRINT("%d - %d - %d\n", originSize, batch, channel);
-                std::vector<int> dims = {1, channel, 1, 1};
-                auto dimVar = _Const(dims.data(), {4}, NCHW, halide_type_of<int32_t>());
-                auto singleScale = _Maximum(_ReduceMax(_Abs(tempX)), _Scalar<float>(0.0001f)) * mLimitScale;
-                scale = _Fill(dimVar, singleScale);
-            } else {
-                //MNN_PRINT("%d - %d - %d\n", originSize, batch, channel);
-                scale = _Maximum(_ReduceMax(_Abs(tempX), {0, 2, 3}, true), _Scalar<float>(0.0001f)) * mLimitScale;
-            }
-        }
-        scale.fix(VARP::CONSTANT);
+        VARP scale = _Maximum(_ReduceMax(_Abs(tempX)), _Scalar<float>(0.0001f)) * mLimitScale;
         if (useScale == nullptr) {
             tempX = _Round(tempX * _Reciprocal(scale)) * scale;
         } else {
             tempX = _Round(tempX * _Reciprocal(useScale)) * useScale;
         }
+        // Break the grad by use cast
+        tempX = _Cast<float>(tempX);
+
+        // Move grad from tempX to originX
         tempX = _Convert(tempX + _ZeroGrad(originX), originFormat);
         return std::make_pair(tempX, scale);
     }

@@ -684,18 +733,16 @@ public:
     virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override {
         VARP res;
         if (getIsTraining()) {
-            Variable::prepareCompute({inputs[0]});
             auto x = _Convert(inputs[0], NCHW);
             // simulate weight quant
             auto weightScale = _Maximum(_ReduceMax(_Abs(mWeight), {1, 2, 3}, true), _Scalar<float>(1E-6)) * mLimitScale;
-            weightScale.fix(VARP::CONSTANT);
             auto weightTemp = _Round(mWeight * _Reciprocal(weightScale)) * weightScale;
             weightTemp = weightTemp + _ZeroGrad(mWeight);

             // simulate input quant to get original input scale
             auto inputPair = fakeQuantFeature(x);
             mInputScale = updateScale(mInputScale, inputPair.second);
-            mInputScale.fix(VARP::CONSTANT);
+            setParameter(mInputScale, mInputScalePos);

             // simulate output quant to get original output scale
             res = _Conv(weightTemp, mBias, _Convert(inputPair.first, NC4HW4), mOption.padMode, mOption.stride,

@@ -709,10 +756,9 @@ public:

             res = _activate(res, mActivation);

-            Variable::prepareCompute({conv, res});
             auto outputPair = fakeQuantFeature(res);
             mOutputScale = updateScale(mOutputScale, outputPair.second);
-            mOutputScale.fix(VARP::CONSTANT);
+            setParameter(mOutputScale, mOutputScalePos);
             res = outputPair.first;
         } else {
             if (nullptr == mInputScale) {

@@ -725,6 +771,7 @@ public:
                 auto x = _Convert(inputs[0], NCHW);
                 auto inputPair = fakeQuantFeature(x);
                 mInputScale = inputPair.second;
+                setParameter(mInputScale, mInputScalePos);
                 inputPair.first.fix(VARP::CONSTANT);

                 auto simuRes = _Conv(weightTemp, mBias, _Convert(inputPair.first, NC4HW4), mOption.padMode, mOption.stride,

@@ -737,6 +784,7 @@ public:
                 Variable::prepareCompute({simuRes});
                 auto outputPair = fakeQuantFeature(simuRes);
                 mOutputScale = outputPair.second;
+                setParameter(mOutputScale, mOutputScalePos);
                 outputPair.first.fix(VARP::CONSTANT);
             }

@@ -772,12 +820,7 @@ public:
             {
                 std::vector<int> dims = {x->getInfo()->dim[1]};
                 auto dimVar = _Const(dims.data(), {1}, NCHW, halide_type_of<int32_t>());
-                VARP channelScale;
-                if (mFeatureScaleStatMethod == NN::PerTensor) {
-                    channelScale = _Reciprocal(_Fill(dimVar, mInputScale));
-                } else {
-                    channelScale = _Reciprocal(mInputScale);
-                }
+                VARP channelScale = _Reciprocal(_Fill(dimVar, mInputScale));
                 x = _FloatToInt8(x, channelScale, -127, 127);// TODO add clamp
             }

@@ -824,12 +867,7 @@ public:
             {
                 std::vector<int> dims = {res->getInfo()->dim[1]};
                 auto dimVar = _Const(dims.data(), {1}, NCHW, halide_type_of<int32_t>());
-                VARP channelScale;
-                if (mFeatureScaleStatMethod == NN::PerTensor) {
-                    channelScale = _Fill(dimVar, mOutputScale);
-                } else {
-                    channelScale = mOutputScale;
-                }
+                VARP channelScale = _Fill(dimVar, mOutputScale);
                 res = _Int8ToFloat(res, channelScale);
             }
         }

@@ -838,6 +876,34 @@ public:
     }

 private:
+    ConvBNReluFusedModule() = default;
+
+    Module* clone(CloneContext* ctx) const override {
+        ConvBNReluFusedModule* module(new ConvBNReluFusedModule);
+        module->mConvParameter = mConvParameter;
+        module->mConvParameter.weight = ctx->getOrClone(mConvParameter.weight);
+        module->mConvParameter.bias = ctx->getOrClone(mConvParameter.bias);
+        module->mOption = mOption;
+        module->mGroup = mGroup;
+        module->mWeight = ctx->getOrClone(mWeight);
+        module->mBias = ctx->getOrClone(mBias);
+        module->mActivation = mActivation;
+        module->mLimitScale = ctx->getOrClone(mLimitScale);
+        module->mInputScalePos = mInputScalePos;
+        module->mOutputScalePos = mOutputScalePos;
+        module->mInputScale = ctx->getOrClone(mInputScale);
+        module->mOutputScale = ctx->getOrClone(mOutputScale);
+        module->mClampValue = ctx->getOrClone(mClampValue);
+        module->mMomentum = mMomentum;
+        module->mFeatureScaleStatMethod = mFeatureScaleStatMethod;
+        module->mScaleUpdateMethod = mScaleUpdateMethod;
+        if (mBatchNorm) {
+            module->mBatchNorm.reset(mBatchNorm->clone(ctx));
+            module->registerModel({module->mBatchNorm});
+        }
+        return this->cloneBaseTo(ctx, module);
+    }
+
     NN::ConvParameters mConvParameter;
     NN::ConvOption mOption;
     int mGroup;

@@ -846,6 +912,8 @@ private:
     NN::ActivationFunctionType mActivation = NN::ActivationFunctionType::None;
     std::shared_ptr<Module> mBatchNorm = nullptr;
     VARP mLimitScale;
+    int mInputScalePos = -1;
+    int mOutputScalePos = -1;
     VARP mInputScale = nullptr;
     VARP mOutputScale = nullptr;
     VARP mClampValue;

@@ -870,5 +938,5 @@ Module* NN::ConvInt8(const ConvParameters& para, int bits, NN::FeatureScaleStatM
     return new ConvBNReluFusedModule({conv}, featureMethod, method, bits);
 }

-} // namespace Train
+} // namespace Express
 } // namespace MNN
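The fake quantization used throughout ConvBNReluFusedModule above reduces, per tensor, to a round-trip through the quantization grid. A scalar sketch of the same arithmetic (the bit width and the 0.0001 floor mirror the code; everything else is illustrative):

    #include <algorithm>
    #include <cmath>
    float fakeQuant(float x, float maxAbs, int bits = 8) {
        float limit = float((1 << (bits - 1)) - 1);       // 127 for 8 bits
        float scale = std::max(maxAbs, 0.0001f) / limit;  // matches mLimitScale = 1 / limit
        return std::round(x / scale) * scale;             // quantize, then dequantize
    }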
@@ -0,0 +1,761 @@
//
//  PipelineModule.cpp
//  MNN
//
//  Created by MNN on 2020/01/09.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#include "PipelineModule.hpp"
#include "MNN_generated.h"
#include <set>
#include <vector>
#include "StaticModule.hpp"
#include "IfModule.hpp"
#include "WhileModule.hpp"
using namespace MNN::Express;
namespace MNN {
namespace Express {
//#define DYNAMIC
#define PIPELINE_MODULE "_pipeline_module__"
class ExprModule : public Module {
public:
    ExprModule(EXPRP expr) {
        mExpr = expr;
        setName(expr->name());
        mInputs = expr->inputs();
        auto op = mExpr->get();
        if (op) {
            auto typeName = EnumNameOpType(op->type());
            setType(typeName);
        }
        for (int i = 0; i < mInputs.size(); ++i) {
            auto inputExpr = mInputs[i]->expr().first;
            if (inputExpr->get() != nullptr) {
                mInputs[i] = nullptr;
                mInputIndexes.emplace_back(i);
                continue;
            }
            switch (inputExpr->inputType()) {
                case VARP::INPUT:
                    mInputs[i] = nullptr;
                    mInputIndexes.emplace_back(i);
                    break;
                case VARP::CONSTANT:
                    break;
                case VARP::TRAINABLE:
                    addParameter(mInputs[i]);
                    break;
                default:
                    break;
            }
        }
    }
    virtual std::vector<VARP> onForward(const std::vector<VARP>& inputs) override {
        MNN_ASSERT(mInputIndexes.size() == inputs.size());
        if (nullptr == mExpr->get()) {
            return {Variable::create(mExpr)};
        }
        std::vector<VARP> tempInputs = mInputs;
        for (int i = 0; i < inputs.size(); ++i) {
            tempInputs[mInputIndexes[i]] = inputs[i];
        }
        std::vector<VARP> outputVars;
        auto newExpr = Expr::create(mExpr->extra(), std::move(tempInputs), mExpr->outputSize());
        newExpr->setName(mExpr->name());
        for (int i = 0; i < mExpr->outputSize(); ++i) {
            outputVars.emplace_back(Variable::create(newExpr, i));
        }
        return outputVars;
    }
    const std::vector<int>& inputIndexes() const {
        return mInputIndexes;
    }

private:
    Module* clone(CloneContext* ctx) const override {
        ExprModule* module(new ExprModule(ctx->getOrClone(mExpr)));
        for (const VARP& var : mInputs) {
            module->mInputs.push_back(ctx->getOrClone(var));
        }
        module->mInputIndexes = mInputIndexes;
        return this->cloneBaseTo(ctx, module);
    }

    EXPRP mExpr;
    std::vector<VARP> mInputs;
    std::vector<int> mInputIndexes;
};

Module* PipelineModule::extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph) {
    std::function<std::pair<std::vector<int>, std::shared_ptr<Module>>(EXPRP)> transformFunction;
    if (fortrain) {
        transformFunction =
            [&subGraph](EXPRP source) {
                if (source->get() == nullptr) {
                    return std::make_pair(std::vector<int>{}, std::shared_ptr<Module>(nullptr));
                }
                std::shared_ptr<Module> m(NN::Utils::ExtractNotRunableOp(source, subGraph));
                if (nullptr != m) {
                    m->setName(source->name());
                    return std::make_pair(std::vector<int>{}, m);
                }
                auto convExtracted = NN::Utils::ExtractConvolution(source);
                if (convExtracted.weight == nullptr) {
                    return std::make_pair(std::vector<int>{}, std::shared_ptr<Module>(nullptr));
                }
                std::shared_ptr<Module> module(NN::Conv(convExtracted));
                module->setName(source->name());
                return std::make_pair(std::vector<int>{0}, module);
            };
    } else {
        transformFunction = [&subGraph](EXPRP source) {
            if (source->get() == nullptr) {
                return std::make_pair(std::vector<int>{}, std::shared_ptr<Module>(nullptr));
            }
            std::shared_ptr<Module> m(NN::Utils::ExtractNotRunableOp(source, subGraph));
            if (nullptr != m) {
                m->setName(source->name());
                return std::make_pair(std::vector<int>{}, m);
            }
            return std::make_pair(std::vector<int>{}, std::shared_ptr<Module>(nullptr));
        };
    }
    return new PipelineModule(inputs, outputs, transformFunction);
}

PipelineModule::PipelineModule(std::vector<VARP> inputs, std::vector<VARP> outputs, const Transformer& transformFunction) {
    setType(PIPELINE_MODULE);
    std::vector<EXPRP> executeOrder;
    std::set<EXPRP> inputExpr;
    for (auto v : inputs) {
        inputExpr.insert(v->expr().first);
    }
    for (auto output : outputs) {
        Expr::visit(output->expr().first,
                    [&executeOrder, &inputExpr](EXPRP expr) {
                        if (expr->visited()) {
                            return false;
                        }
                        if (inputExpr.find(expr)!= inputExpr.end()) {
                            expr->setVisited(true);
                            executeOrder.emplace_back(expr);
                            return false;
                        }
                        return true;
                    },
                    [&executeOrder](EXPRP expr) {
                        //FUNC_PRINT_ALL(var->name().c_str(), s);
                        if (!expr->visited()) {
                            executeOrder.emplace_back(expr);
                            expr->setVisited(true);
                        }
                        return true;
                    });
    }
    for (auto expr : executeOrder) {
        expr->setVisited(false);
    }
    // Set Indexes
    std::map<EXPRP, int> indexes;
    int currentIndexes = 0;
    for (auto expr : executeOrder) {
        indexes[expr] = currentIndexes;
        currentIndexes += expr->outputSize();
    }
    std::set<EXPRP> inputSets;
    mInputIndexes.clear();
    mStackSize = currentIndexes;
    for (auto v : inputs) {
        auto inputExpr = v->expr();
        mInputIndexes.emplace_back(indexes[inputExpr.first] + inputExpr.second);
        inputSets.insert(inputExpr.first);
    }

    // Create All SubModule
    for (auto expr : executeOrder) {
        if (inputSets.find(expr) != inputSets.end()) {
            continue;
        }
        std::pair<std::vector<int>, std::shared_ptr<Module> > moduleResult;
        bool extracted = false;
        if (!transformFunction) {
            moduleResult = std::make_pair(std::vector<int>{}, std::shared_ptr<Module>(nullptr));
        } else {
            moduleResult = transformFunction(expr);
        }
        if (moduleResult.second == nullptr) {
            std::shared_ptr<Module> module(new ExprModule(expr));
            moduleResult.first = ((ExprModule*)module.get())->inputIndexes();
            moduleResult.second = module;
        } else {
            extracted = true;
        }
        auto subInputs = expr->inputs();
        auto& exprInputIndexes = moduleResult.first;
        std::vector<int> inputIndexes;
        if (exprInputIndexes.empty() && extracted) {
            inputIndexes.resize(subInputs.size());
            for (int i = 0; i < inputIndexes.size(); ++i) {
                auto inputExpr = subInputs[i]->expr();
                inputIndexes[i] = indexes[inputExpr.first] + inputExpr.second;
            }
        } else {
            inputIndexes.resize(exprInputIndexes.size());
            for (int i = 0; i < inputIndexes.size(); ++i) {
                auto inputExpr = subInputs[exprInputIndexes[i]]->expr();
                inputIndexes[i] = indexes[inputExpr.first] + inputExpr.second;
            }
        }
        std::vector<int> outputIndexes(expr->outputSize());
        for (int i = 0; i < outputIndexes.size(); ++i) {
            outputIndexes[i] = indexes[expr] + i;
        }
        mSubModules.emplace_back(std::make_tuple(moduleResult.second, inputIndexes, outputIndexes));
        registerModel({moduleResult.second});
    }
    mOutputIndexes.clear();
    for (auto output : outputs) {
        auto outputExpr = output->expr();
        mOutputIndexes.emplace_back(indexes[outputExpr.first] + outputExpr.second);
    }
}
bool PipelineModule::turnQuantize(Module* module, const int bit, NN::FeatureScaleStatMethod featureScaleStatMethod, NN::ScaleUpdateMethod scaleUpdateMethod) {
    if (nullptr == module || module->type() != PIPELINE_MODULE) {
        MNN_ERROR("Invalide module for quantized\n");
        return false;
    }
    ((PipelineModule*)module)->toTrainQuant(bit, featureScaleStatMethod, scaleUpdateMethod);
    return true;
}

std::vector<int> PipelineModule::countOutputReference(std::vector<int> outputIndices) {
    MNN_ASSERT(outputIndices.size() > 0);
    std::vector<int> countResult(outputIndices.size(), 0);

    for (int i = 0; i < mSubModules.size(); i++) {
        auto &m = mSubModules[i];
        auto& theModule = std::get<0>(m);
        auto name = theModule->name();
        auto &inputIndices = std::get<1>(m);

        for (int j = 0; j < inputIndices.size(); j++) {
            int index = inputIndices[j];
            for (int k = 0; k < countResult.size(); k++) {
                if (index == outputIndices[k]) {
                    countResult[k]++;
                }
            }
        }
    }

    return countResult;
}

void PipelineModule::toTrainQuant(const int bits, NN::FeatureScaleStatMethod featureScaleStatMethod,
                                  NN::ScaleUpdateMethod scaleUpdateMethod) {
    std::vector<int> needEraseIndices;

    for (int i = 0; i < mSubModules.size(); i++) {
        auto& m = mSubModules[i];
        auto& theModule = std::get<0>(m);
        auto moduleType = theModule->type();
        //auto& inputIndices = std::get<1>(m);
        auto& outputIndices = std::get<2>(m);

        if (moduleType == "Conv" && i < mSubModules.size() - 1) {
            auto& p1 = mSubModules[i+1];
            auto p1Module = std::get<0>(p1);
            auto& p1ModuleType = p1Module->type();
            auto& p1InputIndices = std::get<1>(p1);
            auto& p1OutputIndices = std::get<2>(p1);

            auto convOutputCount = countOutputReference(outputIndices);
            bool convSingleOutputReference = ((outputIndices.size() == 1) && (convOutputCount[0] == 1));

            // only conv
            if ((!convSingleOutputReference) || (p1ModuleType == "Conv") ||
                (p1ModuleType != "BatchNorm" && p1ModuleType != "ReLU" && p1ModuleType != "ReLU6")) {
                theModule.reset(NN::ConvBNReluFused({theModule}, featureScaleStatMethod, scaleUpdateMethod, bits));
                registerModel({theModule});
                continue;
            }
            // conv + bn + ?
            if (p1ModuleType == "BatchNorm") {
                bool convBnConnected = ((convSingleOutputReference) && (p1InputIndices.size() == 1) && (p1InputIndices[0] == outputIndices[0]));
                if (!convBnConnected) {
                    theModule.reset(NN::ConvBNReluFused({theModule}, featureScaleStatMethod, scaleUpdateMethod, bits));
                    registerModel({theModule});
                    continue;
                }

                // last conv + bn
                if (i == mSubModules.size() - 2) {
                    theModule.reset(NN::ConvBNReluFused({theModule, p1Module}, featureScaleStatMethod, scaleUpdateMethod, bits));
                    registerModel({theModule});
                    outputIndices = p1OutputIndices;
                    needEraseIndices.emplace_back(i + 1);
                    continue;
                }
                // maybe there is a relu or relu6 after conv + bn
                auto& p2 = mSubModules[i+2];
                auto& p2Module = std::get<0>(p2);
                auto p2ModuleType = p2Module->type();
                auto& p2InputIndices = std::get<1>(p2);
                auto& p2OutputIndices = std::get<2>(p2);

                auto bnOutputCount = countOutputReference(p1OutputIndices);
                bool bnSingleOutputReference = ((p1OutputIndices.size() == 1) && (bnOutputCount[0] == 1));

                // only conv + bn
                if ((!bnSingleOutputReference) || (p2ModuleType != "ReLU" && p2ModuleType != "ReLU6")) {
                    theModule.reset(NN::ConvBNReluFused({theModule, p1Module}, featureScaleStatMethod, scaleUpdateMethod, bits));
                    registerModel({theModule});
                    outputIndices = p1OutputIndices;
                    needEraseIndices.emplace_back(i + 1);
                    continue;
                } else { // conv + bn + relu or conv + bn + relu6
                    bool convBnReluConnected = ((bnSingleOutputReference) && (p2InputIndices.size() == 1) && (p2InputIndices[0] == p1OutputIndices[0]));
                    if (!convBnReluConnected) {
                        theModule.reset(NN::ConvBNReluFused({theModule, p1Module}, featureScaleStatMethod, scaleUpdateMethod, bits));
                        registerModel({theModule});
                        outputIndices = p1OutputIndices;
                        needEraseIndices.emplace_back(i + 1);
                        continue;
                    }

                    theModule.reset(NN::ConvBNReluFused({theModule, p1Module, p2Module}, featureScaleStatMethod, scaleUpdateMethod, bits));
                    registerModel({theModule});
                    outputIndices = p2OutputIndices;
                    needEraseIndices.emplace_back(i + 1);
                    needEraseIndices.emplace_back(i + 2);
                    continue;
                }
            }
            // conv + relu or conv + relu6
            if (p1ModuleType == "ReLU" || p1ModuleType == "ReLU6") {
                bool convReluConnected = ((convSingleOutputReference) && (p1InputIndices.size() == 1) && (p1InputIndices[0] == outputIndices[0]));
                if (!convReluConnected) {
                    theModule.reset(NN::ConvBNReluFused({theModule}, featureScaleStatMethod, scaleUpdateMethod, bits));
                    registerModel({theModule});
                    continue;
                }

                theModule.reset(NN::ConvBNReluFused({theModule, p1Module}, featureScaleStatMethod, scaleUpdateMethod, bits));
                registerModel({theModule});
                outputIndices = p1OutputIndices;
                needEraseIndices.emplace_back(i + 1);
                continue;
            }
        }

        if (i == mSubModules.size() - 1 && moduleType == "Conv") {
            theModule.reset(NN::ConvBNReluFused({theModule}, featureScaleStatMethod, scaleUpdateMethod, bits));
            registerModel({theModule});
        }
    }

    // erase useless submodules
    const int eraseSize = needEraseIndices.size();
    int alreadyErasedCount = 0;
    for (int i = 0; i < eraseSize; i++) {
        auto position = needEraseIndices[i] - alreadyErasedCount;
        auto type = std::get<0>(mSubModules[position])->type();
        MNN_ASSERT(type == "BatchNorm" || type == "ReLU" || type == "ReLU6");
        mSubModules.erase(mSubModules.begin() + position);
        alreadyErasedCount++;
    }
}

std::vector<VARP> PipelineModule::onForward(const std::vector<VARP>& inputs) {
    std::vector<VARP> mStack(mStackSize);
    for (int i = 0; i < mInputIndexes.size(); ++i) {
        mStack[mInputIndexes[i]] = inputs[i];
    }
    for (int index = 0; index < mSubModules.size(); ++index) {
        auto& m = mSubModules[index];
        std::vector<VARP> tempInputs(std::get<1>(m).size());
        for (int i = 0; i < tempInputs.size(); ++i) {
            tempInputs[i] = mStack[std::get<1>(m)[i]];
            MNN_ASSERT(nullptr != tempInputs[i]);
        }
        std::vector<VARP> tempOutputs = std::get<0>(m)->onForward(tempInputs);
        MNN_ASSERT(tempOutputs.size() == std::get<2>(m).size());
        for (int i = 0; i < tempOutputs.size(); ++i) {
            mStack[std::get<2>(m)[i]] = tempOutputs[i];
            MNN_ASSERT(nullptr != tempOutputs[i]);
        }
    }
    std::vector<VARP> outputs(mOutputIndexes.size());
    for (int i = 0; i < mOutputIndexes.size(); ++i) {
        outputs[i] = mStack[mOutputIndexes[i]];
    }
    return outputs;
}
void PipelineModule::onClearCache() {
    // Do nothing
}

static std::map<std::string, SubGraph> _createSubGraph(const MNN::Net* net, bool dynamic) {
    std::map<std::string, SubGraph> subGraphMap;
    auto subGraphs = net->subgraphs();
    if (nullptr == subGraphs) {
        return subGraphMap;
    }
    for (int i=0; i<subGraphs->size(); ++i) {
        auto graph = subGraphs->GetAs<SubGraphProto>(i);
        std::vector<std::string> subInputs;
        std::vector<std::string> subOutputs;
        if (nullptr != graph->inputs()) {
            for (int v=0; v<graph->inputs()->size(); ++v) {
                auto index = graph->inputs()->data()[v];
                subInputs.emplace_back(graph->tensors()->GetAsString(index)->str());
            }
        }
        for (int v=0; v<graph->outputs()->size(); ++v) {
            auto index = graph->outputs()->data()[v];
            subOutputs.emplace_back(graph->tensors()->GetAsString(index)->str());
        }
        // Pack to Net for loading
        std::shared_ptr<Module> submodule;
        {
            std::unique_ptr<SubGraphProtoT> _tempInfo(graph->UnPack());
            std::unique_ptr<NetT> _tempNet(new NetT);
            _tempNet->oplists = std::move(_tempInfo->nodes);
            _tempNet->tensorName = std::move(_tempInfo->tensors);
            flatbuffers::FlatBufferBuilder builder(1024);
            auto offset = Net::Pack(builder, _tempNet.get());
            builder.Finish(offset);
            if (dynamic) {
                submodule.reset(PipelineModule::load(subInputs, subOutputs, (const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), dynamic));
            } else {
                submodule.reset(new StaticModule((const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), subInputs, subOutputs));
            }
            if (graph->name() != nullptr) {
                submodule->setName(graph->name()->str());
            }
        }
        auto key = graph->name()->str();
        SubGraph subgraph;
        subgraph.inputs = std::move(subInputs);
        subgraph.outputs = std::move(subOutputs);
        subgraph.m = submodule;
        subGraphMap.insert(std::make_pair(key, subgraph));
    }
    return subGraphMap;
}

struct SubModuleInfo {
    std::vector<int> opList;
    std::vector<int> inputs;;
    std::vector<int> outputs;
    std::vector<uint8_t> tensorMask;
};
static std::vector<SubModuleInfo> _createSubModuleInfo(const MNN::Net* net, const std::set<int>& inputIndexes, const std::set<int>& outputIndexes) {
    std::vector<SubModuleInfo> submodule;
    SubModuleInfo current;
    std::vector<int> inputOps;

    // Seperate the graph to serveral submodule
    for (int i=0; i<net->oplists()->size(); ++i) {
        auto op = net->oplists()->GetAs<Op>(i);
        // Collect Input
        if (op->type() == OpType_Input) {
            inputOps.emplace_back(i);
            continue;
        }
        if (op->type() == OpType_If || op->type() == OpType_While) {
            if (current.opList.size() > 0) {
                // Not empty
                submodule.emplace_back(std::move(current));
            }
            SubModuleInfo controlOp;
            controlOp.opList = {i};
            submodule.emplace_back(std::move(controlOp));
            continue;
        }
        current.opList.emplace_back(i);
    }
    if (!current.opList.empty()) {
        submodule.emplace_back(std::move(current));
    }

    /**Compute All SubModule's inputs and outputs*/
    // 0: not use, 1: input, 2: output, 3: mid, 4: valid output
    for (int moduleIndex=0; moduleIndex < submodule.size(); ++moduleIndex) {
        auto& m = submodule[moduleIndex];
        if (1 == m.opList.size()) {
            // Fast way to determine
            auto op = net->oplists()->GetAs<Op>(m.opList[0]);
            if (nullptr != op->inputIndexes()) {
                m.inputs.resize(op->inputIndexes()->size());
                ::memcpy(m.inputs.data(), op->inputIndexes()->data(), m.inputs.size() * sizeof(int));
            }
            if (nullptr != op->outputIndexes()) {
                m.outputs.resize(op->outputIndexes()->size());
                ::memcpy(m.outputs.data(), op->outputIndexes()->data(), m.outputs.size() * sizeof(int));
            }
        } else {
            m.tensorMask = std::vector<uint8_t>(net->tensorName()->size(), 0);
            auto& tensorMask = m.tensorMask;
            for (auto opIndex : m.opList) {
                auto op = net->oplists()->GetAs<Op>(opIndex);
                if (nullptr != op->inputIndexes()) {
                    for (int v=0; v<op->inputIndexes()->size(); ++v) {
                        auto index = op->inputIndexes()->data()[v];
                        tensorMask[index] = tensorMask[index] | 1;
                    }
                }
                if (nullptr != op->outputIndexes()) {
                    for (int v=0; v<op->outputIndexes()->size(); ++v) {
                        auto index = op->outputIndexes()->data()[v];
                        tensorMask[index] = tensorMask[index] | 2;
                    }
                }
            }
|
for (int i=0; i<tensorMask.size(); ++i) {
|
||||||
|
if (0 == tensorMask[i]) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (1 == tensorMask[i]) {
|
||||||
|
m.inputs.emplace_back(i);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (2 == tensorMask[i]) {
|
||||||
|
m.outputs.emplace_back(i);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (3 == tensorMask[i]) {
|
||||||
|
if (outputIndexes.find(i) != outputIndexes.end()) {
|
||||||
|
m.outputs.emplace_back(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Check if the module's input is valid
|
||||||
|
for (int i=0; i<m.inputs.size(); ++i) {
|
||||||
|
auto index = m.inputs[i];
|
||||||
|
if (inputIndexes.find(index) != inputIndexes.end()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
bool find = false;
|
||||||
|
for (int sub=0; sub < moduleIndex; ++sub) {
|
||||||
|
for (auto out : submodule[sub].outputs) {
|
||||||
|
if (out == index) {
|
||||||
|
find = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (find) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (find) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Find from module
|
||||||
|
for (int sub=0; sub < moduleIndex; ++sub) {
|
||||||
|
if (submodule[sub].tensorMask.empty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (submodule[sub].tensorMask[index] == 2) {
|
||||||
|
find = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (submodule[sub].tensorMask[index] == 3) {
|
||||||
|
submodule[sub].outputs.emplace_back(index);
|
||||||
|
submodule[sub].tensorMask[index] = 2;
|
||||||
|
find = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
MNN_ASSERT(find);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto& m : submodule) {
|
||||||
|
m.tensorMask.clear();
|
||||||
|
}
|
||||||
|
return submodule;
|
||||||
|
}
|
||||||
|
|
||||||
|
static Module* _createSubModule(const MNN::Net* net, const SubModuleInfo& info, const std::map<std::string, SubGraph>& subs) {
|
||||||
|
if (1 == info.opList.size()) {
|
||||||
|
auto op = net->oplists()->GetAs<Op>(info.opList[0]);
|
||||||
|
if (OpType_If == op->type()) {
|
||||||
|
return IfModule::create(op, subs);
|
||||||
|
}
|
||||||
|
if (OpType_While == op->type()) {
|
||||||
|
return WhileModule::create(op, subs);
|
||||||
|
}
|
||||||
|
MNN_ASSERT(false);
|
||||||
|
}
|
||||||
|
std::unique_ptr<NetT> _tempNet(new NetT);
|
||||||
|
// Copy Tensor Name
|
||||||
|
_tempNet->tensorName.resize(net->tensorName()->size());
|
||||||
|
for (int i=0; i<net->tensorName()->size(); ++i) {
|
||||||
|
_tempNet->tensorName[i] = net->tensorName()->GetAsString(i)->str();
|
||||||
|
}
|
||||||
|
// Create Input node
|
||||||
|
std::vector<std::string> inputNames;
|
||||||
|
for (auto index : info.inputs) {
|
||||||
|
std::unique_ptr<OpT> inputOp(new OpT);
|
||||||
|
inputOp->outputIndexes = {index};
|
||||||
|
inputOp->type = OpType_Input;
|
||||||
|
inputOp->main.type = OpParameter_Input;
|
||||||
|
inputOp->main.value = new InputT;
|
||||||
|
inputOp->main.AsInput()->dims = {0, 0, -1, -1};
|
||||||
|
_tempNet->oplists.emplace_back(std::move(inputOp));
|
||||||
|
inputNames.emplace_back(_tempNet->tensorName[index]);
|
||||||
|
}
|
||||||
|
// Create compute node
|
||||||
|
for (auto opIndex : info.opList) {
|
||||||
|
std::unique_ptr<OpT> op(net->oplists()->GetAs<Op>(opIndex)->UnPack());
|
||||||
|
_tempNet->oplists.emplace_back(std::move(op));
|
||||||
|
}
|
||||||
|
// Get output names
|
||||||
|
std::vector<std::string> outputNames;
|
||||||
|
for (auto index : info.outputs) {
|
||||||
|
outputNames.emplace_back(_tempNet->tensorName[index]);
|
||||||
|
}
|
||||||
|
// Create Net Buffer
|
||||||
|
flatbuffers::FlatBufferBuilder builder(1024);
|
||||||
|
auto offset = Net::Pack(builder, _tempNet.get());
|
||||||
|
builder.Finish(offset);
|
||||||
|
_tempNet.reset();
|
||||||
|
return new StaticModule((const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), inputNames, outputNames);
|
||||||
|
}
|
||||||
|
|
||||||
|
Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic) {
|
||||||
|
// Create Subgraph
|
||||||
|
auto net = GetNet(buffer);
|
||||||
|
auto subGraphs = net->subgraphs();
|
||||||
|
if (nullptr == net->oplists() || nullptr == net->tensorName()) {
|
||||||
|
MNN_ERROR("Invalid net, for null oplist or tensorName\n");
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
if (!dynamic) {
|
||||||
|
if (nullptr == subGraphs) {
|
||||||
|
// Has no control flow, can just use static module
|
||||||
|
return new StaticModule(buffer, length, inputs, outputs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
auto subGraphMap = _createSubGraph(net, dynamic);
|
||||||
|
if (dynamic) {
|
||||||
|
// For dynamic mode
|
||||||
|
auto varMaps = Variable::loadMap(buffer, length);
|
||||||
|
std::vector<VARP> inputVars(inputs.size());
|
||||||
|
for (int i=0; i<inputs.size(); ++i) {
|
||||||
|
inputVars[i] = varMaps[inputs[i]];
|
||||||
|
}
|
||||||
|
std::vector<VARP> outputVars(outputs.size());
|
||||||
|
for (int i=0; i<outputs.size(); ++i) {
|
||||||
|
outputVars[i] = varMaps[outputs[i]];
|
||||||
|
}
|
||||||
|
return extract(inputVars, outputVars, false, subGraphMap);
|
||||||
|
}
|
||||||
|
std::set<int> inputIndexes;
|
||||||
|
std::set<int> outputIndexes;
|
||||||
|
std::map<std::string, int> inputsMap;
|
||||||
|
std::map<std::string, int> outputsMap;
|
||||||
|
for (int i=0; i<net->tensorName()->size(); ++i) {
|
||||||
|
auto tname = net->tensorName()->GetAsString(i)->str();
|
||||||
|
for (auto& s : inputs) {
|
||||||
|
if (tname == s) {
|
||||||
|
inputIndexes.emplace(i);
|
||||||
|
inputsMap.insert(std::make_pair(s, i));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto& s : outputs) {
|
||||||
|
if (tname == s) {
|
||||||
|
outputIndexes.emplace(i);
|
||||||
|
outputsMap.insert(std::make_pair(s, i));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::vector<int> inputIndexesVec(inputs.size());
|
||||||
|
for (int i=0; i<inputs.size(); ++i) {
|
||||||
|
inputIndexesVec[i] = inputsMap[inputs[i]];
|
||||||
|
}
|
||||||
|
std::vector<int> outputIndexesVec(outputs.size());
|
||||||
|
for (int i=0; i<outputs.size(); ++i) {
|
||||||
|
outputIndexesVec[i] = outputsMap[outputs[i]];
|
||||||
|
}
|
||||||
|
|
||||||
|
auto subModulesInfo = _createSubModuleInfo(net, inputIndexes, outputIndexes);
|
||||||
|
std::vector<std::shared_ptr<Module>> subModules(subModulesInfo.size());
|
||||||
|
for (int i=0; i<subModulesInfo.size(); ++i) {
|
||||||
|
subModules[i].reset(_createSubModule(net, subModulesInfo[i], subGraphMap));
|
||||||
|
}
|
||||||
|
auto result = new PipelineModule;
|
||||||
|
/**
|
||||||
|
Compute:
|
||||||
|
std::vector<std::tuple<std::shared_ptr<Module>, std::vector<int>, std::vector<int>>> mSubModules;
|
||||||
|
std::vector<int> mInputIndexes;
|
||||||
|
std::vector<int> mOutputIndexes;
|
||||||
|
int mStackSize = 0;
|
||||||
|
*/
|
||||||
|
// Make Stack, first: origin, second: new
|
||||||
|
std::map<int, int> stackMap;
|
||||||
|
int stackIndex = 0;
|
||||||
|
for (auto& m : subModulesInfo) {
|
||||||
|
for (auto index : m.inputs) {
|
||||||
|
if (stackMap.find(index) == stackMap.end()) {
|
||||||
|
stackMap.insert(std::make_pair(index, stackIndex));
|
||||||
|
stackIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto index : m.outputs) {
|
||||||
|
if (stackMap.find(index) == stackMap.end()) {
|
||||||
|
stackMap.insert(std::make_pair(index, stackIndex));
|
||||||
|
stackIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result->mStackSize = stackMap.size();
|
||||||
|
for (int i=0; i<subModulesInfo.size(); ++i) {
|
||||||
|
auto& info = subModulesInfo[i];
|
||||||
|
// Reindex stack index
|
||||||
|
std::vector<int> subInputs(info.inputs.size());
|
||||||
|
for (int i=0; i<info.inputs.size(); ++i) {
|
||||||
|
subInputs[i] = stackMap[info.inputs[i]];
|
||||||
|
}
|
||||||
|
std::vector<int> subOutputs(info.outputs.size());
|
||||||
|
for (int i=0; i<info.outputs.size(); ++i) {
|
||||||
|
subOutputs[i] = stackMap[info.outputs[i]];
|
||||||
|
}
|
||||||
|
result->mSubModules.emplace_back(std::make_tuple(subModules[i], subInputs, subOutputs));
|
||||||
|
}
|
||||||
|
for (int i=0; i<inputIndexesVec.size(); ++i) {
|
||||||
|
inputIndexesVec[i] = stackMap[inputIndexesVec[i]];
|
||||||
|
}
|
||||||
|
for (int i=0; i<outputIndexesVec.size(); ++i) {
|
||||||
|
outputIndexesVec[i] = stackMap[outputIndexesVec[i]];
|
||||||
|
}
|
||||||
|
result->mInputIndexes = std::move(inputIndexesVec);
|
||||||
|
result->mOutputIndexes = std::move(outputIndexesVec);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
Module* PipelineModule::clone(CloneContext* ctx) const {
|
||||||
|
PipelineModule* module(new PipelineModule);
|
||||||
|
for (const auto& it : mSubModules) {
|
||||||
|
const std::shared_ptr<Module>& submodule = std::get<0>(it);
|
||||||
|
const std::vector<int>& input_indices = std::get<1>(it);
|
||||||
|
const std::vector<int>& output_indices = std::get<2>(it);
|
||||||
|
std::shared_ptr<Module> replica_submodule(submodule->clone(ctx));
|
||||||
|
module->mSubModules.push_back(
|
||||||
|
std::make_tuple(replica_submodule, input_indices, output_indices));
|
||||||
|
module->registerModel({replica_submodule});
|
||||||
|
}
|
||||||
|
module->mInputIndexes = mInputIndexes;
|
||||||
|
module->mOutputIndexes = mOutputIndexes;
|
||||||
|
module->mStackSize = mStackSize;
|
||||||
|
return this->cloneBaseTo(ctx, module);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace Express
|
||||||
|
} // namespace MNN
|
|
@ -8,16 +8,20 @@
|
||||||
|
|
||||||
#ifndef PipelineModule_hpp
|
#ifndef PipelineModule_hpp
|
||||||
#define PipelineModule_hpp
|
#define PipelineModule_hpp
|
||||||
#include "Module.hpp"
|
#include <MNN/expr/Module.hpp>
|
||||||
#include "NN.hpp"
|
#include <MNN/expr/NN.hpp>
|
||||||
#include <MNN/expr/ExprCreator.hpp>
|
#include <MNN/expr/ExprCreator.hpp>
|
||||||
namespace MNN {
|
namespace MNN {
|
||||||
namespace Train {
|
namespace Express {
|
||||||
|
|
||||||
class MNN_PUBLIC PipelineModule : public Module {
|
class MNN_PUBLIC PipelineModule : public Module {
|
||||||
public:
|
public:
|
||||||
typedef std::function<std::pair<std::vector<int>, std::shared_ptr<Module>>(Express::EXPRP)> Transformer;
|
typedef std::function<std::pair<std::vector<int>, std::shared_ptr<Module>>(Express::EXPRP)> Transformer;
|
||||||
static Module* extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain);
|
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic = false);
|
||||||
|
static Module* extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph = {});
|
||||||
|
static Module* extractOrigin(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain) {
|
||||||
|
return extract(inputs, outputs, fortrain);
|
||||||
|
}
|
||||||
static bool turnQuantize(Module* module, const int bits = 8, NN::FeatureScaleStatMethod featureScaleStatMethod = NN::PerTensor, NN::ScaleUpdateMethod scaleUpdateMethod = NN::MovingAverage);
|
static bool turnQuantize(Module* module, const int bits = 8, NN::FeatureScaleStatMethod featureScaleStatMethod = NN::PerTensor, NN::ScaleUpdateMethod scaleUpdateMethod = NN::MovingAverage);
|
||||||
void toTrainQuant(const int bits = 8, NN::FeatureScaleStatMethod featureScaleStatMethod = NN::PerTensor,
|
void toTrainQuant(const int bits = 8, NN::FeatureScaleStatMethod featureScaleStatMethod = NN::PerTensor,
|
||||||
NN::ScaleUpdateMethod scaleUpdateMethod = NN::MovingAverage);
|
NN::ScaleUpdateMethod scaleUpdateMethod = NN::MovingAverage);
|
||||||
|
@ -26,14 +30,18 @@ public:
|
||||||
std::vector<int> countOutputReference(std::vector<int> outputIndices);
|
std::vector<int> countOutputReference(std::vector<int> outputIndices);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
PipelineModule(){}
|
||||||
PipelineModule(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs,
|
PipelineModule(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs,
|
||||||
const Transformer& transformFunction = {});
|
const Transformer& transformFunction = {});
|
||||||
|
|
||||||
|
Module* clone(CloneContext* ctx) const override;
|
||||||
|
|
||||||
std::vector<std::tuple<std::shared_ptr<Module>, std::vector<int>, std::vector<int>>> mSubModules;
|
std::vector<std::tuple<std::shared_ptr<Module>, std::vector<int>, std::vector<int>>> mSubModules;
|
||||||
std::vector<Express::VARP> mStack;
|
|
||||||
std::vector<int> mInputIndexes;
|
std::vector<int> mInputIndexes;
|
||||||
std::vector<int> mOutputIndexes;
|
std::vector<int> mOutputIndexes;
|
||||||
|
int mStackSize = 0;
|
||||||
};
|
};
|
||||||
} // namespace Train
|
} // namespace Express
|
||||||
} // namespace MNN
|
} // namespace MNN
|
||||||
|
|
||||||
#endif
|
#endif
|
|
@ -0,0 +1,186 @@
|
||||||
|
//
|
||||||
|
// StaticModule.cpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on b'2020/09/10'.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#include "StaticModule.hpp"
|
||||||
|
#include <MNN/expr/ExprCreator.hpp>
|
||||||
|
#include <MNN/AutoTime.hpp>
|
||||||
|
#include "core/TensorUtils.hpp"
|
||||||
|
#include "core/Session.hpp"
|
||||||
|
#include <MNN/expr/Executor.hpp>
|
||||||
|
#include <MNN/AutoTime.hpp>
|
||||||
|
#include <MNN/expr/ExecutorScope.hpp>
|
||||||
|
namespace MNN {
|
||||||
|
namespace Express {
|
||||||
|
StaticModule::StaticModule(const void* buffer, size_t length, const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, bool shapeFix) : mInputs(inputs), mOutputs(outputs) {
|
||||||
|
mShapeFix = shapeFix;
|
||||||
|
mOutputNumbers = (int)outputs.size();
|
||||||
|
/** Compute:
|
||||||
|
std::vector<int, int> mOutputFromTensor;
|
||||||
|
std::vector<int, int> mOutputFromInput;
|
||||||
|
*/
|
||||||
|
for (int i=0; i<outputs.size(); ++i) {
|
||||||
|
auto& t = outputs[i];
|
||||||
|
bool fromInput = false;
|
||||||
|
for (int j=0; j<inputs.size(); ++j) {
|
||||||
|
if (inputs[j] == t) {
|
||||||
|
fromInput = true;
|
||||||
|
mOutputFromInput.emplace_back(std::make_pair(i, j));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (fromInput) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
mOutputFromTensor.emplace_back(i);
|
||||||
|
}
|
||||||
|
if (mOutputFromTensor.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
mNet.reset(Interpreter::createFromBuffer(buffer, length));
|
||||||
|
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||||
|
mNet->setSessionMode(Interpreter::Session_Debug);
|
||||||
|
#else
|
||||||
|
mNet->setSessionMode(Interpreter::Session_Release);
|
||||||
|
#endif
|
||||||
|
if (mShapeFix) {
|
||||||
|
mNet->setSessionMode(Interpreter::Session_Input_Inside);
|
||||||
|
} else {
|
||||||
|
mNet->setSessionMode(Interpreter::Session_Input_User);
|
||||||
|
}
|
||||||
|
auto rt = Express::ExecutorScope::Current()->getRuntime();
|
||||||
|
// TODO: Add Config
|
||||||
|
ScheduleConfig config;
|
||||||
|
config.numThread = 1;
|
||||||
|
config.type = rt.first.begin()->first;
|
||||||
|
config.saveTensors = outputs;
|
||||||
|
mSession = mNet->createSession(config, rt);
|
||||||
|
mInputTensors.resize(inputs.size());
|
||||||
|
for (int i=0; i<inputs.size(); ++i) {
|
||||||
|
mInputTensors[i] = mNet->getSessionInput(mSession, inputs[i].c_str());
|
||||||
|
}
|
||||||
|
mOutputTensors.resize(mOutputFromTensor.size());
|
||||||
|
for (int i=0; i<mOutputFromTensor.size(); ++i) {
|
||||||
|
mOutputTensors[i] = mNet->getSessionOutput(mSession, outputs[mOutputFromTensor[i]].c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
StaticModule:: ~ StaticModule() {
|
||||||
|
// Do nothing
|
||||||
|
}
|
||||||
|
std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VARP>& inputs) {
|
||||||
|
AUTOTIME;
|
||||||
|
std::vector<Express::VARP> outputs(mOutputNumbers);
|
||||||
|
for (auto& iter : mOutputFromInput) {
|
||||||
|
outputs[iter.first] = inputs[iter.second];
|
||||||
|
}
|
||||||
|
if (mOutputFromTensor.empty()) {
|
||||||
|
return outputs;
|
||||||
|
}
|
||||||
|
MNN_ASSERT(inputs.size() == mInputTensors.size());
|
||||||
|
for (int i=0; i<inputs.size(); ++i) {
|
||||||
|
auto info = inputs[i]->getInfo();
|
||||||
|
mInputTensors[i]->buffer().type = info->type;
|
||||||
|
auto des = TensorUtils::getDescribe(mInputTensors[i]);
|
||||||
|
if (info->order == Express::NCHW) {
|
||||||
|
des->dimensionFormat = MNN_DATA_FORMAT_NCHW;
|
||||||
|
}
|
||||||
|
if (info->order == Express::NHWC) {
|
||||||
|
des->dimensionFormat = MNN_DATA_FORMAT_NHWC;
|
||||||
|
}
|
||||||
|
if (info->order == Express::NC4HW4) {
|
||||||
|
des->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;
|
||||||
|
}
|
||||||
|
mNet->resizeTensor(mInputTensors[i], info->dim);
|
||||||
|
}
|
||||||
|
if (!mShapeFix) {
|
||||||
|
for (int i=0; i<inputs.size(); ++i) {
|
||||||
|
mInputTensors[i]->buffer().host = (uint8_t*)inputs[i]->readMap<void>();
|
||||||
|
}
|
||||||
|
// FIXME: Use Interpreter's API
|
||||||
|
mSession->setNeedResize();
|
||||||
|
}
|
||||||
|
mNet->resizeSession(mSession);
|
||||||
|
if (mShapeFix) {
|
||||||
|
for (int i=0; i<inputs.size(); ++i) {
|
||||||
|
// For Shape only usage input, don't alloc memory
|
||||||
|
if (nullptr != mInputTensors[i]->host<void>()) {
|
||||||
|
::memcpy(mInputTensors[i]->host<void>(), inputs[i]->readMap<void>(), mInputTensors[i]->size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#ifdef MNN_EXPR_ENABLE_PROFILER
|
||||||
|
auto globalExecutor = ExecutorScope::Current();
|
||||||
|
Timer cost;
|
||||||
|
TensorCallBackWithInfo beforeCallBack = [&cost] (const std::vector<Tensor*>&, const OperatorInfo* info) {
|
||||||
|
cost.reset();
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
TensorCallBackWithInfo afterCallBack = [&cost, globalExecutor] (const std::vector<Tensor*>&, const OperatorInfo* info) {
|
||||||
|
auto costTimes = (float)cost.durationInUs() / 1000.0f;
|
||||||
|
globalExecutor->addOpCostTime(info->type(), costTimes);
|
||||||
|
globalExecutor->addOpFlops(info->type(), info->flops());
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
mNet->runSessionWithCallBackInfo(mSession, beforeCallBack, afterCallBack);
|
||||||
|
#else
|
||||||
|
mNet->runSession(mSession);
|
||||||
|
#endif
|
||||||
|
for (int i=0; i<mOutputTensors.size(); ++i) {
|
||||||
|
Express::Variable::Info info;
|
||||||
|
info.dim = mOutputTensors[i]->shape();
|
||||||
|
info.type = mOutputTensors[i]->getType();
|
||||||
|
auto format = TensorUtils::getDescribe(mOutputTensors[i])->dimensionFormat;
|
||||||
|
info.order = Express::NHWC;
|
||||||
|
if (format == MNN_DATA_FORMAT_NCHW) {
|
||||||
|
info.order = Express::NCHW;
|
||||||
|
} else if (format == MNN_DATA_FORMAT_NC4HW4) {
|
||||||
|
info.order = Express::NC4HW4;
|
||||||
|
}
|
||||||
|
outputs[mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(std::move(info), mOutputTensors[i]->host<void>(), Express::VARP::CONSTANT, true), 0);
|
||||||
|
//::memcpy(outputs[i]->writeMap<void>(), mOutputTensors[i]->host<void>(), mOutputTensors[i]->size());
|
||||||
|
}
|
||||||
|
return outputs;
|
||||||
|
}
|
||||||
|
|
||||||
|
Module* StaticModule::clone(CloneContext* ctx) const {
|
||||||
|
StaticModule* module(new StaticModule);
|
||||||
|
module->mInputs = mInputs;
|
||||||
|
module->mOutputs = mOutputs;
|
||||||
|
|
||||||
|
module->mShapeFix = mShapeFix;
|
||||||
|
module->mOutputNumbers = mOutputNumbers;
|
||||||
|
module->mOutputFromInput = mOutputFromInput;
|
||||||
|
module->mOutputFromTensor = mOutputFromTensor;
|
||||||
|
if (mOutputFromTensor.empty()) {
|
||||||
|
return this->cloneBaseTo(ctx, module);
|
||||||
|
}
|
||||||
|
|
||||||
|
module->mNet = mNet;
|
||||||
|
|
||||||
|
auto rt = Express::ExecutorScope::Current()->getRuntime();
|
||||||
|
ScheduleConfig config;
|
||||||
|
config.numThread = 1;
|
||||||
|
config.type = rt.first.begin()->first;
|
||||||
|
config.saveTensors = mOutputs;
|
||||||
|
module->mSession = module->mNet->createSession(config, rt);
|
||||||
|
|
||||||
|
module->mInputTensors.resize(mInputs.size());
|
||||||
|
module->mOutputTensors.resize(mOutputFromTensor.size());
|
||||||
|
for (int i=0; i<mInputs.size(); ++i) {
|
||||||
|
module->mInputTensors[i] =
|
||||||
|
module->mNet->getSessionInput(module->mSession, mInputs[i].c_str());
|
||||||
|
}
|
||||||
|
for (int i=0; i<mOutputFromTensor.size(); ++i) {
|
||||||
|
module->mOutputTensors[i] = module->mNet->getSessionOutput(
|
||||||
|
module->mSession, mOutputs[mOutputFromTensor[i]].c_str());
|
||||||
|
}
|
||||||
|
return this->cloneBaseTo(ctx, module);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,44 @@
|
||||||
|
//
|
||||||
|
// StaticModule.hpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on b'2020/09/10'.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#ifndef StaticModule_hpp
|
||||||
|
#define StaticModule_hpp
|
||||||
|
|
||||||
|
#include <MNN/expr/Module.hpp>
|
||||||
|
#include <MNN/Interpreter.hpp>
|
||||||
|
namespace MNN {
|
||||||
|
namespace Express {
|
||||||
|
class StaticModule : public Module {
|
||||||
|
public:
|
||||||
|
StaticModule(const void* buffer, size_t length, const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, bool shapeFix = false);
|
||||||
|
virtual ~ StaticModule();
|
||||||
|
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
StaticModule() = default;
|
||||||
|
|
||||||
|
Module* clone(CloneContext* ctx) const override;
|
||||||
|
|
||||||
|
std::vector<std::string> mInputs;
|
||||||
|
std::vector<std::string> mOutputs;
|
||||||
|
|
||||||
|
std::shared_ptr<Interpreter> mNet;
|
||||||
|
Session* mSession;
|
||||||
|
std::vector<Tensor*> mInputTensors;
|
||||||
|
std::vector<Tensor*> mOutputTensors;
|
||||||
|
bool mShapeFix;
|
||||||
|
int mOutputNumbers;
|
||||||
|
|
||||||
|
// First: outputIndex, Second: outputTensor Index
|
||||||
|
std::vector<int> mOutputFromTensor;
|
||||||
|
// First: outputIndex, Second: input var index
|
||||||
|
std::vector<std::pair<int, int>> mOutputFromInput;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
|
@ -0,0 +1,186 @@
|
||||||
|
//
|
||||||
|
// WhileModule.cpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on b'2020/09/10'.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#include "WhileModule.hpp"
|
||||||
|
#include <MNN/expr/ExprCreator.hpp>
|
||||||
|
#include "MNN_generated.h"
|
||||||
|
//#define MNN_OPEN_TIME_TRACE
|
||||||
|
#include <MNN/AutoTime.hpp>
|
||||||
|
namespace MNN {
|
||||||
|
namespace Express {
|
||||||
|
static int _findPos(const std::vector<std::string>& names, const std::string& key) {
|
||||||
|
for (int i=0; i<names.size(); ++i) {
|
||||||
|
if (names[i] == key) {
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
WhileModule* WhileModule::create(const Op* op, const std::map<std::string, SubGraph>& subGraph) {
|
||||||
|
auto module = new WhileModule;
|
||||||
|
auto whileParam = op->main_as_WhileParam();
|
||||||
|
auto& body = subGraph.find(whileParam->body_graph()->str())->second;
|
||||||
|
auto& cond = subGraph.find(whileParam->cond_graph()->str())->second;
|
||||||
|
module->mBody = body.m;
|
||||||
|
module->mCond = cond.m;
|
||||||
|
/** Compute map index
|
||||||
|
int mCondInputNumber;
|
||||||
|
int mBodyInputNumber;
|
||||||
|
|
||||||
|
// First mCondInputs' index, Second: inputs's index
|
||||||
|
std::vector<std::pair<int, int>> mInputForCond;
|
||||||
|
|
||||||
|
// First mBodyInputs' index, Second: inputs's index
|
||||||
|
std::vector<std::pair<int, int>> mInputForBody;
|
||||||
|
std::vector<int> mOutputFromBody;
|
||||||
|
std::vector<std::pair<int, int>> mUpdateForCond;
|
||||||
|
std::vector<std::pair<int, int>> mUpdateForBody;
|
||||||
|
std::vector<std::pair<int, int>> mCondUpdateForCond;
|
||||||
|
std::vector<std::pair<int, int>> mCondUpdateForBody;
|
||||||
|
*/
|
||||||
|
// Map Inputs
|
||||||
|
module->mBodyInputNumber = body.inputs.size();
|
||||||
|
module->mCondInputNumber = cond.inputs.size();
|
||||||
|
for (int i=0; i<whileParam->aliases_inputs()->size(); ++i) {
|
||||||
|
auto index = i;
|
||||||
|
auto data = whileParam->aliases_inputs()->GetAs<StringVec>(i);
|
||||||
|
for (int s=0; s<data->data()->size(); ++s) {
|
||||||
|
auto name = data->data()->GetAsString(s)->str();
|
||||||
|
auto bodyInputPos = _findPos(body.inputs, name);
|
||||||
|
if (bodyInputPos >= 0) {
|
||||||
|
module->mInputForBody.emplace_back(std::make_pair(bodyInputPos, i));
|
||||||
|
}
|
||||||
|
auto condInputPos = _findPos(cond.inputs, name);
|
||||||
|
if (condInputPos >= 0) {
|
||||||
|
module->mInputForCond.emplace_back(std::make_pair(condInputPos, i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Map update
|
||||||
|
auto update = whileParam->aliases_updates();
|
||||||
|
std::map<int, int> replaceOutputs;
|
||||||
|
for (int i=0; i<update->size(); ++i) {
|
||||||
|
auto data = update->GetAs<StringVec>(i);
|
||||||
|
int bodyInputPos = -1;
|
||||||
|
int condInputPos = -1;
|
||||||
|
int bodyOutputPos = -1;
|
||||||
|
int condOutputPos = -1;
|
||||||
|
MNN_ASSERT(2 == data->data()->size());
|
||||||
|
auto outputName = data->data()->GetAsString(0)->str();
|
||||||
|
auto inputName = data->data()->GetAsString(1)->str();
|
||||||
|
bodyInputPos = _findPos(body.inputs, inputName);
|
||||||
|
condInputPos = _findPos(cond.inputs, inputName);
|
||||||
|
bodyOutputPos = _findPos(body.outputs, outputName);
|
||||||
|
condOutputPos = _findPos(cond.outputs, outputName);
|
||||||
|
|
||||||
|
auto updateBodyOutputPos = _findPos(body.outputs, inputName);
|
||||||
|
|
||||||
|
MNN_ASSERT(bodyOutputPos == -1 || condOutputPos == -1);
|
||||||
|
if (condOutputPos >= 0) {
|
||||||
|
if (bodyInputPos >= 0) {
|
||||||
|
module->mCondUpdateForBody.emplace_back(std::make_pair(bodyInputPos, condOutputPos));
|
||||||
|
}
|
||||||
|
if (condInputPos >= 0) {
|
||||||
|
module->mCondUpdateForCond.emplace_back(std::make_pair(condInputPos, condOutputPos));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (bodyOutputPos >= 0) {
|
||||||
|
if (bodyInputPos >= 0) {
|
||||||
|
module->mUpdateForBody.emplace_back(std::make_pair(bodyInputPos, bodyOutputPos));
|
||||||
|
}
|
||||||
|
if (condInputPos >= 0) {
|
||||||
|
module->mUpdateForCond.emplace_back(std::make_pair(condInputPos, bodyOutputPos));
|
||||||
|
}
|
||||||
|
if (updateBodyOutputPos >= 0) {
|
||||||
|
replaceOutputs.insert(std::make_pair(updateBodyOutputPos, bodyOutputPos));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Map outputs
|
||||||
|
auto output = whileParam->aliases_outputs();
|
||||||
|
for (int i=0; i<output->size(); ++i) {
|
||||||
|
auto data = output->GetAsString(i);
|
||||||
|
auto pos = _findPos(body.outputs, data->str());
|
||||||
|
MNN_ASSERT(pos >= 0);
|
||||||
|
if (replaceOutputs.find(pos) != replaceOutputs.end()) {
|
||||||
|
pos = replaceOutputs[pos];
|
||||||
|
}
|
||||||
|
module->mOutputFromBody.emplace_back(pos);
|
||||||
|
}
|
||||||
|
return module;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Express::VARP> WhileModule::onForward(const std::vector<Express::VARP>& inputsI) {
|
||||||
|
std::vector<Express::VARP> condInputs(mCondInputNumber);
|
||||||
|
std::vector<Express::VARP> bodyInputs(mBodyInputNumber);
|
||||||
|
auto& inputs = inputsI;
|
||||||
|
for (auto& p : mInputForCond) {
|
||||||
|
condInputs[p.first] = inputs[p.second];
|
||||||
|
}
|
||||||
|
for (auto& p : mInputForBody) {
|
||||||
|
bodyInputs[p.first] = inputs[p.second];
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Express::VARP> outputs(mOutputFromBody.size());
|
||||||
|
while (true) {
|
||||||
|
auto res = mCond->onForward(condInputs)[0];
|
||||||
|
auto resPtr = res->readMap<int>();
|
||||||
|
if (resPtr[0] <= 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto bodyOutputs = mBody->onForward(bodyInputs);
|
||||||
|
Express::Variable::prepareCompute(bodyOutputs);
|
||||||
|
for (int i=0; i<bodyOutputs.size(); ++i) {
|
||||||
|
auto p = bodyOutputs[i];
|
||||||
|
if (p->expr().first->get() != nullptr) {
|
||||||
|
auto ptr = p->readMap<void>();
|
||||||
|
auto info = p->getInfo();
|
||||||
|
auto newV = Express::_Input(info->dim, info->order, info->type);
|
||||||
|
if (nullptr != ptr) {
|
||||||
|
::memcpy(newV->writeMap<void>(), ptr, info->type.bytes() * info->size);
|
||||||
|
}
|
||||||
|
bodyOutputs[i] = newV;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int i=0; i<mOutputFromBody.size(); ++i) {
|
||||||
|
outputs[i] = bodyOutputs[mOutputFromBody[i]];
|
||||||
|
}
|
||||||
|
for (auto& p : mUpdateForCond) {
|
||||||
|
condInputs[p.first] = bodyOutputs[p.second];
|
||||||
|
}
|
||||||
|
for (auto& p : mUpdateForBody) {
|
||||||
|
bodyInputs[p.first] = bodyOutputs[p.second];
|
||||||
|
}
|
||||||
|
for (auto& p : mCondUpdateForCond) {
|
||||||
|
condInputs[p.first] = res;
|
||||||
|
}
|
||||||
|
for (auto& p : mCondUpdateForBody) {
|
||||||
|
bodyInputs[p.first] = res;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return outputs;
|
||||||
|
}
|
||||||
|
|
||||||
|
Module* WhileModule::clone(CloneContext* ctx) const {
|
||||||
|
WhileModule* module(new WhileModule);
|
||||||
|
module->mCondInputNumber = mCondInputNumber;
|
||||||
|
module->mBodyInputNumber = mBodyInputNumber;
|
||||||
|
module->mInputForCond = mInputForCond;
|
||||||
|
module->mInputForBody = mInputForBody;
|
||||||
|
module->mOutputFromBody = mOutputFromBody;
|
||||||
|
module->mUpdateForCond = mUpdateForCond;
|
||||||
|
module->mUpdateForBody = mUpdateForBody;
|
||||||
|
module->mCondUpdateForCond = mCondUpdateForCond;
|
||||||
|
module->mCondUpdateForBody = mCondUpdateForBody;
|
||||||
|
module->mCond.reset(mCond->clone(ctx));
|
||||||
|
module->mBody.reset(mBody->clone(ctx));
|
||||||
|
return this->cloneBaseTo(ctx, module);
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
};
|
|
@ -0,0 +1,46 @@
|
||||||
|
//
|
||||||
|
// WhileModule.hpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on b'2020/09/10'.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
#ifndef WhileModule_hpp
|
||||||
|
#define WhileModule_hpp
|
||||||
|
#include <MNN/expr/Module.hpp>
|
||||||
|
namespace MNN {
|
||||||
|
namespace Express {
|
||||||
|
class WhileModule : public Module {
|
||||||
|
public:
|
||||||
|
virtual ~ WhileModule() {
|
||||||
|
// Do nothing
|
||||||
|
}
|
||||||
|
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
|
||||||
|
static WhileModule* create(const Op* op, const std::map<std::string, SubGraph>& subGraph);
|
||||||
|
|
||||||
|
private:
|
||||||
|
WhileModule(){}
|
||||||
|
|
||||||
|
Module* clone(CloneContext* ctx) const override;
|
||||||
|
|
||||||
|
int mCondInputNumber;
|
||||||
|
int mBodyInputNumber;
|
||||||
|
|
||||||
|
// First mCondInputs' index, Second: inputs's index
|
||||||
|
std::vector<std::pair<int, int>> mInputForCond;
|
||||||
|
|
||||||
|
// First mBodyInputs' index, Second: inputs's index
|
||||||
|
std::vector<std::pair<int, int>> mInputForBody;
|
||||||
|
std::vector<int> mOutputFromBody;
|
||||||
|
std::vector<std::pair<int, int>> mUpdateForCond;
|
||||||
|
std::vector<std::pair<int, int>> mUpdateForBody;
|
||||||
|
|
||||||
|
std::vector<std::pair<int, int>> mCondUpdateForCond;
|
||||||
|
std::vector<std::pair<int, int>> mCondUpdateForBody;
|
||||||
|
|
||||||
|
std::shared_ptr<Module> mCond;
|
||||||
|
std::shared_ptr<Module> mBody;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
|
@ -11,6 +11,7 @@
|
||||||
|
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <MNN/ErrorCode.hpp>
|
#include <MNN/ErrorCode.hpp>
|
||||||
#include <MNN/MNNForwardType.h>
|
#include <MNN/MNNForwardType.h>
|
||||||
|
@ -67,6 +68,7 @@ class Session;
|
||||||
struct Content;
|
struct Content;
|
||||||
class Tensor;
|
class Tensor;
|
||||||
class Backend;
|
class Backend;
|
||||||
|
class Runtime;
|
||||||
|
|
||||||
class MNN_PUBLIC OperatorInfo {
|
class MNN_PUBLIC OperatorInfo {
|
||||||
struct Info;
|
struct Info;
|
||||||
|
@ -89,6 +91,7 @@ protected:
|
||||||
|
|
||||||
typedef std::function<bool(const std::vector<Tensor*>&, const std::string& /*opName*/)> TensorCallBack;
|
typedef std::function<bool(const std::vector<Tensor*>&, const std::string& /*opName*/)> TensorCallBack;
|
||||||
typedef std::function<bool(const std::vector<Tensor*>&, const OperatorInfo*)> TensorCallBackWithInfo;
|
typedef std::function<bool(const std::vector<Tensor*>&, const OperatorInfo*)> TensorCallBackWithInfo;
|
||||||
|
typedef std::pair<std::map<MNNForwardType, std::shared_ptr<Runtime>>, std::shared_ptr<Runtime>> RuntimeInfo;
|
||||||
|
|
||||||
/** net data holder. multiple sessions could share same net. */
|
/** net data holder. multiple sessions could share same net. */
|
||||||
class MNN_PUBLIC Interpreter {
|
class MNN_PUBLIC Interpreter {
|
||||||
|
@ -108,7 +111,43 @@ public:
|
||||||
static Interpreter* createFromBuffer(const void* buffer, size_t size);
|
static Interpreter* createFromBuffer(const void* buffer, size_t size);
|
||||||
~Interpreter();
|
~Interpreter();
|
||||||
|
|
||||||
|
enum SessionMode {
|
||||||
|
/** About CallBack, Default Session_Debug*/
|
||||||
|
/** runSessionWithCallBack is allowed and can get internal op info*/
|
||||||
|
Session_Debug = 0,
|
||||||
|
/** runSessionWithCallBack is not valid and can't get any info of op in session*/
|
||||||
|
Session_Release = 1,
|
||||||
|
|
||||||
|
/** About input tenosr, Default Session_Input_Inside*/
|
||||||
|
/** The input tensor is alloced by session, input data after session resized*/
|
||||||
|
Session_Input_Inside = 2,
|
||||||
|
/** The input tensor is alloced by user, set input data before session resize*/
|
||||||
|
Session_Input_User = 3,
|
||||||
|
};
|
||||||
|
/**
|
||||||
|
* @brief The API shoud be called before create session.
|
||||||
|
* @param mode session mode
|
||||||
|
* @return void
|
||||||
|
*/
|
||||||
|
void setSessionMode(SessionMode mode);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief The API shoud be called before create session.
|
||||||
|
* If the cache exist, try to load cache from file.
|
||||||
|
* After createSession, try to save cache to file.
|
||||||
|
* @param cacheFile cache file name
|
||||||
|
* @param keySize the first `keySize` bytes used as the key to check if the `cacheFile` exists.
|
||||||
|
* @return void
|
||||||
|
*/
|
||||||
|
void setCacheFile(const char* cacheFile, size_t keySize = 128);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
/**
|
||||||
|
* @brief create runtimeInfo seperately with schedule config.
|
||||||
|
* @param config session schedule configs.
|
||||||
|
*/
|
||||||
|
static RuntimeInfo createRuntime(const std::vector<ScheduleConfig>& configs);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief create session with schedule config. created session will be managed in net.
|
* @brief create session with schedule config. created session will be managed in net.
|
||||||
* @param config session schedule config.
|
* @param config session schedule config.
|
||||||
|
@ -116,6 +155,13 @@ public:
|
||||||
*/
|
*/
|
||||||
Session* createSession(const ScheduleConfig& config);
|
Session* createSession(const ScheduleConfig& config);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief create session with schedule config and user-specified runtime.
|
||||||
|
* @param config session schedule config, runtime runtimeInfo used by the created session.
|
||||||
|
* @return created session if success, NULL otherwise.
|
||||||
|
*/
|
||||||
|
Session* createSession(const ScheduleConfig& config, const RuntimeInfo& runtime);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief create multi-path session with schedule configs. created session will be managed in net.
|
* @brief create multi-path session with schedule configs. created session will be managed in net.
|
||||||
* @param configs session schedule configs.
|
* @param configs session schedule configs.
|
||||||
|
@ -123,6 +169,14 @@ public:
|
||||||
*/
|
*/
|
||||||
Session* createMultiPathSession(const std::vector<ScheduleConfig>& configs);
|
Session* createMultiPathSession(const std::vector<ScheduleConfig>& configs);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief create multi-path session with schedule configs and user-specified runtime.
|
||||||
|
created session will be managed in net.
|
||||||
|
* @param configs session schedule configs.
|
||||||
|
* @return created session if success, NULL otherwise.
|
||||||
|
*/
|
||||||
|
Session* createMultiPathSession(const std::vector<ScheduleConfig>& configs, const RuntimeInfo& runtime);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief release session.
|
* @brief release session.
|
||||||
* @param session given session.
|
* @param session given session.
|
||||||
|
@ -204,17 +258,39 @@ public:
|
||||||
*/
|
*/
|
||||||
Tensor* getSessionOutput(const Session* session, const char* name);
|
Tensor* getSessionOutput(const Session* session, const char* name);
|
||||||
|
|
||||||
|
enum SessionInfoCode {
|
||||||
|
/** memory session used in MB, float* */
|
||||||
|
MEMORY = 0,
|
||||||
|
|
||||||
|
/** float operation needed in session in M, float* */
|
||||||
|
FLOPS = 1,
|
||||||
|
|
||||||
|
/** Backends in session in M, int*, length >= the configs when create session */
|
||||||
|
BACKENDS = 2,
|
||||||
|
|
||||||
|
ALL
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief get all input tensors.
|
* @brief get session info
|
||||||
* @param session given session.
|
* @param session given session.
|
||||||
* @return all input tensors mapped with name.
|
* @param code given info code.
|
||||||
|
* @param void* given info ptr, see SessionInfoCode for detail
|
||||||
|
* @return true if support the code, false otherwise.
|
||||||
*/
|
*/
|
||||||
const std::map<std::string, Tensor*>& getSessionOutputAll(const Session* session) const;
|
bool getSesionInfo(const Session* session, SessionInfoCode code, void* ptr);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief get all output tensors.
|
* @brief get all output tensors.
|
||||||
* @param session given session.
|
* @param session given session.
|
||||||
* @return all output tensors mapped with name.
|
* @return all output tensors mapped with name.
|
||||||
*/
|
*/
|
||||||
|
const std::map<std::string, Tensor*>& getSessionOutputAll(const Session* session) const;
|
||||||
|
/**
|
||||||
|
* @brief get all input tensors.
|
||||||
|
* @param session given session.
|
||||||
|
* @return all input tensors mapped with name.
|
||||||
|
*/
|
||||||
const std::map<std::string, Tensor*>& getSessionInputAll(const Session* session) const;
|
const std::map<std::string, Tensor*>& getSessionInputAll(const Session* session) const;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
|
@ -38,13 +38,7 @@
|
||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
#define MNN_ASSERT(x) \
|
#define MNN_ASSERT(x)
|
||||||
{ \
|
|
||||||
int res = (x); \
|
|
||||||
if (!res) { \
|
|
||||||
MNN_ERROR("Error for %d\n", __LINE__); \
|
|
||||||
} \
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define FUNC_PRINT(x) MNN_PRINT(#x "=%d in %s, %d \n", x, __func__, __LINE__);
|
#define FUNC_PRINT(x) MNN_PRINT(#x "=%d in %s, %d \n", x, __func__, __LINE__);
|
||||||
|
|
|
@ -23,8 +23,8 @@ typedef enum {
|
||||||
/*Hand write metal*/
|
/*Hand write metal*/
|
||||||
MNN_FORWARD_METAL = 1,
|
MNN_FORWARD_METAL = 1,
|
||||||
|
|
||||||
/*Use IOS's MPS instead of hand-write metal, Not Support yet*/
|
/*NVIDIA GPU API*/
|
||||||
MNN_FORWARD_MPS = 2,
|
MNN_FORWARD_CUDA = 2,
|
||||||
|
|
||||||
/*Android / Common Device GPU API*/
|
/*Android / Common Device GPU API*/
|
||||||
MNN_FORWARD_OPENCL = 3,
|
MNN_FORWARD_OPENCL = 3,
|
||||||
|
|
|
@ -12,6 +12,7 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <MNN/HalideRuntime.h>
|
#include <MNN/HalideRuntime.h>
|
||||||
#include <MNN/MNNDefine.h>
|
#include <MNN/MNNDefine.h>
|
||||||
|
#define MNN_MAX_TENSOR_DIM 6
|
||||||
|
|
||||||
namespace MNN {
|
namespace MNN {
|
||||||
|
|
||||||
|
|
|
@ -10,6 +10,7 @@
|
||||||
#include <MNN/ErrorCode.hpp>
|
#include <MNN/ErrorCode.hpp>
|
||||||
#include <MNN/expr/Expr.hpp>
|
#include <MNN/expr/Expr.hpp>
|
||||||
#include <MNN/Tensor.hpp>
|
#include <MNN/Tensor.hpp>
|
||||||
|
#include <MNN/Interpreter.hpp>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include <set>
|
#include <set>
|
||||||
|
@ -17,41 +18,19 @@
|
||||||
namespace MNN {
|
namespace MNN {
|
||||||
class Backend;
|
class Backend;
|
||||||
class Execution;
|
class Execution;
|
||||||
|
class Runtime;
|
||||||
|
struct Op;
|
||||||
namespace Express {
|
namespace Express {
|
||||||
class MNN_PUBLIC Executor {
|
class MNN_PUBLIC Executor {
|
||||||
public:
|
public:
|
||||||
class ComputeCache {
|
class ComputeCache;
|
||||||
public:
|
|
||||||
void setShapeDirty(int offset, Variable::Info* info);
|
|
||||||
void setContentDirty();
|
|
||||||
void setContentReady();
|
|
||||||
void syncInput(int offset, const Variable::Info* info);
|
|
||||||
void syncOutput(int offset, Variable::Info* info);
|
|
||||||
|
|
||||||
struct TensorContent {
|
|
||||||
std::shared_ptr<Tensor> tensor;
|
|
||||||
int refCount = 0;
|
|
||||||
void reset();
|
|
||||||
bool aliveOutside = false;
|
|
||||||
};
|
|
||||||
struct Unit;
|
struct Unit;
|
||||||
virtual ~ ComputeCache() {}
|
static void setShapeDirty(ComputeCache* cache);
|
||||||
ComputeCache() {}
|
static void setContentDirty(ComputeCache* cache);
|
||||||
virtual ErrorCode compute() = 0;
|
static void* mapOutput(ComputeCache* cache, int offset, Tensor* dest);
|
||||||
virtual ErrorCode resize() = 0;
|
|
||||||
protected:
|
|
||||||
// Get the index tensor with the need of needBackend
|
|
||||||
// If the Tensor don't belong to the backend, need use needBackend to alloc it and return
|
|
||||||
virtual Tensor* getTensor(int index, bool host) = 0;
|
|
||||||
void _setShapeDirty();
|
|
||||||
friend class Executor;
|
|
||||||
bool mContentDirty = true;
|
|
||||||
bool mShapeDirty = true;
|
|
||||||
};
|
|
||||||
struct Requirement {
|
struct Requirement {
|
||||||
std::vector<bool> contentNeedContent;
|
std::vector<bool> contentNeedContent;
|
||||||
std::vector<bool> shapeNeedContent;
|
std::vector<bool> shapeNeedContent;
|
||||||
std::vector<bool> supportError;
|
|
||||||
};
|
};
|
||||||
~Executor();
|
~Executor();
|
||||||
Requirement getRequirement(Expr* expr) const;
|
Requirement getRequirement(Expr* expr) const;
|
||||||
|
@ -65,25 +44,27 @@ public:
|
||||||
};
|
};
|
||||||
void gc(GCFlag flag = FULL);
|
void gc(GCFlag flag = FULL);
|
||||||
static std::shared_ptr<Executor> getGlobalExecutor();
|
static std::shared_ptr<Executor> getGlobalExecutor();
|
||||||
|
|
||||||
|
static std::shared_ptr<Executor> newExecutor(MNNForwardType type,
|
||||||
|
const BackendConfig& config,
|
||||||
|
int numberThread);
|
||||||
void resetProfile();
|
void resetProfile();
|
||||||
void dumpProfile();
|
void dumpProfile();
|
||||||
void addOpCostTime(int op, float costTime);
|
void addOpCostTime(int op, float costTime);
|
||||||
|
void addOpCostTime(const std::string& type, float costTime);
|
||||||
|
void addOpFlops(const std::string& type, float flops);
|
||||||
class Profiler;
|
class Profiler;
|
||||||
|
static RuntimeInfo getRuntime();
|
||||||
private:
|
private:
|
||||||
void _createSingle(EXPRP expr);
|
void _makeCache(const std::vector<EXPRP>& outputs, bool forceCPU);
|
||||||
void _create(const std::vector<EXPRP>& outputs, std::set<std::shared_ptr<Executor::ComputeCache>>&& inputCaches, std::vector<ComputeCache::TensorContent>&& tensors, bool forceCPU);
|
void _create(const std::vector<EXPRP>& outputs, std::set<std::shared_ptr<Executor::ComputeCache>>&& inputCaches, std::set<std::shared_ptr<Expr::Inside>>&& inputNode, bool forceCPU);
|
||||||
|
|
||||||
void _addToCache(const std::vector<std::shared_ptr<ComputeCache>>& caches);
|
void _visit(EXPRP expr, std::set<std::shared_ptr<Executor::ComputeCache>>& inputCaches, std::set<std::shared_ptr<Expr::Inside>>& inputNode);
|
||||||
void _resetCache();
|
|
||||||
void _visit(EXPRP expr, std::set<std::shared_ptr<Executor::ComputeCache>>& inputCaches, std::vector<ComputeCache::TensorContent>& tensors);
|
|
||||||
|
|
||||||
Executor(std::shared_ptr<Backend> backend);
|
Executor(std::shared_ptr<Runtime> backend, MNNForwardType type);
|
||||||
std::shared_ptr<Backend> mBackend;
|
std::pair<std::shared_ptr<Runtime>, MNNForwardType> mRuntime;
|
||||||
std::shared_ptr<Backend> mBackupBackend;
|
std::pair<std::shared_ptr<Runtime>, MNNForwardType> mBackupRuntime;
|
||||||
std::mutex mMutex;
|
std::mutex mMutex;
|
||||||
std::vector<std::shared_ptr<Tensor>> mStack;
|
|
||||||
std::vector<Tensor*> mStackInputs;
|
|
||||||
std::vector<Tensor*> mStackOutputs;
|
|
||||||
std::shared_ptr<Profiler> mProfiler;
|
std::shared_ptr<Profiler> mProfiler;
|
||||||
};
|
};
|
||||||
} // namespace Express
|
} // namespace Express
|
||||||
|
|
|
@ -0,0 +1,33 @@
|
||||||
|
//
|
||||||
|
// ExecutorScope.hpp
|
||||||
|
// MNN
|
||||||
|
//
|
||||||
|
// Created by MNN on 2020/10/26.
|
||||||
|
// Copyright © 2018, Alibaba Group Holding Limited
|
||||||
|
//
|
||||||
|
|
||||||
|
#ifndef MNN_EXPR_EXECUTOR_SCOPE_HPP_
|
||||||
|
#define MNN_EXPR_EXECUTOR_SCOPE_HPP_
|
||||||
|
|
||||||
|
#include <MNN/expr/Executor.hpp>
|
||||||
|
|
||||||
|
namespace MNN {
|
||||||
|
namespace Express {
|
||||||
|
|
||||||
|
struct ExecutorScope final {
|
||||||
|
public:
|
||||||
|
ExecutorScope() = delete;
|
||||||
|
explicit ExecutorScope(const ExecutorScope&) = delete;
|
||||||
|
explicit ExecutorScope(const std::shared_ptr<Executor>& current);
|
||||||
|
|
||||||
|
explicit ExecutorScope(const std::string& scope_name,
|
||||||
|
const std::shared_ptr<Executor>& current);
|
||||||
|
|
||||||
|
virtual ~ExecutorScope();
|
||||||
|
|
||||||
|
static const std::shared_ptr<Executor> Current();
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace MNN
|
||||||
|
} // namespace Express
|
||||||
|
#endif // MNN_EXPR_EXECUTOR_SCOPE_HPP_
|
|
@ -87,6 +87,7 @@ public:
|
||||||
};
|
};
|
||||||
bool fix(InputType type) const;
|
bool fix(InputType type) const;
|
||||||
private:
|
private:
|
||||||
|
friend class Variable;
|
||||||
std::shared_ptr<Variable> mContent;
|
std::shared_ptr<Variable> mContent;
|
||||||
};
|
};
|
||||||
inline bool operator==(Variable* src, VARP dst) {
|
inline bool operator==(Variable* src, VARP dst) {
|
||||||
|
@ -107,7 +108,6 @@ public:
|
||||||
INTS dim;
|
INTS dim;
|
||||||
halide_type_t type;
|
halide_type_t type;
|
||||||
int size;
|
int size;
|
||||||
void* ptr = nullptr;
|
|
||||||
void syncSize();
|
void syncSize();
|
||||||
};
|
};
|
||||||
const std::string& name() const;
|
const std::string& name() const;
|
||||||
|
@ -173,7 +173,7 @@ private:
|
||||||
class MNN_PUBLIC Expr {
|
class MNN_PUBLIC Expr {
|
||||||
public:
|
public:
|
||||||
struct Inside;
|
struct Inside;
|
||||||
static EXPRP create(Variable::Info&& info);
|
static EXPRP create(Variable::Info&& info, const void* ptr, VARP::InputType type, bool copy = true);
|
||||||
static EXPRP create(const OpT* op, std::vector<VARP> inputs, int outputSize = 1);
|
static EXPRP create(const OpT* op, std::vector<VARP> inputs, int outputSize = 1);
|
||||||
static EXPRP create(std::pair<std::shared_ptr<char>, int> extra, std::vector<VARP>&& inputs, int outputSize = 1);
|
static EXPRP create(std::pair<std::shared_ptr<char>, int> extra, std::vector<VARP>&& inputs, int outputSize = 1);
|
||||||
static EXPRP create(std::unique_ptr<OpT>&& op, std::vector<VARP> inputs, int outputSize = 1) {
|
static EXPRP create(std::unique_ptr<OpT>&& op, std::vector<VARP> inputs, int outputSize = 1) {
|
||||||
|
@ -188,7 +188,7 @@ public:
|
||||||
return mInputs;
|
return mInputs;
|
||||||
}
|
}
|
||||||
int outputSize() const {
|
int outputSize() const {
|
||||||
return mOutputNames.size();
|
return (int)mOutputNames.size();
|
||||||
}
|
}
|
||||||
static void replace(EXPRP oldExpr, EXPRP newExpr);
|
static void replace(EXPRP oldExpr, EXPRP newExpr);
|
||||||
bool requireInfo();
|
bool requireInfo();
|
||||||
|
|
|
@ -8,9 +8,14 @@
|
||||||
|
|
||||||
#ifndef MNN_Train_Module_hpp
|
#ifndef MNN_Train_Module_hpp
|
||||||
#define MNN_Train_Module_hpp
|
#define MNN_Train_Module_hpp
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
#include <MNN/expr/Expr.hpp>
|
#include <MNN/expr/Expr.hpp>
|
||||||
|
|
||||||
namespace MNN {
|
namespace MNN {
|
||||||
namespace Train {
|
namespace Express {
|
||||||
class MNN_PUBLIC Module {
|
class MNN_PUBLIC Module {
|
||||||
public:
|
public:
|
||||||
Module() = default;
|
Module() = default;
|
||||||
|
@ -21,9 +26,6 @@ public:
|
||||||
bool loadParameters(const std::vector<Express::VARP>& parameters);
|
bool loadParameters(const std::vector<Express::VARP>& parameters);
|
||||||
void setIsTraining(const bool isTraining);
|
void setIsTraining(const bool isTraining);
|
||||||
bool getIsTraining();
|
bool getIsTraining();
|
||||||
static std::shared_ptr<Module> transform(const std::vector<Express::VARP>& inputs,
|
|
||||||
const std::vector<Express::VARP>& outputs);
|
|
||||||
|
|
||||||
void clearCache();
|
void clearCache();
|
||||||
|
|
||||||
const std::string& name() const {
|
const std::string& name() const {
|
||||||
|
@ -38,12 +40,45 @@ public:
|
||||||
void setType(std::string type) {
|
void setType(std::string type) {
|
||||||
mType = std::move(type);
|
mType = std::move(type);
|
||||||
}
|
}
|
||||||
|
// Return the parameter index
|
||||||
|
int addParameter(Express::VARP parameter);
|
||||||
|
|
||||||
|
void setParameter(Express::VARP parameter, int index);
|
||||||
|
static Module* createEmpty(const std::vector<Express::VARP>& parameters);
|
||||||
|
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic = false);
|
||||||
|
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, bool dynamic = false);
|
||||||
|
|
||||||
|
static Module* clone(const Module* module, const bool shareParams = false);
|
||||||
|
|
||||||
|
class CloneContext {
|
||||||
|
public:
|
||||||
|
CloneContext() = default;
|
||||||
|
explicit CloneContext(const bool shareParams)
|
||||||
|
: mShareParams(shareParams) {}
|
||||||
|
virtual ~CloneContext() = default;
|
||||||
|
|
||||||
|
const bool shareParams() const { return mShareParams; }
|
||||||
|
|
||||||
|
EXPRP getOrClone(const EXPRP expr);
|
||||||
|
VARP getOrClone(const VARP var);
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool mShareParams = false;
|
||||||
|
std::unordered_map<const Expr*, EXPRP> mExprMap;
|
||||||
|
std::unordered_map<const Variable*, VARP> mVarMap;
|
||||||
|
};
|
||||||
|
|
||||||
|
virtual Module* clone(CloneContext* ctx) const {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void registerModel(const std::vector<std::shared_ptr<Module>>& children);
|
void registerModel(const std::vector<std::shared_ptr<Module>>& children);
|
||||||
void addParameter(Express::VARP parameter);
|
|
||||||
virtual void onClearCache() {
|
virtual void onClearCache() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Module* cloneBaseTo(CloneContext* ctx, Module* module) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void _collectParameters(std::vector<Express::VARP>& result) const;
|
void _collectParameters(std::vector<Express::VARP>& result) const;
|
||||||
std::vector<std::shared_ptr<Module>> mChildren;
|
std::vector<std::shared_ptr<Module>> mChildren;
|
||||||
|
@ -52,6 +87,13 @@ private:
|
||||||
std::string mName;
|
std::string mName;
|
||||||
std::string mType;
|
std::string mType;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct SubGraph {
|
||||||
|
std::vector<std::string> inputs;
|
||||||
|
std::vector<std::string> outputs;
|
||||||
|
std::shared_ptr<Module> m;
|
||||||
|
};
|
||||||
|
|
||||||
} // namespace Train
|
} // namespace Train
|
||||||
} // namespace MNN
|
} // namespace MNN
|
||||||
|
|
|
@@ -9,11 +9,10 @@
 #ifndef MNN_Train_NN_hpp
 #define MNN_Train_NN_hpp
 #include <MNN/expr/ExprCreator.hpp>
-#include "Distributions.hpp"
-#include "Module.hpp"
+#include <MNN/expr/Module.hpp>
 #include <vector>
 namespace MNN {
-namespace Train {
+namespace Express {
 class Initializer;

 class MNN_PUBLIC NN {
@@ -29,7 +28,7 @@ public:
 };
 enum FeatureScaleStatMethod {
     PerTensor = 0,
-    PerChannel = 1
+    PerChannel = 1 // Depercerate
 };
 /* Unlike enum in class, class in class need be dllimport or dllexport explcility.
 Compiling in other system will not be affected.
@@ -86,7 +85,7 @@ public:
     static ConvParameters ExtractConvolution(Express::EXPRP expr);

     // Extract BatchNormal and Dropout
-    static Module* ExtractNotRunableOp(Express::EXPRP expr);
+    static Module* ExtractNotRunableOp(Express::EXPRP expr, const std::map<std::string, SubGraph>& subgraphs);
 };
 };
@@ -31,25 +31,30 @@ MNN_PUBLIC VARP _Const(const void* ptr, INTS shape = {}, Dimensionformat format
 MNN_PUBLIC VARP _TrainableParam(float value, INTS dims, Dimensionformat format);
 MNN_PUBLIC VARP _TrainableParam(const void* ptr, INTS dims, Dimensionformat format,
                                 halide_type_t type = halide_type_of<float>());
+MNN_PUBLIC VARP _InnerProduct(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS outputShape);
 MNN_PUBLIC VARP _Conv(VARP weight, VARP bias, VARP x, PaddingMode pad = VALID, INTS stride = {1, 1},
                       INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0});

 MNN_PUBLIC VARP _Conv(float weight, float bias, VARP x, INTS channel, INTS kernelSize, PaddingMode pad = VALID,
                       INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1);
 MNN_PUBLIC VARP _Conv(std::vector<int8_t>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
-                      PaddingMode pad = VALID, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0}, bool relu = false, bool relu6 = false);
+                      PaddingMode pad = VALID, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0}, bool relu = false, bool relu6 = false, int nbits = 8);
 MNN_PUBLIC VARP _Conv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
                       PaddingMode pad = VALID, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0}, bool relu = false, bool relu6 = false);
 MNN_PUBLIC VARP _Deconv(VARP weight, VARP bias, VARP x, PaddingMode pad = VALID, INTS stride = {1, 1},
                         INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0});

+MNN_PUBLIC VARP _Deconv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
+                        PaddingMode pad, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0}, bool relu = false, bool relu6 = false);
+
 MNN_PUBLIC VARP _MaxPool(VARP x, INTS kernel, INTS stride = {1, 1}, PaddingMode pad = VALID, INTS pads= {0, 0});
 MNN_PUBLIC VARP _AvePool(VARP x, INTS kernel, INTS stride = {1, 1}, PaddingMode pad = VALID, INTS pads= {0, 0});
-MNN_PUBLIC VARP _Reshape(VARP x, INTS shape, Dimensionformat original_format = NHWC);
+MNN_PUBLIC VARP _Reshape(VARP x, INTS shape, Dimensionformat original_format = NCHW);
 MNN_PUBLIC VARP _Reshape(VARP x, VARP shape);
 MNN_PUBLIC VARP _Scale(VARP x, int channels, std::vector<float>&& scales, std::vector<float>&& bias);

 MNN_PUBLIC VARP _Relu(VARP x, float slope = 0.0f);
-MNN_PUBLIC VARP _Relu6(VARP x);
+MNN_PUBLIC VARP _Relu6(VARP x, float minValue = 0.0f, float maxValue = 6.0f);
 MNN_PUBLIC VARP _PRelu(VARP x, std::vector<float> &&slopes);
 MNN_PUBLIC VARP _Softmax(VARP logits, int axis = -1);
 MNN_PUBLIC VARP _Softplus(VARP features);
@@ -76,7 +81,7 @@ MNN_PUBLIC VARP _Pad(VARP x, VARP paddings, PadValueMode mode = CONSTANT);
 MNN_PUBLIC VARP _ExpandDims(VARP input, int axis);
 MNN_PUBLIC VARP _ExpandDims(VARP input, VARP axis);

-MNN_PUBLIC VARP _Shape(VARP input);
+MNN_PUBLIC VARP _Shape(VARP input, bool nchw = false);
 MNN_PUBLIC VARP _Stack(VARPS values, int axis=0);
 enum InterpolationMethod {BILINEAR, NEAREST};
 MNN_PUBLIC VARP _CropAndResize(VARP image, VARP boxes, VARP box_ind, VARP crop_size,
@@ -92,6 +97,7 @@ MNN_PUBLIC VARP _GatherND(VARP params, VARP indices);
 MNN_PUBLIC VARP _Selu(VARP features, float scale, float alpha);
 MNN_PUBLIC VARP _Size(VARP input);
 MNN_PUBLIC VARP _Elu(VARP features, float alpha=1.0);
+MNN_PUBLIC VARP _Threshold(VARP features, float alpha=1.0);
 MNN_PUBLIC VARP _MatrixBandPart(VARP input, VARP num_lower, VARP num_upper);
 MNN_PUBLIC std::vector<VARP> _Moments(VARP x, INTS axis, VARP shift, bool keepDims);
 MNN_PUBLIC VARP _SetDiff1D(VARP x, VARP y);
@@ -123,7 +129,8 @@ MNN_PUBLIC VARP _ZeroGrad(VARP x);

 // Int8 Inference
 MNN_PUBLIC VARP _Conv(std::vector<int8_t>&& weight, std::vector<int>&& bias, std::vector<float>&& scale, VARP x, INTS channel, INTS kernelSize,
-                      PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu);
+                      PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, int nbits = 8);
+MNN_PUBLIC VARP _CosineSimilarity(VARP input0, VARP input1, VARP inputDim);
 MNN_PUBLIC VARP _FloatToInt8(VARP x, VARP scale, char minValue, char maxValue);
 MNN_PUBLIC VARP _Int8ToFloat(VARP x, VARP scale);
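
Editor's note: the hunk above only changes declarations, so here is a minimal, hedged sketch of how the updated `_Relu6` and `_Reshape` defaults read at a call site. It relies only on `_Const` from the hunk header plus the signatures shown above; `readMap<T>()` is assumed from the existing Express `Variable` API and is not part of this diff.

// Illustrative only: exercises the signatures shown above.
#include <vector>
#include <MNN/expr/ExprCreator.hpp>

using namespace MNN::Express;

int main() {
    std::vector<float> data = {-3.0f, 0.5f, 4.0f, 9.0f};
    VARP x        = _Const(data.data(), {4}, NCHW);   // _Const(ptr, shape, format) from the hunk header
    VARP clamped  = _Relu6(x, 0.0f, 6.0f);            // new explicit clamp range; defaults keep the old [0, 6] behaviour
    VARP twoByTwo = _Reshape(clamped, {2, 2});        // default original_format is now NCHW per this hunk
    const float* out = twoByTwo->readMap<float>();    // readMap<T>() assumed from the existing Express API
    return out == nullptr ? 1 : 0;
}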
@@ -0,0 +1,102 @@
+//
+//  RuntimeScope.hpp
+//  MNN
+//
+//  Created by MNN on 2020/10/26.
+//  Copyright © 2018, Alibaba Group Holding Limited
+//
+
+#ifndef MNN_EXPR_SCOPE_HPP_
+#define MNN_EXPR_SCOPE_HPP_
+
+#include <cstdio>
+#include <vector>
+#include <string>
+#include <mutex>
+
+#include <MNN/Interpreter.hpp>
+
+namespace MNN {
+namespace Express {
+
+template <typename T>
+class Scope {
+public:
+    Scope();
+    virtual ~Scope() = default;
+
+    struct ScopedContent {
+        std::string scope_name;
+        T content;
+    };
+    void EnterScope(const ScopedContent& current);
+    void EnterScope(const T& current);
+    void EnterScope(const std::string& scope_name, const T& current);
+
+    void ExitScope();
+
+    const ScopedContent& Current() const;
+
+    int ScopedLevel() const { return scoped_level_; }
+
+private:
+    std::string MakeScopeName(const std::string& prefix, int level) const;
+
+    mutable std::mutex mutex_;
+    int scoped_level_ = 0;
+    std::vector<ScopedContent> scoped_contents_;
+};
+
+template <typename T>
+Scope<T>::Scope() : scoped_level_(0) {
+}
+
+template <typename T>
+void Scope<T>::EnterScope(const ScopedContent& current) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    ++scoped_level_;
+    scoped_contents_.push_back(current);
+}
+
+template <typename T>
+void Scope<T>::EnterScope(const T& current) {
+    EnterScope("scope", current);
+}
+
+template <typename T>
+void Scope<T>::EnterScope(const std::string& scope_name,
+                          const T& current) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    int scoped_level = ScopedLevel();
+    std::string name = MakeScopeName(scope_name, scoped_level++);
+    ScopedContent content{name, current};
+    ++scoped_level_;
+    scoped_contents_.push_back(content);
+}
+
+template <typename T>
+void Scope<T>::ExitScope() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    --scoped_level_;
+    scoped_contents_.resize(scoped_level_);
+}
+
+template <typename T>
+const typename Scope<T>::ScopedContent& Scope<T>::Current() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    MNN_CHECK(scoped_contents_.size() > 0, "Scope level should not be 0.");
+    return scoped_contents_.back();
+}
+
+template <typename T>
+std::string Scope<T>::MakeScopeName(const std::string& prefix,
+                                    int level) const {
+    char s[16];
+    snprintf(s, 16, "%d", level);
+    return prefix + "/" + std::string(s);
+}
+
+} // namespace Express
+} // namespace MNN
+
+#endif  // MNN_EXPR_SCOPE_HPP_
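
Editor's note: a brief driver for the new Scope<T> helper follows; the int payload, the "session" name, and the include path are illustrative assumptions only, grounded in the header text above.

// Hypothetical usage of the Scope<T> template added above.
#include "RuntimeScope.hpp"   // header added above; install path assumed

int main() {
    MNN::Express::Scope<int> scope;
    scope.EnterScope("session", 42);          // pushes {"session/0", 42}, level becomes 1
    int payload = scope.Current().content;    // 42, the most recently entered content
    scope.ExitScope();                        // back to level 0
    return payload == 42 ? 0 : 1;
}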
@@ -1,12 +1,14 @@
-# MNN_Windows
-# |------- MNN_Windows_lib
-# |---------- Dynamic_Library
-# |---------- Static_Library
-# |------- MNN_Windows_tools
+# MNN
+# |-- Debug
+# | |--- MD
+# | |--- MT
+# |-- Release
+# |--- MD
+# |--- MT

 $erroractionpreference = "stop"

-Set-Variable -Name WINDOWS_PACKAGE_NAME -Value "MNN_Windows"
+Set-Variable -Name WINDOWS_PACKAGE_NAME -Value "MNN"

 #clear and create package directory
 powershell ./schema/generate.ps1
@@ -14,32 +16,50 @@ Set-Variable -Name WINDOWS_PACKAGE_PATH -Value "$(pwd)\$WINDOWS_PACKAGE_NAME"
 Remove-Item $WINDOWS_PACKAGE_PATH -Recurse -ErrorAction Ignore
 mkdir $WINDOWS_PACKAGE_PATH\
 cd $WINDOWS_PACKAGE_PATH
-mkdir -p MNN_Windows_lib\Dynamic_Library
-mkdir -p MNN_Windows_lib\Static_Library
-mkdir MNN_Windows_tools
+mkdir -p Debug\MD
+mkdir -p Debug\MT
+mkdir -p Release\MD
+mkdir -p Release\MT
 cd ..

 Remove-Item build -Recurse -ErrorAction Ignore
 mkdir build
-cd build
+pushd build
 # tools without dependency, static library without sep_build
-cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_BUILD_CONVERTER=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_DEMO=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON ..
+#cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_BUILD_CONVERTER=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_DEMO=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON ..
+#ninja
+#pushd $WINDOWS_PACKAGE_PATH
+#cp ..\build\*.exe MNN_Windows_tools
+#cp ..\build\*.pdb MNN_Windows_tools
+#cp ..\build\MNN.lib MNN_Windows_lib\Static_Library
+#popd
+
+Remove-Item CMakeCache.txt -ErrorAction Ignore
+cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON -DMNN_OPENCL=ON ..
 ninja
-pushd $WINDOWS_PACKAGE_PATH
-cp ..\build\*.exe MNN_Windows_tools
-cp ..\build\*.pdb MNN_Windows_tools
-cp ..\build\MNN.lib MNN_Windows_lib\Static_Library
+cp MNN.lib $WINDOWS_PACKAGE_PATH\Debug\MT
+cp MNN.dll $WINDOWS_PACKAGE_PATH\Debug\MT
+cp MNN.pdb $WINDOWS_PACKAGE_PATH\Debug\MT
+
+Remove-Item CMakeCache.txt -ErrorAction Ignore
+cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_OPENCL=ON ..
+ninja
+cp MNN.lib $WINDOWS_PACKAGE_PATH\Debug\MD
+cp MNN.dll $WINDOWS_PACKAGE_PATH\Debug\MD
+cp MNN.pdb $WINDOWS_PACKAGE_PATH\Debug\MD
+
+Remove-Item CMakeCache.txt -ErrorAction Ignore
+cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_OPENCL=ON ..
+ninja
+cp MNN.lib $WINDOWS_PACKAGE_PATH\Release\MT
+cp MNN.dll $WINDOWS_PACKAGE_PATH\Release\MT
+cp MNN.pdb $WINDOWS_PACKAGE_PATH\Release\MT
+
+Remove-Item CMakeCache.txt -ErrorAction Ignore
+cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_OPENCL=ON ..
+ninja
+cp MNN.lib $WINDOWS_PACKAGE_PATH\Release\MD
+cp MNN.dll $WINDOWS_PACKAGE_PATH\Release\MD
+cp MNN.pdb $WINDOWS_PACKAGE_PATH\Release\MD

 popd

-#dynamic library without sep_build
-rm .\CMakeCache.txt
-cmake -G "Ninja" -DMNN_SEP_BUILD=OFF ..
-ninja
-cd $WINDOWS_PACKAGE_PATH
-cp ..\build\MNN.lib MNN_Windows_lib\Dynamic_Library
-cp ..\build\MNN.dll MNN_Windows_lib\Dynamic_Library
-cp ..\build\MNN.pdb MNN_Windows_lib\Dynamic_Library
-
-# Compress MNN_Windows_lib and MNN_Windows_tools
-Compress-Archive -Path MNN_Windows_lib -DestinationPath MNN_Windows_lib.zip -Update -CompressionLevel Optimal
-Compress-Archive -Path MNN_Windows_tools -DestinationPath MNN_Windows_tools.zip -Update -CompressionLevel Optimal
@@ -8,15 +8,14 @@ set_target_properties(
                        ${CMAKE_CURRENT_LIST_DIR}/libs/${ANDROID_ABI}/libMNN.so
                        )

-add_library( MNN_Arm82 SHARED IMPORTED GLOBAL)
-set_target_properties(
-                       MNN_Arm82
-                       PROPERTIES IMPORTED_LOCATION
-                       ${CMAKE_CURRENT_LIST_DIR}/libs/${ANDROID_ABI}/libMNN_Arm82.so
-                       )
-
 add_library( MNN_CL SHARED IMPORTED GLOBAL )
 set_target_properties( MNN_CL
                        PROPERTIES IMPORTED_LOCATION
                        ${CMAKE_CURRENT_LIST_DIR}/libs/${ANDROID_ABI}/libMNN_CL.so
                        )

+add_library( MNN_Express SHARED IMPORTED GLOBAL )
+set_target_properties( MNN_Express
+                       PROPERTIES IMPORTED_LOCATION
+                       ${CMAKE_CURRENT_LIST_DIR}/libs/${ANDROID_ABI}/libMNN_Express.so
+                       )
@@ -5,7 +5,6 @@ adb push ./libMNN_CL.so /data/local/tmp/MNN/libMNN_CL.so
 adb push ./libMNN_Vulkan.so /data/local/tmp/MNN/libMNN_Vulkan.so
 adb push ./libMNN_GL.so /data/local/tmp/MNN/libMNN_GL.so
 adb push ./libMNN_Express.so /data/local/tmp/MNN/libMNN_Express.so
-adb push ./libMNN_Arm82.so /data/local/tmp/MNN/libMNN_Arm82.so
 adb push ./MNNV2Basic.out /data/local/tmp/MNN/MNNV2Basic.out
 adb shell "cd /data/local/tmp/MNN && rm -r output"
 adb shell "cd /data/local/tmp/MNN && mkdir output"
@@ -18,3 +17,4 @@ adb push ./timeProfile.out /data/local/tmp/MNN/timeProfile.out
 adb push ./train.out /data/local/tmp/MNN/train.out
 adb push ./benchmark.out /data/local/tmp/MNN/benchmark.out
 adb push ./benchmarkExprModels.out /data/local/tmp/MNN/benchmarkExprModels.out
+adb push ./run_test.out /data/local/tmp/MNN/run_test.out

File diff suppressed because it is too large
@@ -4,6 +4,8 @@
 <dict>
 	<key>CFBundleDevelopmentRegion</key>
 	<string>$(DEVELOPMENT_LANGUAGE)</string>
+	<key>CFBundleIdentifier</key>
+	<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
 	<key>CFBundleInfoDictionaryVersion</key>
 	<string>6.0</string>
 	<key>CFBundleName</key>
@@ -1,57 +0,0 @@
-#!bin/sh
-
-echo "Register Op Begin"
-
-function read_dir(){
-    str1=`grep -e $2 $1/*.$4|sed s/[[:space:]]//g`
-    array=(${str1//\;/ })
-    for var in ${array[@]}; do
-        `echo $var|awk -F $3 '{
-            a="___";
-            b="__();";
-            c="extern void ";
-            print(c""a""$3"__"$4""b) >> "extern";
-            print (a""$3"__"$4""b) >> "call"
-        }'`
-    done
-}
-
-start=$(date +%s)
-
-SEP='[:(,)]'
-FILE_EXTERN_CPP='cpp'
-FILE_EXTERN_MM='mm'
-
-SHELL_FOLDER=$(dirname $0)'/../../..'
-# handle CPU
-CPUFILE=$SHELL_FOLDER/source/backend/cpu/CPUOPRegister.cpp
-echo "// This file is generated by Shell for ops register\nnamespace MNN {\n#ifdef MNN_CODEGEN_REGISTER" > $CPUFILE
-echo "Start Register CPU"
-CPU=$SHELL_FOLDER/source/backend/cpu
-CPU_KEY='REGISTER_CPU_OP_CREATOR'
-read_dir $CPU $CPU_KEY $SEP $FILE_EXTERN_CPP
-cat extern >> $CPUFILE
-rm extern
-echo '\nvoid registerCPUOps() {' >> $CPUFILE
-cat call >> $CPUFILE
-echo '}\n#endif\n}' >> $CPUFILE
-rm call
-
-# handle Shape
-echo "Start Register Shape"
-SHAPEFILE=$SHELL_FOLDER/source/shape/ShapeRegister.cpp
-SHAPE=$SHELL_FOLDER/source/shape
-SHAPE_KEY="REGISTER_SHAPE"
-echo "// This file is generated by Shell for ops register\nnamespace MNN {\n#ifdef MNN_CODEGEN_REGISTER" > $SHAPEFILE
-read_dir $SHAPE $SHAPE_KEY $SEP $FILE_EXTERN_CPP
-cat extern >> $SHAPEFILE
-rm extern
-echo '\nvoid registerShapeOps() {' >> $SHAPEFILE
-cat call >> $SHAPEFILE
-echo '}\n#endif\n}' >> $SHAPEFILE
-rm call
-
-echo "Register Op End"
-
-dur=$(echo "$(date +%s) - $start" | bc)
-printf "Execution time: %.6f seconds" $dur
@@ -8,10 +8,14 @@

 #import "AppDelegate.h"
 #import "MNNTestSuite.h"
+#import <MNN/expr/Executor.hpp>

 @implementation AppDelegate

 - (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
+    MNN::BackendConfig config;
+    // If want to test metal, change MNN_FORWARD_CPU to MNN_FORWARD_METAL
+    MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 1);
     MNNTestSuite::runAll();
     return YES;
 }
@@ -8,6 +8,9 @@ import cv2
 def inference():
     """ inference mobilenet_v1 using a specific picture """
     interpreter = MNN.Interpreter("mobilenet_v1.mnn")
+    interpreter.setCacheFile('.tempcache')
+    config = {}
+    config['precision'] = 'low'
     session = interpreter.createSession()
     input_tensor = interpreter.getSessionInput(session)
     image = cv2.imread('ILSVRC2012_val_00049999.JPEG')
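
Editor's note: a hedged C++ counterpart of the Python change above, for readers using the native API: it enables the session cache and requests low-precision compute. Interpreter::setCacheFile and the exact BackendConfig fields are assumed from the public MNN headers of this release, not shown in this diff.

// Sketch only: mirrors interpreter.setCacheFile('.tempcache') and config['precision'] = 'low'.
#include <memory>
#include <MNN/Interpreter.hpp>

int main() {
    std::shared_ptr<MNN::Interpreter> net(
        MNN::Interpreter::createFromFile("mobilenet_v1.mnn"));  // model path as in the Python demo
    if (!net) {
        return 1;
    }
    net->setCacheFile(".tempcache");   // assumed C++ equivalent of the Python call above
    MNN::ScheduleConfig config;
    MNN::BackendConfig backendConfig;
    backendConfig.precision = MNN::BackendConfig::Precision_Low;  // 'low' precision
    config.backendConfig = &backendConfig;
    MNN::Session* session = net->createSession(config);
    return session != nullptr ? 0 : 1;
}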
@@ -96,8 +96,7 @@ def demo():
     train_dataloader = MNN.data.DataLoader(train_dataset, batch_size = 64, shuffle = True)
     test_dataloader = MNN.data.DataLoader(test_dataset, batch_size = 100, shuffle = False)

-    opt = MNN.optim.SGD(0.01, 0.9, 0.0005)
-    opt.append(model.parameters)
+    opt = MNN.optim.SGD(model, 0.01, 0.9, 0.0005)

     F.set_thread_number(4)
@@ -125,8 +125,7 @@ def demo():

     net = Net(feature_extractor, num_classes)

-    opt = MNN.optim.SGD(1e-3, 0.9, 0.00004)
-    opt.append(net.parameters)
+    opt = MNN.optim.SGD(net, 1e-3, 0.9, 0.00004)

     for epoch in range(10):
         train_func(net, train_dataloader, opt, num_classes)
@@ -0,0 +1,15 @@
+import numpy as np
+import MNN
+nn = MNN.nn
+F = MNN.expr
+
+v0 = F.const([0.3,0.1, -0.3,0.4], [4])
+v2 = F.const([0.3,0.1, -0.3,0.4], [4])
+v1 = v0 * v0
+
+outputDiff = F.const([0.05, 0.03, 0.02, 0.01], [4])
+
+v0Grad = nn.grad(v1, [v0, v2], [outputDiff], "")
+print(v0Grad)
+print(v0Grad[0].read())
+F.save(v0Grad, "temp.grad")
@@ -0,0 +1,36 @@
+import numpy as np
+import MNN
+nn = MNN.nn
+F = MNN.expr
+
+class Net(nn.Module):
+    """construct a lenet 5 model"""
+    def __init__(self):
+        super(Net, self).__init__()
+        self.conv1 = nn.conv(1, 20, [5, 5])
+        self.conv2 = nn.conv(20, 50, [5, 5])
+        self.fc1 = nn.linear(800, 500)
+        self.fc2 = nn.linear(500, 10)
+        self.step = F.const([10], [], F.NCHW, F.int)
+        self.lr = F.const([0.0004],[], F.NCHW, F.float)
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = F.max_pool(x, [2, 2], [2, 2])
+        x = F.relu(self.conv2(x))
+        x = F.max_pool(x, [2, 2], [2, 2])
+        x = F.reshape(x, [0, -1])
+        x = F.relu(self.fc1(x))
+        x = self.fc2(x)
+        x = F.softmax(x, 1)
+        return x
+
+
+model = Net()
+F.save(model.parameters, 'mnist.snapshot')
+
+
+model2 = Net()
+model2.load_parameters(F.load_as_list('mnist.snapshot'))
+
+print(model2.lr.read())
+print(model2.step.read())
Some files were not shown because too many files have changed in this diff