GitHub release 1.1.0

This commit is contained in:
Hui Shu 2020-11-05 16:41:56 +08:00
parent 939a80dba8
commit d6795ad031
1296 changed files with 98954 additions and 55065 deletions

8
.gitignore vendored
View File

@ -330,7 +330,6 @@ project/android/.idea/caches/build_file_checksums.ser
# FIXME(haijing): Xcode pre-build stage breaks compilation of flatbuffers by setting envs that do cmake cross-compilation for iOS
# schema/current
schema/private
schema/current
tools/converter/source/IR
benchmark/benchmark.txt
@ -345,18 +344,13 @@ pymnn/android/.idea/modules.xml
pymnn/android/.idea/runConfigurations.xml
pymnn/android/.idea/vcs.xml
pymnn/android/.idea/caches/build_file_checksums.ser
pymnn/src/pybind_private/
buildios
build*/
include/MNN/VCS.h
source/backend/opencl/execution/cl/codegen/opencl_program.cc
source/backend/opencl/execution/cl/opencl_program.cc
# FIXME(haijing): MTL issues.....
# source/backend/metal/MetalOPRegister.mm
source/backend/opengl/AllShader.cpp
include/MNN/backend/opengl/shaders/AllShader.h
source/backend/vulkan/compiler/AllShader.cpp
include/MNN/backend/vulkan/shaders/AllShader.h
.idea
project/ios/ios_64
project/ios/ios_32

View File

@ -49,6 +49,7 @@ include(FindPythonInterp REQUIRED)
option(MNN_USE_SYSTEM_LIB "For opencl and vulkan, use system lib or use dlopen" OFF)
option(MNN_BUILD_HARD "Build -mfloat-abi=hard or not" OFF)
option(MNN_BUILD_SHARED_LIBS "MNN build shared or static lib" ON)
option(MNN_WIN_RUNTIME_MT "MNN use /MT on Windows dll" OFF)
option(MNN_FORBID_MULTI_THREAD "Disable Multi Thread" OFF)
option(MNN_OPENMP "Use OpenMP's thread pool implementation. Does not work on iOS or Mac OS" OFF)
option(MNN_USE_THREAD_POOL "Use MNN's own thread pool implementation" ON)
@ -62,14 +63,14 @@ option(MNN_SUPPORT_TFLITE_QUAN "Enable MNN's tflite quantized op" ON)
option(MNN_DEBUG_MEMORY "MNN Debug Memory Access" OFF)
option(MNN_DEBUG_TENSOR_SIZE "Enable Tensor Size" OFF)
option(MNN_GPU_TRACE "Enable MNN Gpu Debug" OFF)
option(MNN_OPENCL_LWS_TUNE "Enable MNN OpenCL Lws Tuning" ON)
option(MNN_PORTABLE_BUILD "Link the static version of third party libraries where possible to improve the portability of built executables" OFF)
option(MNN_SEP_BUILD "Build MNN Backends and expression separately. Only works with MNN_BUILD_SHARED_LIBS=ON" ON)
option(NATIVE_LIBRARY_OUTPUT "Native Library Path" OFF)
option(NATIVE_INCLUDE_OUTPUT "Native Include Path" OFF)
option(MNN_AAPL_FMWK "Build MNN.framework instead of traditional .a/.dylib" OFF)
option(MNN_FMA_ENABLE "x86 routine use fma extension" OFF)
option(MNN_WITH_PLUGIN "Build with plugin op support." OFF)
option(MNN_BUILD_MINI "Build MNN-MINI that just supports fixed shape models." OFF)
option(MNN_USE_SSE "Use SSE optimization for x86 if possible" ON)
IF(NOT MNN_BUILD_SHARED_LIBS)
message(WARNING "Close MNN_SEP_BUILD for static library")
@ -79,13 +80,14 @@ IF(APPLE AND MNN_AAPL_FMWK AND MNN_SEP_BUILD)
message(WARNING "MNN_SEP_BUILD AND MNN_AAPL_FMWK can't coexist. Turning off MNN_SEP_BUILD")
SET(MNN_SEP_BUILD OFF CACHE BOOL "<docstring>" FORCE)
ENDIF()
IF(MSVC OR WIN32)
IF(WIN32)
IF(MNN_SEP_BUILD)
message(WARNING "MNN_SEP_BUILD IS TROUBLESOME ON Windows. Forcing OFF...")
SET(MNN_SEP_BUILD OFF CACHE BOOL "<docstring>" FORCE)
ENDIF()
SET(MNN_USE_SYSTEM_LIB ON CACHE BOOL "<docstring>" FORCE)
add_definitions(-D_CRT_SECURE_NO_WARNINGS)
IF(MSVC)
# generate optimized (release) exe and library with pdb debug file, https://stackoverflow.com/a/31264946
SET(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
SET(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
@ -95,11 +97,12 @@ IF(MSVC OR WIN32)
SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4018 /wd4251 /wd4996 /wd4244 /wd4146 /wd4129 /wd4305 /wd4275")
ENDIF()
ENDIF()
include(${CMAKE_CURRENT_LIST_DIR}/cmake/macros.cmake)
IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT MNN_BUILD_SHARED_LIBS AND NOT (MSVC OR WIN32))
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static")
SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
SET(MNN_SEP_BUILD OFF CACHE BOOL "<docstring>" FORCE)
IF(MNN_BUILD_CONVERTER)
SET(MNN_PORTABLE_BUILD ON CACHE BOOL "<docstring>" FORCE)
@ -117,6 +120,9 @@ endif()
if(MNN_SUPPORT_TFLITE_QUAN)
add_definitions(-DMNN_SUPPORT_TFLITE_QUAN)
endif()
if(MNN_BUILD_MINI)
add_definitions(-DMNN_BUILD_MINI)
endif()
# debug options
if(MNN_DEBUG_MEMORY)
@ -128,9 +134,6 @@ endif()
if(MNN_GPU_TRACE)
add_definitions(-DMNN_GPU_FORCE_FINISH)
endif()
if(MNN_OPENCL_LWS_TUNE)
add_definitions(-DMNN_OPENCL_LWS_TUNE)
endif()
# backend options
option(MNN_METAL "Enable Metal" OFF)
@ -138,11 +141,8 @@ option(MNN_OPENCL "Enable OpenCL" OFF)
option(MNN_OPENGL "Enable OpenGL" OFF)
option(MNN_VULKAN "Enable Vulkan" OFF)
option(MNN_ARM82 "Enable ARM82" OFF)
# codegen register ops
if (MNN_METAL)
add_definitions(-DMNN_CODEGEN_REGISTER)
endif()
option(MNN_CUDA "Enable CUDA" OFF)
option(MNN_TENSORRT "Enable TensorRT" OFF)
# target options
option(MNN_BUILD_BENCHMARK "Build benchmark or not" OFF)
@ -165,11 +165,13 @@ message(STATUS "\tOpenCL: ${MNN_OPENCL}")
message(STATUS "\tOpenGL: ${MNN_OPENGL}")
message(STATUS "\tVulkan: ${MNN_VULKAN}")
message(STATUS "\tARM82: ${MNN_ARM82}")
message(STATUS "\tTensorRT: ${MNN_TENSORRT}")
message(STATUS "\tCUDA: ${MNN_CUDA}")
message(STATUS "\tOpenMP: ${MNN_OPENMP}")
message(STATUS "\tHidden: ${MNN_HIDDEN}")
message(STATUS "\tBuild Path: ${CMAKE_CURRENT_BINARY_DIR}")
if(WIN32)
if(MSVC)
if(${CMAKE_VERSION} VERSION_LESS "3.14.0")
message(FATAL_ERROR "MNN requires CMake 3.14+ to build on Windows!")
endif()
@ -178,14 +180,14 @@ if(WIN32)
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
if (MNN_BUILD_SHARED_LIBS)
if(${flag_var} MATCHES "/MT")
string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
endif()
else ()
if (MNN_WIN_RUNTIME_MT)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif()
else ()
if(${flag_var} MATCHES "/MT")
string(REGEX REPLACE "/MT" "/MD" ${flag_var} "${${flag_var}}")
endif()
endif ()
endforeach()
elseif(CMAKE_SYSTEM_NAME MATCHES "^Android" OR CMAKE_SYSTEM_NAME MATCHES "^Linux")
@ -270,6 +272,8 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "^Linux")
endif()
include_directories(${CMAKE_CURRENT_LIST_DIR}/include/
${CMAKE_CURRENT_LIST_DIR}/source/
${CMAKE_CURRENT_LIST_DIR}/express/
${CMAKE_CURRENT_LIST_DIR}/tools/
${CMAKE_CURRENT_LIST_DIR}/schema/current/
${CMAKE_CURRENT_LIST_DIR}/3rd_party/
${CMAKE_CURRENT_LIST_DIR}/3rd_party/flatbuffers/include
@ -293,14 +297,14 @@ FILE(GLOB MNN_CV_SRC ${CMAKE_CURRENT_LIST_DIR}/source/cv/*)
add_library(MNNCV OBJECT ${MNN_CV_SRC})
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNCV>)
list(APPEND MNN_TARGETS MNNCV)
if (MNN_USE_SSE)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)|(i686)")
if(WIN32 OR MSVC)
target_compile_options(MNNCV PRIVATE /arch:AVX)
else()
if (NOT MSVC)
target_compile_options(MNNCV PRIVATE -msse3)
target_compile_options(MNNCV PRIVATE -mavx)
endif()
endif()
endif()
# Math
FILE(GLOB MNN_Math_SRC ${CMAKE_CURRENT_LIST_DIR}/source/math/*)
@ -308,11 +312,19 @@ add_library(MNNMath OBJECT ${MNN_Math_SRC})
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNMath>)
list(APPEND MNN_TARGETS MNNMath)
# Shape
FILE(GLOB MNN_Shape_SRC ${CMAKE_CURRENT_LIST_DIR}/source/shape/*)
add_library(MNNShape OBJECT ${MNN_Shape_SRC})
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNShape>)
list(APPEND MNN_TARGETS MNNShape)
# Transform
FILE(GLOB MNN_Transform_SRC ${CMAKE_CURRENT_LIST_DIR}/source/shape/* ${CMAKE_CURRENT_LIST_DIR}/source/geometry/*)
add_library(MNNTransform OBJECT ${MNN_Transform_SRC})
IF (NOT MNN_BUILD_MINI)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNTransform>)
ENDIF()
list(APPEND MNN_TARGETS MNNTransform)
# Utils
FILE(GLOB MNN_Utils_SRC ${CMAKE_CURRENT_LIST_DIR}/source/utils/*)
add_library(MNNUtils OBJECT ${MNN_Utils_SRC})
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNUtils>)
list(APPEND MNN_TARGETS MNNUtils)
# Compute
FILE(GLOB MNN_Compute_SRC ${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/compute/*)
@ -327,7 +339,9 @@ list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNCPU>)
list(APPEND MNN_TARGETS MNNCPU)
# X86_64 AVX/SSE
if (MNN_USE_SSE)
include(${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/x86_x64/CMakeLists.txt)
endif()
# AArch32/64 Assemblies
include(${CMAKE_CURRENT_LIST_DIR}/source/backend/cpu/arm/CMakeLists.txt)
@ -377,7 +391,7 @@ if (NOT APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
if (WIN32)
if (MSVC)
set(OpenMP_C_FLAGS "/openmp ${OpenMP_C_FLAGS}")
set(OpenMP_CXX_FLAGS "/openmp ${OpenMP_CXX_FLAGS}")
endif()
@ -387,20 +401,22 @@ endif()
set(CMAKE_CXX_FLAGS_ORIGIN ${CMAKE_CXX_FLAGS})
set(CMAKE_C_FLAGS_ORIGIN ${CMAKE_C_FLAGS})
if ((NOT (MSVC OR WIN32)) AND MNN_HIDDEN)
if ((NOT MSVC) AND MNN_HIDDEN)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility-inlines-hidden -fvisibility=hidden")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
if (NOT APPLE)
# Omitting the frame pointer may make debugging harder
if ((NOT APPLE) AND (NOT WIN32))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fomit-frame-pointer")
endif()
endif()
if (NOT (MSVC OR WIN32))
if (NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math -fno-rtti -fno-exceptions ")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math")
endif()
# Metal
include(${CMAKE_CURRENT_LIST_DIR}/source/backend/metal/CMakeLists.txt)
set(MNN_DEPS "")
set(MNN_EXTRA_DEPENDS "")
list(APPEND MNN_DEPS MNN)
# Plugin
@ -409,6 +425,14 @@ if(MNN_WITH_PLUGIN)
include(${CMAKE_CURRENT_LIST_DIR}/source/plugin/CMakeLists.txt)
endif()
# Metal
if(MNN_METAL AND APPLE)
add_definitions(-DMNN_METAL_ENABLED=1)
include(${CMAKE_CURRENT_LIST_DIR}/source/backend/metal/CMakeLists.txt)
list(APPEND MNN_TARGETS MNNMetal)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNNMetal>)
endif()
# Vulkan
IF(MNN_VULKAN)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/vulkan/)
@ -446,22 +470,34 @@ IF(MNN_OPENGL)
ENDIF()
ENDIF()
# CUDA
IF(MNN_CUDA)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/cuda/)
list(APPEND MNN_TARGETS MNN_CUDA)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_CUDA>)
list(APPEND MNN_EXTRA_DEPENDS ${MNN_CUDA_LIBS})
ENDIF()
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR IOS_ARCH STREQUAL "arm64")
# ARM82 Assemblies
IF(MNN_ARM82)
add_definitions(-DENABLE_ARMV82)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/arm82/)
IF(MNN_SEP_BUILD)
list(APPEND MNN_DEPS MNN_Arm82)
ELSE()
list(APPEND MNN_TARGETS MNN_Arm82)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_Arm82>)
ENDIF()
ENDIF()
ENDIF()
# Express
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/express/)
# TensorRT
IF(MNN_TENSORRT)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/tensorrt/)
list(APPEND MNN_TARGETS MNN_TRT)
list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_TRT>)
list(APPEND MNN_EXTRA_DEPENDS ${MNN_TRT_LIBS})
ENDIF()
IF(MNN_SEP_BUILD)
add_library(MNN SHARED ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS})
target_link_libraries(MNN PUBLIC ${MNN_EXTRA_DEPENDS})
@ -471,7 +507,7 @@ ELSE()
list(APPEND MNN_TARGETS MNNExpress)
IF(MNN_BUILD_SHARED_LIBS)
add_library(MNN SHARED ${CMAKE_CURRENT_LIST_DIR}/cmake/dummy.cpp ${MNN_OBJECTS_TO_LINK} ${MNN_PUB_HDRS} ${MNN_EXPR_PUB_HDRS})
if (MSVC OR WIN32)
if (WIN32)
foreach(TARGET ${MNN_TARGETS})
target_compile_definitions(${TARGET} PRIVATE "-DBUILDING_MNN_DLL")
target_compile_definitions(${TARGET} INTERFACE "-DUSING_MNN_DLL")
@ -484,7 +520,7 @@ ELSE()
ENDIF()
target_link_libraries(MNN PUBLIC ${MNN_EXTRA_DEPENDS})
ENDIF()
if (MSVC OR WIN32)
if (MSVC)
target_link_options(MNN PRIVATE "/IGNORE:4049,4217")
endif()
@ -504,9 +540,11 @@ if(APPLE)
target_link_libraries(MNN PUBLIC ${FOUNDATION})
find_library(METAL Metal REQUIRED)
target_link_libraries(MNN PUBLIC ${METAL})
find_library(GRAPHIC CoreGraphics)
target_link_libraries(MNN PUBLIC ${GRAPHIC})
ENDIF()
endif()
add_dependencies(MNN MNNCore MNNCV MNNShape MNNMath MNNCompute MNNCPU GenVCSHDR)
add_dependencies(MNN MNNCore MNNCV MNNTransform MNNMath MNNCompute MNNCPU GenVCSHDR)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/tools/converter)
if(CMAKE_SYSTEM_NAME MATCHES "^Linux")
@ -532,12 +570,6 @@ if (NOT MNN_BUILD_SHARED_LIBS)
endif()
endif()
list(APPEND MNN_TARGETS MNN)
FOREACH(TARGET ${MNN_TARGETS})
IF((NOT MSVC) AND (NOT WIN32))
else()
target_compile_definitions(${TARGET} PRIVATE _CRT_SECURE_NO_WARNINGS)
endif()
ENDFOREACH()
list(REMOVE_ITEM MNN_TARGETS MNN)
IF(MNN_BUILD_DEMO)
include(${CMAKE_CURRENT_LIST_DIR}/demo/exec/CMakeLists.txt)

View File

@ -46,6 +46,7 @@ Pod::Spec.new do |s|
'schema/current/*.{h}',\
'3rd_party/flatbuffers/include/flatbuffers/*.{h}',\
'source/core/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/geometry/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/cv/**/*.{h,c,m,mm,cc,hpp,cpp}',\
'source/math/**/*.{h,c,m,mm,cc,hpp,cpp,metal}',\
'source/shape/*.{h,c,m,mm,cc,hpp,cpp}',\

View File

@ -66,7 +66,7 @@ The Interpreter consists of the Engine and Backends. The former handles model loading and the computation graph
Group 3:
<img src="doc/DingTalkQR3.png" height="256"/>
<img src="doc/DingTalkQR23.png" height="256"/>
## License
Apache 2.0

View File

@ -0,0 +1,89 @@
//
// CPUBatchMatMul.cpp
// MNN
//
// Created by MNN on 2019/03/25.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "backend/cpu/CPUBatchMatMul.hpp"
#include "backend/cpu/CPUBackend.hpp"
#include "math/Matrix.hpp"
namespace MNN {
CPUBatchMatMul::CPUBatchMatMul(Backend* backend, bool adjX, bool adjY) : Execution(backend) {
mMatMul.reset(new CPUMatMul(backend, adjX, adjY, true));
}
ErrorCode CPUBatchMatMul::onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto input0 = inputs[0];
auto input1 = inputs[1];
auto output = outputs[0];
// Fill the output with zeros if one of the inputs is empty.
if (input0->elementSize() == 0 || input1->elementSize() == 0) {
return NO_ERROR;
}
auto dimensions = input0->dimensions();
mMatrixA.reset(Tensor::createDevice<float>({input0->length(input0->dimensions()-2), input0->length(input0->dimensions()-1)}));
mMatrixB.reset(Tensor::createDevice<float>({input1->length(input1->dimensions()-2), input1->length(input0->dimensions()-1)}));
mMatrixC.reset(Tensor::createDevice<float>({output->length(output->dimensions()-2), output->length(output->dimensions()-1)}));
mTempInputs = {mMatrixA.get(), mMatrixB.get()};
mTempOutputs = {mMatrixC.get()};
auto res = backend()->onAcquireBuffer(mMatrixA.get(), Backend::DYNAMIC);
res = res && backend()->onAcquireBuffer(mMatrixB.get(), Backend::DYNAMIC);
res = res && backend()->onAcquireBuffer(mMatrixC.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
int batch = 1;
for (int i = 0; i < dimensions - 2; ++i) {
batch *= input0->length(i);
}
mBatch = batch;
auto code = mMatMul->onResize(mTempInputs, mTempOutputs);
backend()->onReleaseBuffer(mMatrixA.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(mMatrixB.get(), Backend::DYNAMIC);
backend()->onReleaseBuffer(mMatrixC.get(), Backend::DYNAMIC);
return code;
}
ErrorCode CPUBatchMatMul::onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) {
auto input0 = inputs[0];
auto input1 = inputs[1];
auto output = outputs[0];
// Fill the output with zeros if one of the inputs is empty.
if (input0->elementSize() == 0 || input1->elementSize() == 0) {
::memset(output->host<float>(), 0, output->size());
return NO_ERROR;
}
const int dimensions = input0->dimensions();
MNN_ASSERT(dimensions >= 3);
const int input0Stride = input0->length(dimensions - 1) * input0->length(dimensions - 2);
const int input1Stride = input1->length(dimensions - 1) * input1->length(dimensions - 2);
const int outputStride = output->length(dimensions - 1) * output->length(dimensions - 2);
const auto input0Ptr = input0->host<float>();
const auto input1Ptr = input1->host<float>();
float* const outputPtr = output->host<float>();
for (int i = 0; i < mBatch; ++i) {
::memcpy(mMatrixA->host<float>(), input0Ptr + i * input0Stride, input0Stride * sizeof(float));
::memcpy(mMatrixB->host<float>(), input1Ptr + i * input1Stride, input1Stride * sizeof(float));
mMatMul->onExecute(mTempInputs, mTempOutputs);
::memcpy(outputPtr + i * outputStride, mMatrixC->host<float>(), outputStride * sizeof(float));
}
return NO_ERROR;
}
class CPUBatchMatMulCreator : public CPUBackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const override {
return new CPUBatchMatMul(backend, op->main_as_BatchMatMulParam()->adjX(), op->main_as_BatchMatMulParam()->adjY());
}
};
REGISTER_CPU_OP_CREATOR(CPUBatchMatMulCreator, OpType_BatchMatMul);
} // namespace MNN
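For reference, the execute loop above runs the shared 2-D CPUMatMul once per batch slice; in other words (a sketch of the intended semantics only, with adjX/adjY understood as optional transposes of the left/right operand):

$$ C_b = \mathrm{op}_{\text{adjX}}(A_b)\,\mathrm{op}_{\text{adjY}}(B_b), \qquad b = 0, \dots, \text{batch}-1 $$

where op transposes its argument when the corresponding adj flag is set and is the identity otherwise.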

View File

@ -0,0 +1,35 @@
//
// CPUBatchMatMul.hpp
// MNN
//
// Created by MNN on 2019/03/25.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef CPUBatchMatMul_hpp
#define CPUBatchMatMul_hpp
#include "backend/cpu/CPUMatMul.hpp"
namespace MNN {
class CPUBatchMatMul : public Execution {
public:
CPUBatchMatMul(Backend *backend, bool adjX, bool adjY);
virtual ~CPUBatchMatMul() = default;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
int mBatch;
std::shared_ptr<Execution> mMatMul;
std::vector<Tensor*> mTempInputs;
std::vector<Tensor*> mTempOutputs;
std::shared_ptr<Tensor> mMatrixA;
std::shared_ptr<Tensor> mMatrixB;
std::shared_ptr<Tensor> mMatrixC;
};
} // namespace MNN
#endif /* CPUBatchMatMul_hpp */

View File

@ -18,7 +18,6 @@
#include "backend/cpu/compute/ConvOpt.h"
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/compute/ConvolutionFloatFactory.h"
#include "math/Vec4.hpp"
#define MIN_CON_PLANESIZE 256

View File

@ -10,7 +10,9 @@
#include <math.h>
#include "backend/cpu/CPUBackend.hpp"
#include "core/Macro.h"
#include "math/Vec4.hpp"
#include "math/Vec.hpp"
using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {
@ -39,12 +41,12 @@ ErrorCode CPUCosineSimilarity::onExecute(const std::vector<Tensor*>& inputs, con
const auto x1ChannelPtr = x1DataBatchPtr + j;
const auto x2ChannelPtr = x2DataBatchPtr + j;
Math::Vec4 innerProduct(.0f);
Math::Vec4 x1Square(.0f);
Math::Vec4 x2Square(.0f);
Vec4 innerProduct(.0f);
Vec4 x1Square(.0f);
Vec4 x2Square(.0f);
for (int c = 0; c < channel; ++c) {
Math::Vec4 x1Data = Math::Vec4::load(x1ChannelPtr + c * channleStride);
Math::Vec4 x2Data = Math::Vec4::load(x2ChannelPtr + c * channleStride);
Vec4 x1Data = Vec4::load(x1ChannelPtr + c * channleStride);
Vec4 x2Data = Vec4::load(x2ChannelPtr + c * channleStride);
auto x1Xx2 = x1Data * x2Data;
innerProduct = innerProduct + x1Xx2;
x1Square = x1Square + x1Data * x1Data;
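The three Vec4 accumulators above (innerProduct, x1Square, x2Square) build up the standard cosine similarity per output position; as a reference formula only (the final lane reduction and division happen in code outside this hunk):

$$ \cos(x_1, x_2) = \frac{\langle x_1, x_2 \rangle}{\sqrt{\langle x_1, x_1 \rangle}\,\sqrt{\langle x_2, x_2 \rangle}} $$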

View File

@ -12,8 +12,8 @@
#include "core/Concurrency.h"
#include "core/Macro.h"
#include "math/Vec4.hpp"
using MNN::Math::Vec4;
#include "math/Vec.hpp"
using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {

View File

@ -21,7 +21,7 @@ public:
auto parameter = op->main_as_InnerProduct();
int outputCount = parameter->outputCount();
int srcCount = parameter->weight()->size() / outputCount;
mWeight.reset(CPUConvolution::reorderWeightSize(srcCount, outputCount, 1, 4));
mWeight.reset(CPUConvolution::reorderWeightSize(srcCount, outputCount, 1, 4, 4));
if (mWeight.get() == nullptr) {
mValid = false;
return;

View File

@ -180,6 +180,14 @@ ErrorCode CPULSTM::onResize(const std::vector<Tensor *> &inputs, const std::vect
::memcpy(mBiasC->host<float>(), mLSTM->bias()->float32s()->data(), mBiasC->size());
::memcpy(mWeightH->host<float>(), mLSTM->weightH()->float32s()->data(), mWeightH->size());
}
if (mGateHaveBias) {
// Merge bias
auto biasPtr = mBiasC->host<float>();
auto biasPtr2 = biasPtr + 4 * numUnits;
for (int i=0; i<4*numUnits; ++i) {
biasPtr[i] = biasPtr[i] + biasPtr2[i];
}
}
}
if (inputs.size() > 1) {
@ -260,16 +268,8 @@ ErrorCode CPULSTM::onExecute(const std::vector<Tensor *> &inputs, const std::vec
MNN_CONCURRENCY_END();
float* biasStartPtr = mBiasC->host<float>();
if(!mGateHaveBias){
biasStartPtr = nullptr;
}
mRetriveOutputFunction(mGates.host<float>(), biasStartPtr);
float* recurrenceBiasStartPtr = mBiasC->host<float>();
if(mGateHaveBias){
recurrenceBiasStartPtr += 4 * numUnits;
}
// transform
const float *contData = nullptr;
if (inputs.size() > 1) {
@ -330,14 +330,11 @@ ErrorCode CPULSTM::onExecute(const std::vector<Tensor *> &inputs, const std::vec
}
// add bias
auto biasPtr = recurrenceBiasStartPtr + oc;
I = sigmoid(*biasPtr + I);
biasPtr = biasPtr + numUnits;
F = sigmoid(*biasPtr + F);
biasPtr = biasPtr + numUnits;
O = sigmoid(*biasPtr + O);
biasPtr = biasPtr + numUnits;
G = tanhf(*biasPtr + G);
//MNN_PRINT("%f, %f, %f, %f\n", I, O, F, G);
I = sigmoid(I);
F = sigmoid(F);
O = sigmoid(O);
G = tanhf(G);
auto newCell = F * cellData[oc] + I * G;
cellData[oc] = newCell;
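With both bias halves folded into a single vector at resize time (the "Merge bias" block above), the gate math at execute time is the usual LSTM cell update already visible in this hunk; written out as a reference (symbols mirror the I/F/O/G variables in the code; the hidden-state output is computed outside this hunk):

$$ i = \sigma(\hat{i}),\quad f = \sigma(\hat{f}),\quad o = \sigma(\hat{o}),\quad g = \tanh(\hat{g}), \qquad c_{\text{new}} = f \cdot c_{\text{old}} + i \cdot g $$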

View File

@ -0,0 +1,311 @@
//
// CPUSoftmax.cpp
// MNN
//
// Created by MNN on 2018/07/16.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "backend/cpu/CPUSoftmax.hpp"
#include <math.h>
#include "backend/cpu/CPUBackend.hpp"
#include "backend/cpu/compute/CommonOptFunction.h"
#include "core/Concurrency.h"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#ifdef MNN_USE_NEON
#include <arm_neon.h>
#endif
namespace MNN {
int CPUSoftmax::_softmax1(const float *srcData, float *dstData, int outside, int channel, int threadNum) {
// Max and sub
MNN_CONCURRENCY_BEGIN(tId, threadNum)
{
const float *srcY = srcData + tId * channel;
float *dstY = dstData + tId * channel;
for (int y = (int)tId; y < outside; y += threadNum, srcY += channel * threadNum, dstY += channel * threadNum) {
float maxValue = srcY[0];
{
int c = 1;
#ifdef MNN_USE_NEON
#if !(defined(__ARM_FEATURE_FMA) && defined(__aarch64__))
#define vmaxvq_f32(v) \
({ \
float __m = v[0]; \
for (int i = 1; i < 4; i++) { \
if (v[i] > __m) \
__m = v[i]; \
} \
__m; \
})
#endif
if (c + 3 < channel) {
float32x4_t maxx4 = vld1q_f32(srcY + c);
c += 4;
for (; c + 3 < channel; c += 4) {
maxx4 = vmaxq_f32(maxx4, vld1q_f32(srcY + c));
}
float value = vmaxvq_f32(maxx4);
if (value > maxValue)
maxValue = value;
}
#endif
for (; c < channel; ++c) {
float value = srcY[c];
if (value > maxValue)
maxValue = value;
}
}
for (int c = 0; c < channel; ++c) {
dstY[c] = -srcY[c] + maxValue;
}
}
}
MNN_CONCURRENCY_END();
//Exp
auto schedule = ((CPUBackend*)backend())->multiThreadDivide(channel * outside);
int sizeDivide = schedule.first;
int scheduleNumber = schedule.second;
MNN_CONCURRENCY_BEGIN(tId, scheduleNumber) {
int start = sizeDivide * (int)tId;
int realSize = sizeDivide;
if (tId == scheduleNumber -1 ) {
realSize = channel * outside - start;
}
if (realSize > 0) {
MNNExp(dstData + start, dstData + start, realSize);
}
}
MNN_CONCURRENCY_END();
// Sum and div
MNN_CONCURRENCY_BEGIN(tId, threadNum);
{
float *dstY = dstData + tId * channel;
for (int y = (int)tId; y < outside; y += threadNum, dstY += channel * threadNum) {
// sum
float sumValue = 0;
for (int c = 0; c < channel; ++c) {
sumValue += dstY[c];
}
// div
{
int c = 0;
#ifdef MNN_USE_NEON
float div = 1.f / sumValue;
for (; c + 3 < channel; c += 4) {
vst1q_f32(dstY + c, vmulq_n_f32(vld1q_f32(dstY + c), div));
}
#endif
for (; c < channel; ++c) {
dstY[c] /= sumValue;
}
}
}
}
MNN_CONCURRENCY_END();
return 0;
}
int CPUSoftmax::_softmaxCommon(const float *srcData, float *dstData, int inside, int outside, int channel,
float *maxValue, float *sumValue, int threadNum) {
if (inside == 1)
return _softmax1(srcData, dstData, outside, channel, threadNum);
const int stepY = inside * channel;
MNN_CONCURRENCY_BEGIN(tId, threadNum);
{
const float *srcY = srcData + tId * stepY;
float *dstY = dstData + tId * stepY;
float *maxValueSub = maxValue + tId * inside;
for (int y = (int)tId; y < outside; y += threadNum, srcY += stepY * threadNum, dstY += stepY * threadNum) {
memcpy(maxValueSub, srcY, sizeof(float) * inside);
const float *src = srcY + inside;
for (int c = 1; c < channel; ++c, src += inside) {
for (int x = 0; x < inside; ++x) {
if (src[x] > maxValueSub[x])
maxValueSub[x] = src[x];
}
}
src = srcY;
float *dst = dstY;
for (int c = 0; c < channel; ++c, src += inside, dst += inside) {
for (int x = 0; x < inside; ++x) {
dst[x] = -src[x] + maxValueSub[x];
}
}
}
}
MNN_CONCURRENCY_END();
auto totalSize = channel * inside * outside;
//Exp
auto schedule = ((CPUBackend*)backend())->multiThreadDivide(totalSize);
int sizeDivide = schedule.first;
int scheduleNumber = schedule.second;
MNN_CONCURRENCY_BEGIN(tId, scheduleNumber) {
int start = sizeDivide * (int)tId;
int realSize = sizeDivide;
if (tId == scheduleNumber -1 ) {
realSize = totalSize - start;
}
if (realSize > 0) {
MNNExp(dstData + start, dstData + start, realSize);
}
}
MNN_CONCURRENCY_END();
MNN_CONCURRENCY_BEGIN(tId, threadNum);
{
const float *srcY = srcData + tId * stepY;
float *dstY = dstData + tId * stepY;
float *sumValueSub = sumValue + tId * inside;
for (int y = (int)tId; y < outside; y += threadNum, srcY += stepY * threadNum, dstY += stepY * threadNum) {
memset(sumValueSub, 0, sizeof(float) * inside);
float *dst = dstY;
for (int c = 0; c < channel; ++c, dst += inside) {
for (int x = 0; x < inside; ++x) {
sumValueSub[x] += dst[x];
}
}
dst = dstY;
for (int c = 0; c < channel; ++c, dst += inside) {
for (int x = 0; x < inside; ++x) {
dst[x] /= sumValueSub[x];
}
}
}
}
MNN_CONCURRENCY_END();
return 0;
}
ErrorCode CPUSoftmax::onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
auto input = inputs[0];
const int dimensions = input->buffer().dimensions;
const auto layout = TensorUtils::getDescribe(input)->dimensionFormat;
mNeedUnpackC4 = layout == MNN_DATA_FORMAT_NC4HW4;
if (mNeedUnpackC4) {
int totalSize = 1;
for (int i = 1; i < dimensions; ++i) {
totalSize *= input->length(i);
}
mStorage.buffer().dim[0].extent = input->length(0);
mStorage.buffer().dim[1].extent = totalSize;
TensorUtils::getDescribe(&mStorage)->dimensionFormat = MNN_DATA_FORMAT_NHWC;
mStorage.buffer().dimensions = 2;
mStorage.buffer().type = input->getType();
backend()->onAcquireBuffer(&mStorage, Backend::DYNAMIC);
}
int inside = 1;
int dims = input->buffer().dimensions;
for (int i = mAxis + 1; i < dims; ++i) {
inside *= input->length(i);
}
if (inside != 1) { // _softmax1 will not be used, so we need the maxValue and sumValue tensors.
int threadNum = ((CPUBackend *)backend())->threadNumber();
mMaxValue.buffer().dim[0].extent = inside * threadNum;
mMaxValue.buffer().dimensions = 1;
mMaxValue.setType(DataType_DT_FLOAT);
backend()->onAcquireBuffer(&mMaxValue, Backend::DYNAMIC);
mSumValue.buffer().dim[0].extent = inside * threadNum;
mSumValue.buffer().dimensions = 1;
mSumValue.setType(DataType_DT_FLOAT);
backend()->onAcquireBuffer(&mSumValue, Backend::DYNAMIC);
backend()->onReleaseBuffer(&mMaxValue, Backend::DYNAMIC);
backend()->onReleaseBuffer(&mSumValue, Backend::DYNAMIC);
}
if (mNeedUnpackC4) {
backend()->onReleaseBuffer(&mStorage, Backend::DYNAMIC);
}
return NO_ERROR;
}
ErrorCode CPUSoftmax::onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(1 == inputs.size());
MNN_ASSERT(1 == outputs.size());
auto inputTensor = inputs[0];
auto outputTensor = outputs[0];
const auto inputDataPtr = inputTensor->host<float>();
auto outputDataPtr = outputTensor->host<float>();
const int batch = inputTensor->batch();
const auto dims = inputTensor->buffer().dimensions;
float *tempData = nullptr;
if (mNeedUnpackC4) {
tempData = mStorage.host<float>();
}
int areaInput = 1;
for (int i = 2; i < dims; ++i) {
areaInput *= inputTensor->length(i);
}
int inside = 1;
int outside = 1;
int channel = 1;
for (int i = 0; i < mAxis; ++i) {
outside *= inputTensor->length(i);
}
channel = inputTensor->length(mAxis);
for (int i = mAxis + 1; i < dims; ++i) {
inside *= inputTensor->length(i);
}
int threadNum = ((CPUBackend *)backend())->threadNumber();
if (!mNeedUnpackC4) {
_softmaxCommon(inputDataPtr, outputDataPtr, inside, outside, channel, mMaxValue.host<float>(),
mSumValue.host<float>(), threadNum);
return NO_ERROR;
}
auto outputSize = outputTensor->elementSize();
int batchSize = outputSize / batch;
for (int batchIndex = 0; batchIndex < batch; ++batchIndex) {
auto inputData = inputDataPtr + batchIndex * batchSize;
MNNUnpackC4(outputDataPtr + batchIndex * mStorage.length(1), inputData, areaInput, inputTensor->channel());
}
_softmaxCommon(outputDataPtr, tempData, inside, outside, channel, mMaxValue.host<float>(), mSumValue.host<float>(), threadNum);
for (int batchIndex = 0; batchIndex < batch; ++batchIndex) {
auto outputData = outputDataPtr + batchIndex * batchSize;
auto tempPtr = tempData + batchIndex * mStorage.length(1);
MNNPackC4(outputData, tempPtr, areaInput, outputTensor->channel());
}
return NO_ERROR;
}
CPUSoftmax::CPUSoftmax(Backend *b, int axis) : MNN::Execution(b), mAxis(axis), mStorage(2), mNeedUnpackC4(false) {
// nothing to do
}
class CPUSoftmaxCreator : public CPUBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const MNN::Op *op, Backend *backend) const override {
auto axis = op->main_as_Axis()->axis();
if (axis < 0) {
axis = inputs[0]->dimensions() + axis;
}
return new CPUSoftmax(backend, axis);
}
};
REGISTER_CPU_OP_CREATOR(CPUSoftmaxCreator, OpType_Softmax);
} // namespace MNN
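Both _softmax1 and _softmaxCommon follow the same three passes (max-subtract, exponentiate, sum-normalize), producing the numerically stable softmax along the chosen axis:

$$ \mathrm{softmax}(x)_c = \frac{e^{\,x_c - \max_j x_j}}{\sum_k e^{\,x_k - \max_j x_j}} $$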

View File

@ -0,0 +1,35 @@
//
// CPUSoftmax.hpp
// MNN
//
// Created by MNN on 2018/07/16.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef CPUSoftmax_hpp
#define CPUSoftmax_hpp
#include "core/Execution.hpp"
namespace MNN {
class CPUSoftmax : public Execution {
public:
CPUSoftmax(Backend *b, int axis);
virtual ~CPUSoftmax() = default;
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override;
private:
int _softmaxCommon(const float *srcData, float *dstData, int inside, int outside, int channel, float *maxValue,
float *sumValue, int threadNum);
int _softmax1(const float *srcData, float *dstData, int outside, int channel, int threadNum);
int mAxis;
Tensor mStorage;
Tensor mMaxValue;
Tensor mSumValue;
bool mNeedUnpackC4;
};
} // namespace MNN
#endif /* CPUSoftmax_hpp */

View File

@ -13,10 +13,8 @@
#include "backend/cpu/compute/ConvOpt.h"
#include "core/Macro.h"
#include "core/TensorUtils.hpp"
#include "math/Vec4.hpp"
using namespace MNN::Math;
typedef Vec4 float4;
#include "math/Vec.hpp"
using Vec4 = MNN::Math::Vec<float, 4>;
#define SOURCE_BLOCK 64
#define WEIGHT_BLOCK 256

View File

@ -0,0 +1,128 @@
//
// GeometryCropAndResize.cpp
// MNN
//
// Created by MNN on 2020/08/5.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "geometry/GeometryComputer.hpp"
#include "core/OpCommonUtils.hpp"
#include "geometry/GeometryComputerUtils.hpp"
#include "ConvertUtils.hpp"
namespace MNN {
class GeometryCropAndResize : public GeometryComputer {
public:
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, Context& context, CommandBuffer& res) const override {
MNN_ASSERT(4 == inputs.size());
MNN_ASSERT(1 == outputs.size());
auto img = inputs[0];
auto boxes = inputs[1];
auto box_ind = inputs[2];
auto crop_size = inputs[3];
auto output = outputs[0];
auto extrapolation = op->main_as_CropAndResize()->extrapolationValue();
auto method = op->main_as_CropAndResize()->method();
// resizeType of Interp : 1-NEAREST, 2-BILINEAR
const int resizeType = method == CropAndResizeMethod_BILINEAR ? 2 : 1;
int batch = img->length(0), ih = img->length(1), iw = img->length(2),
depth = img->length(3), boxNum = boxes->length(0);
const int cropHeight = crop_size->host<uint32_t>()[0],
cropWidth = crop_size->host<uint32_t>()[1];
auto des = TensorUtils::getDescribe(output);
des->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
des->dimensionFormat = MNN_DATA_FORMAT_NHWC;
des->regions.clear();
des->regions.reserve(boxNum);
for (int i = 0; i < boxNum; i++) {
const float y1 = boxes->host<float>()[i*4];
const float x1 = boxes->host<float>()[i*4+1];
const float y2 = boxes->host<float>()[i*4+2];
const float x2 = boxes->host<float>()[i*4+3];
const int ind = box_ind->host<uint32_t>()[i];
const float ch = (y2 - y1) * (ih - 1), cw = (x2 - x1) * (iw - 1);
const float yScale = ch / static_cast<float>(cropHeight - 1);
const float xScale = cw / static_cast<float>(cropWidth - 1);
const float yOffset = y1 * (ih - 1), xOffset = x1 * (iw - 1);
// select the cropped image from the batch and convert its format from NHWC to NC4HW4
std::shared_ptr<Tensor> cropValue(new Tensor);
{
cropValue->buffer().type = halide_type_of<float>();
cropValue->buffer().dimensions = 4;
cropValue->setLength(0, 1);
cropValue->setLength(1, depth);
cropValue->setLength(2, ih);
cropValue->setLength(3, iw);
auto des = TensorUtils::getDescribe(cropValue.get());
des->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
des->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;
des->regions.clear();
Tensor::InsideDescribe::Region region;
region.origin = img;
region.size[1] = depth;
region.size[2] = ih * iw;
region.src.offset = ind * ih * iw * depth;
region.dst.offset = 0;
region.src.stride[1] = 1;
region.src.stride[2] = depth;
region.dst.stride[1] = ih * iw;
region.dst.stride[2] = 1;
des->regions.emplace_back(std::move(region));
res.extras.emplace_back(cropValue);
}
// use the Interp op to crop and resize the selected image
std::shared_ptr<Tensor> resizeValue;
{
resizeValue.reset(Tensor::createDevice<float>({1, depth, cropHeight, cropWidth}));
auto des = TensorUtils::getDescribe(resizeValue.get());
des->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;
std::unique_ptr<OpT> interp(new OpT);
interp->type = OpType_Interp;
interp->main.type = OpParameter_Interp;
interp->main.value = new InterpT;
interp->main.AsInterp()->widthScale = xScale;
interp->main.AsInterp()->heightScale = yScale;
interp->main.AsInterp()->widthOffset = xOffset;
interp->main.AsInterp()->heightOffset = yOffset;
interp->main.AsInterp()->alignCorners = false;
interp->main.AsInterp()->resizeType = resizeType;
auto cmd = GeometryComputerUtils::makeCommand(interp.get(), {cropValue.get()}, {resizeValue.get()});
res.extras.emplace_back(resizeValue);
res.command.emplace_back(cmd);
}
// convert the resized image's format from NC4HW4 back to NHWC and add it to the output batch
{
Tensor::InsideDescribe::Region region;
region.origin = resizeValue.get();
region.size[1] = cropHeight * cropWidth;
region.size[2] = depth;
region.src.offset = 0;
region.dst.offset = i * cropHeight * cropWidth * depth;
region.src.stride[1] = 1;
region.src.stride[2] = cropHeight * cropWidth;
region.dst.stride[1] = depth;
region.dst.stride[2] = 1;
des->regions.emplace_back(std::move(region));
}
}
return true;
}
virtual std::vector<bool> onGetOutputVirtual(const Op* op, const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) const override {
//return {false};
return {true};
}
};
static void _create() {
std::shared_ptr<GeometryComputer> comp(new GeometryCropAndResize);
// GeometryComputer::registerGeometryComputer(comp, {OpType_CropAndResize});
}
REGISTER_GEOMETRY(GeometryCropAndResize, _create);
} // namespace MNN
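The scale/offset pairs handed to the Interp op encode the usual crop_and_resize coordinate mapping. Assuming Interp maps a destination index d to the source coordinate offset + d * scale (an assumption about Interp's convention, not shown in this file), output pixel (r, c) of crop box (y1, x1, y2, x2) samples the source image at:

$$ y = y_1(ih-1) + r\,\frac{(y_2-y_1)(ih-1)}{\text{cropHeight}-1}, \qquad x = x_1(iw-1) + c\,\frac{(x_2-x_1)(iw-1)}{\text{cropWidth}-1} $$

which is exactly the yOffset/yScale and xOffset/xScale computed above (valid for cropHeight, cropWidth > 1).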

View File

@ -0,0 +1,304 @@
//
// GeometryGather.cpp
// MNN
//
// Created by MNN on 2020/06/09.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "geometry/GeometryComputer.hpp"
#include "core/OpCommonUtils.hpp"
namespace MNN {
class GeometryGather : public DefaultGeometryComputer {
public:
virtual std::vector<bool> onGetOutputVirtual(const Op* op, const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) const override {
MNN_ASSERT(inputs.size() == 2);
MNN_ASSERT(1 == outputs.size());
auto embedding = inputs[0];
auto indices = inputs[1];
auto output = outputs[0];
const int firstDimStride = embedding->buffer().dim[0].stride;
if (TensorUtils::getDescribe(indices)->usage == MNN::Tensor::InsideDescribe::CONSTANT && firstDimStride != 0) {
std::vector<bool> res(outputs.size(), true);
return res;
}
return std::vector<bool>(outputs.size(), false);
}
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
Context& context, CommandBuffer& res) const override {
MNN_ASSERT(2 == inputs.size());
MNN_ASSERT(1 == outputs.size());
auto embedding = inputs[0];
auto indices = inputs[1];
auto output = outputs[0];
const int firstDimStride = embedding->buffer().dim[0].stride;
if (TensorUtils::getDescribe(indices)->usage != MNN::Tensor::InsideDescribe::CONSTANT || firstDimStride == 0) {
Command cmd;
cmd.op = op;
cmd.inputs = std::move(inputs);
cmd.outputs = std::move(outputs);
res.command.emplace_back(std::move(cmd));
return true;
}
auto bytes = embedding->buffer().type.bytes();
const size_t indicesCount = indices->elementSize();
const auto limit = embedding->length(0);
const int* indicesData = indices->host<int32_t>();
auto outputDes = TensorUtils::getDescribe(output);
outputDes->regions.clear();
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
for (int i = 0; i < indicesCount; i++) {
if (indicesData[i] < 0 || indicesData[i] > limit) {
MNN_PRINT("Gather indice error\n");
return false;
}
Tensor::InsideDescribe::Region slice;
slice.origin = embedding;
slice.size[0] = 1;
slice.size[1] = 1;
slice.size[2] = firstDimStride;
slice.src.offset = firstDimStride * indicesData[i];
slice.dst.offset = i * firstDimStride;
slice.src.stride[0] = 1;
slice.src.stride[1] = 1;
slice.src.stride[2] = 1;
slice.dst.stride[0] = 1;
slice.dst.stride[1] = 1;
slice.dst.stride[2] = 1;
outputDes->regions.emplace_back(std::move(slice));
}
return true;
}
};
class GeometryGatherND : public DefaultGeometryComputer {
public:
virtual std::vector<bool> onGetOutputVirtual(const Op* op, const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) const override {
MNN_ASSERT(inputs.size() == 2);
MNN_ASSERT(1 == outputs.size());
auto params = inputs[0];
auto indices = inputs[1];
auto output = outputs[0];
int mSliceN = 1;
int mSliceSize = 1;
for (int i = 0; i < indices->dimensions() - 1; ++i) {
mSliceN *= indices->length(i);
}
auto indiceNd = indices->length(indices->dimensions() - 1);
std::vector<int> mDimsToCount;
mDimsToCount.resize(indiceNd);
for (int i = indiceNd; i < params->dimensions(); ++i) {
mSliceSize *= params->length(i);
}
if (TensorUtils::getDescribe(indices)->usage == MNN::Tensor::InsideDescribe::CONSTANT && mSliceSize != 0) {
std::vector<bool> res(outputs.size(), true);
return res;
} else {
std::vector<bool> res(outputs.size(), false);
return res;
}
}
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
Context& context, CommandBuffer& res) const override {
MNN_ASSERT(2 == inputs.size());
MNN_ASSERT(1 == outputs.size());
auto params = inputs[0];
auto indice = inputs[1];
auto output = outputs[0];
int mSliceN = 1;
int mSliceSize = 1;
for (int i = 0; i < indice->dimensions() - 1; ++i) {
mSliceN *= indice->length(i);
}
auto indiceNd = indice->length(indice->dimensions() - 1);
std::vector<int> mDimsToCount;
mDimsToCount.resize(indiceNd);
for (int i = indiceNd; i < params->dimensions(); ++i) {
mSliceSize *= params->length(i);
}
if (TensorUtils::getDescribe(indice)->usage != MNN::Tensor::InsideDescribe::CONSTANT || mSliceSize == 0) {
Command cmd;
cmd.op = op;
cmd.inputs = std::move(inputs);
cmd.outputs = std::move(outputs);
res.command.emplace_back(std::move(cmd));
return true;
}
auto paramSize = params->elementSize();
for (int i = 0; i < indiceNd; ++i) {
mDimsToCount[i] = paramSize / params->length(i);
paramSize = mDimsToCount[i];
}
mDimsToCount.resize(indiceNd);
auto indiceData = indice->host<int32_t>();
auto outputDes = TensorUtils::getDescribe(output);
outputDes->regions.clear();
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
for (int i = 0; i < mSliceN; i++) {
int fromPos = 0;
for (int j = 0; j < indiceNd; ++j) {
fromPos += mDimsToCount[j] * indiceData[i * indiceNd + j];
}
Tensor::InsideDescribe::Region slice;
slice.origin = params;
slice.size[0] = 1;
slice.size[1] = 1;
slice.size[2] = mSliceSize;
slice.src.offset = fromPos;
slice.dst.offset = i * mSliceSize;
slice.src.stride[0] = 1;
slice.src.stride[1] = 1;
slice.src.stride[2] = 1;
slice.dst.stride[0] = 1;
slice.dst.stride[1] = 1;
slice.dst.stride[2] = 1;
outputDes->regions.emplace_back(std::move(slice));
}
return true;
}
};
class GeometryGatherV2 : public DefaultGeometryComputer {
public:
virtual std::vector<bool> onGetOutputVirtual(const Op* op, const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) const override {
MNN_ASSERT(inputs.size() >= 2);
MNN_ASSERT(1 == outputs.size());
auto params = inputs[0];
auto indices = inputs[1];
auto output = outputs[0];
int axis = 0;
if (inputs.size() == 3) {
const Tensor* axisTensor = inputs[2];
axis = axisTensor->host<int32_t>()[0];
}
MNN_ASSERT(axis > -params->buffer().dimensions && axis < params->buffer().dimensions);
if (axis < 0) {
axis = params->buffer().dimensions + axis;
}
const int gatherDimSize = params->buffer().dim[axis].extent;
const int N = indices->elementSize();
MNN_ASSERT(gatherDimSize <= std::numeric_limits<int32_t>::max());
int inside = 1;
for (int i = axis + 1; i < params->dimensions(); ++i) {
inside *= params->length(i);
}
if (TensorUtils::getDescribe(indices)->usage == MNN::Tensor::InsideDescribe::CONSTANT && inside != 0) {
std::vector<bool> res(outputs.size(), true);
return res;
}
return std::vector<bool>(outputs.size(), false);
}
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
Context& context, CommandBuffer& res) const override {
MNN_ASSERT(inputs.size() >= 2);
MNN_ASSERT(1 == outputs.size());
auto params = inputs[0];
auto indices = inputs[1];
auto output = outputs[0];
int axis = 0;
if (inputs.size() == 3) {
const Tensor* axisTensor = inputs[2];
axis = axisTensor->host<int32_t>()[0];
}
MNN_ASSERT(axis > -params->buffer().dimensions && axis < params->buffer().dimensions);
if (axis < 0) {
axis = params->buffer().dimensions + axis;
}
const int gatherDimSize = params->buffer().dim[axis].extent;
const int N = indices->elementSize();
MNN_ASSERT(gatherDimSize <= std::numeric_limits<int32_t>::max());
int inside = 1;
int outside = 1;
for (int i = 0; i < axis; ++i) {
outside *= params->length(i);
}
for (int i = axis + 1; i < params->dimensions(); ++i) {
inside *= params->length(i);
}
if (TensorUtils::getDescribe(indices)->usage != MNN::Tensor::InsideDescribe::CONSTANT || inside == 0) {
Command cmd;
cmd.op = op;
cmd.inputs = std::move(inputs);
cmd.outputs = std::move(outputs);
res.command.emplace_back(std::move(cmd));
return true;
}
const int limit = params->length(axis);
auto bytes = output->buffer().type.bytes();
const int insideStride = inside;
const int outputOutsideStride = inside * N;
const int inputOutsideStride = inside * inputs[0]->length(axis);
const int* indicesPtr = indices->host<int32_t>();
auto outputDes = TensorUtils::getDescribe(output);
outputDes->regions.clear();
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
for (int o = 0; o < outside; ++o) {
for (int i = 0; i < N; i++) {
if (indicesPtr[i] < 0 || indicesPtr[i] > limit) {
continue;
}
Tensor::InsideDescribe::Region slice;
slice.origin = params;
slice.size[0] = 1;
slice.size[1] = 1;
slice.size[2] = insideStride;
slice.src.offset = inputOutsideStride * o + insideStride * indicesPtr[i];
slice.dst.offset = outputOutsideStride * o + i * insideStride;
slice.src.stride[0] = 1;
slice.src.stride[1] = 1;
slice.src.stride[2] = 1;
slice.dst.stride[0] = 1;
slice.dst.stride[1] = 1;
slice.dst.stride[2] = 1;
outputDes->regions.emplace_back(std::move(slice));
}
}
return true;
}
};
static void _create() {
// std::shared_ptr<GeometryComputer> comp(new GeometryGather);
// GeometryComputer::registerGeometryComputer(comp, {OpType_Gather});
//
// std::shared_ptr<GeometryComputer> comp2(new GeometryGatherND);
// GeometryComputer::registerGeometryComputer(comp2, {OpType_GatherND});
//
// std::shared_ptr<GeometryComputer> comp3(new GeometryGatherV2);
// GeometryComputer::registerGeometryComputer(comp3, {OpType_GatherV2});
}
REGISTER_GEOMETRY(GeometryGather, _create);
} // namespace MNN
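As a plain reference for what the regions above describe, a gather along the first axis copies one table row per index; a minimal standalone C++ sketch (illustrative names only, independent of MNN's Tensor/Region types):

```cpp
#include <cstddef>
#include <cstring>

// Reference gather: out row i <- table row indices[i], each row holding rowSize floats.
// Callers are expected to validate that every index lies in [0, tableRows).
void gatherRows(const float* table, const int* indices, std::size_t indexCount,
                std::size_t rowSize, float* out) {
    for (std::size_t i = 0; i < indexCount; ++i) {
        std::memcpy(out + i * rowSize,
                    table + static_cast<std::size_t>(indices[i]) * rowSize,
                    rowSize * sizeof(float));
    }
}
```

In the geometry version above, rowSize corresponds to firstDimStride and the copies are expressed as virtual regions instead of actual memcpy calls.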

View File

@ -0,0 +1,214 @@
//
// GeometrySoftmax.cpp
// MNN
//
// Created by MNN on 2020/06/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "geometry/GeometryComputer.hpp"
#include "core/OpCommonUtils.hpp"
#include "geometry/GeometryComputerUtils.hpp"
namespace MNN {
class GeometrySoftmax : public GeometryComputer {
public:
virtual std::vector<bool> onGetOutputVirtual(const Op* op, const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs) const override {
auto axis = op->main_as_Axis()->axis();
if (axis < 0) {
axis = inputs[0]->dimensions() + axis;
}
if (axis == 1) {
return std::vector<bool>(outputs.size(), false);
}
return std::vector<bool>(outputs.size(), true);
}
virtual bool onCompute(const Op* op, const std::vector<Tensor*>& inputs,
const std::vector<Tensor*>& outputs, Context& context, CommandBuffer& res) const override {
MNN_ASSERT(1 == inputs.size());
MNN_ASSERT(1 == outputs.size());
auto input = inputs[0];
auto output = outputs[0];
auto dims = input->buffer().dimensions;
auto axis = op->main_as_Axis()->axis();
if (axis < 0) {
axis = inputs[0]->dimensions() + axis;
}
if (axis == 1) {
Command cmd;
cmd.op = op;
cmd.inputs = std::move(inputs);
cmd.outputs = std::move(outputs);
res.command.emplace_back(std::move(cmd));
return true;
}
int inside = 1;
int outside = 1;
int channel = 1;
for (int i = 0; i < axis; ++i) {
outside *= input->length(i);
}
channel = input->length(axis);
for (int i = axis + 1; i < dims; ++i) {
inside *= input->length(i);
}
//transform the input to NCHW format
std::shared_ptr<Tensor> tmpInput;
{
tmpInput.reset(Tensor::createDevice<float>({outside, channel, inside}));
auto outputDes = TensorUtils::getDescribe(tmpInput.get());
outputDes->regions.clear();
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
Tensor::InsideDescribe::Region desReg;
desReg.size[0] = outside;
desReg.size[1] = channel;
desReg.size[2] = inside;
desReg.dst.offset = 0;
desReg.dst.stride[0] = channel*inside;
desReg.dst.stride[1] = inside;
desReg.dst.stride[2] = 1;
desReg.src.offset = 0;
desReg.src.stride[0] = channel*inside;
desReg.src.stride[1] = inside;
desReg.src.stride[2] = 1;
desReg.origin = input;
outputDes->regions.emplace_back(std::move(desReg));
res.extras.emplace_back(tmpInput);
}
//reduction max, axis=1
std::shared_ptr<Tensor> maxValue;
{
maxValue.reset(Tensor::createDevice<float>({outside, 1, inside}));
res.extras.emplace_back(maxValue);
res.command.emplace_back(GeometryComputerUtils::makeReduce(ReductionType_MAXIMUM, tmpInput.get(), maxValue.get()));
}
//broadcast reduction axis dim
std::shared_ptr<Tensor> maxBroadValue;
{
maxBroadValue.reset(Tensor::createDevice<float>({outside, channel, inside}));
auto outputDes = TensorUtils::getDescribe(maxBroadValue.get());
outputDes->regions.clear();
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
Tensor::InsideDescribe::Region desReg;
desReg.size[0] = outside;
desReg.size[1] = channel;
desReg.size[2] = inside;
desReg.dst.offset = 0;
desReg.dst.stride[0] = channel*inside;
desReg.dst.stride[1] = inside;
desReg.dst.stride[2] = 1;
desReg.src.offset = 0;
desReg.src.stride[0] = inside;
desReg.src.stride[1] = 0;
desReg.src.stride[2] = 1;
desReg.origin = maxValue.get();
outputDes->regions.emplace_back(std::move(desReg));
res.extras.emplace_back(maxBroadValue);
}
//sub
std::shared_ptr<Tensor> subMaxValue;
{
subMaxValue.reset(Tensor::createDevice<float>({outside, channel, inside}));
auto cmd = GeometryComputerUtils::makeBinary(BinaryOpOperation_SUB, tmpInput.get(), maxBroadValue.get(), subMaxValue.get());
res.extras.emplace_back(subMaxValue);
res.command.emplace_back(std::move(cmd));
}
//exp
std::shared_ptr<Tensor> expValue;
{
expValue.reset(Tensor::createDevice<float>({outside, channel, inside}));
auto cmd = GeometryComputerUtils::makeUnary(UnaryOpOperation_EXP, subMaxValue.get(), expValue.get());
res.extras.emplace_back(expValue);
res.command.emplace_back(std::move(cmd));
}
//reduction sum, axis=2, only support NCHW
std::shared_ptr<Tensor> sumValue;
{
sumValue.reset(Tensor::createDevice<float>({outside, 1, inside}));
res.extras.emplace_back(sumValue);
res.command.emplace_back(GeometryComputerUtils::makeReduce(ReductionType_SUM, expValue.get(), sumValue.get()));
}
//broadcast reduction axis dim
std::shared_ptr<Tensor> sumBroadValue;
{
sumBroadValue.reset(Tensor::createDevice<float>({outside, channel, inside}));
auto outputDes = TensorUtils::getDescribe(sumBroadValue.get());
outputDes->regions.clear();
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
Tensor::InsideDescribe::Region desReg;
desReg.size[0] = outside;
desReg.size[1] = channel;
desReg.size[2] = inside;
desReg.dst.offset = 0;
desReg.dst.stride[0] = channel*inside;
desReg.dst.stride[1] = inside;
desReg.dst.stride[2] = 1;
desReg.src.offset = 0;
desReg.src.stride[0] = inside;
desReg.src.stride[1] = 0;
desReg.src.stride[2] = 1;
desReg.origin = sumValue.get();
outputDes->regions.emplace_back(std::move(desReg));
res.extras.emplace_back(sumBroadValue);
}
//div
std::shared_ptr<Tensor> tmpOutput;
{
tmpOutput.reset(Tensor::createDevice<float>({outside, channel, inside}));
auto cmd = GeometryComputerUtils::makeBinary(BinaryOpOperation_REALDIV, expValue.get(), sumBroadValue.get(), tmpOutput.get());
res.extras.emplace_back(tmpOutput);
res.command.emplace_back(std::move(cmd));
}
//transform to output
{
auto outputDes = TensorUtils::getDescribe(output);
outputDes->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL;
Tensor::InsideDescribe::Region desReg;
desReg.size[0] = outside;
desReg.size[1] = channel;
desReg.size[2] = inside;
desReg.dst.offset = 0;
desReg.dst.stride[0] = channel*inside;
desReg.dst.stride[1] = inside;
desReg.dst.stride[2] = 1;
desReg.src.offset = 0;
desReg.src.stride[0] = channel*inside;
desReg.src.stride[1] = inside;
desReg.src.stride[2] = 1;
desReg.origin = tmpOutput.get();
outputDes->regions.emplace_back(std::move(desReg));
}
return true;
}
};
static void _create() {
// std::shared_ptr<GeometryComputer> comp(new GeometrySoftmax);
// GeometryComputer::registerGeometryComputer(comp, {OpType_Softmax});
}
REGISTER_GEOMETRY(GeometrySoftmax, _create);
} // namespace MNN
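The command sequence above (reduce-max, broadcast, subtract, exp, reduce-sum, broadcast, divide) is the hand-written stable-softmax decomposition over an (outside, channel, inside) layout; a minimal standalone C++ sketch of that step order (illustrative only, independent of MNN's Tensor/Command machinery):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Softmax over the "channel" axis of data laid out as (outside, channel, inside).
void softmaxDecomposed(std::vector<float>& data, int outside, int channel, int inside) {
    for (int o = 0; o < outside; ++o) {
        float* slice = data.data() + o * channel * inside;
        for (int i = 0; i < inside; ++i) {
            // 1) reduce max over the channel axis
            float maxVal = slice[i];
            for (int c = 1; c < channel; ++c) {
                maxVal = std::max(maxVal, slice[c * inside + i]);
            }
            // 2) subtract the (broadcast) max, 3) exponentiate, 4) reduce sum
            float sum = 0.f;
            for (int c = 0; c < channel; ++c) {
                slice[c * inside + i] = std::exp(slice[c * inside + i] - maxVal);
                sum += slice[c * inside + i];
            }
            // 5) divide by the (broadcast) sum
            for (int c = 0; c < channel; ++c) {
                slice[c * inside + i] /= sum;
            }
        }
    }
}
```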

View File

@ -7,7 +7,7 @@ add_executable(benchmarkExprModels.out ${CMAKE_CURRENT_LIST_DIR}/benchmarkExprMo
target_include_directories(benchmarkExprModels.out PRIVATE "${CMAKE_CURRENT_LIST_DIR}/exprModels" ${CMAKE_CURRENT_SOURCE_DIR}/)
target_link_libraries(benchmarkExprModels.out ${MNN_DEPS})
if ((MSVC OR WIN32) AND NOT MNN_BUILD_SHARED_LIBS)
if (MSVC AND NOT MNN_BUILD_SHARED_LIBS)
foreach (DEPEND ${MNN_DEPS})
target_link_options(benchmark.out PRIVATE /WHOLEARCHIVE:$<TARGET_FILE:${DEPEND}>)
target_link_options(benchmarkExprModels.out PRIVATE /WHOLEARCHIVE:$<TARGET_FILE:${DEPEND}>)

View File

@ -124,6 +124,7 @@ std::vector<float> doBench(Model& model, int loop, int warmup = 10, int forward
const auto bufferSize = revertor->getBufferSize();
auto net = std::shared_ptr<MNN::Interpreter>(MNN::Interpreter::createFromBuffer(modelBuffer, bufferSize));
revertor.reset();
net->setSessionMode(MNN::Interpreter::Session_Release);
MNN::ScheduleConfig config;
config.numThread = numberThread;
config.type = static_cast<MNNForwardType>(forward);

View File

@ -90,6 +90,7 @@ static std::vector<float> runNet(VARP netOutput, const ScheduleConfig& config, i
const void* buf = builder.GetBufferPointer();
size_t size = builder.GetSize();
std::unique_ptr<Interpreter> net(Interpreter::createFromBuffer(buf, size));
net->setSessionMode(MNN::Interpreter::Session_Release);
auto session = net->createSession(config);
net->releaseModel();
auto inputTensor = net->getSessionInput(session, NULL);

View File

@ -1,84 +0,0 @@
import os
import sys
major_py_ver = sys.version_info.major
def convert_string_to_hex_list(code_str):
hex_list = []
for i in range(len(code_str)):
hex_ = hex(ord(code_str[i]))
hex_list.append(hex_)
return hex_list
def opencl_codegen():
cl_kernel_dir = sys.argv[1]
output_path = sys.argv[2]
print("Generating OpenCL Kernels in "+cl_kernel_dir+" to "+output_path)
if not os.path.exists(cl_kernel_dir):
print(cl_kernel_dir + " doesn't exist!")
#common.h
common_header_code = ""
#quantized_common.h
quantized_common_header_code = ""
#activation_common.h
activation_common_header_code = ""
for file_name in os.listdir(cl_kernel_dir):
file_path = os.path.join(cl_kernel_dir, file_name)
if file_path[-2:] == ".h" and file_name[:-2] == "quantized_common":
with open(file_path, "r") as f:
quantized_common_header_code += f.read()
elif file_path[-2:] == ".h" and file_name[:-2] == "activation_common":
with open(file_path, "r") as f:
activation_common_header_code += f.read()
opencl_code_maps = {}
for file_name in os.listdir(cl_kernel_dir):
file_path = os.path.join(cl_kernel_dir, file_name)
if file_path[-3:] == ".cl":
with open(file_path, "r") as f:
code_str = ""
for line in f.readlines():
if "#include <activation_common.h>" in line:
code_str += common_header_code
code_str += activation_common_header_code
elif "#include <quantized_common.h>" in line:
code_str += common_header_code
code_str += quantized_common_header_code
elif "#include <common.h>" in line:
code_str += common_header_code
else:
code_str += line
opencl_code_maps[file_name[:-3]] = convert_string_to_hex_list(code_str)
#source model
opencl_source_map = "#include <map> \n"
opencl_source_map += "#include <string> \n"
opencl_source_map += "#include <vector> \n"
opencl_source_map += "namespace MNN { \n"
opencl_source_map += "extern const std::map<std::string, std::vector<unsigned char>> OpenCLProgramMap = \n { \n"
if major_py_ver == 2:
items = opencl_code_maps.iteritems()
else:
items = opencl_code_maps.items()
for file_name, file_source in items:
opencl_source_map += "{\n \""
opencl_source_map += file_name
opencl_source_map += "\", \n"
opencl_source_map += " { "
for source_hex in file_source:
opencl_source_map += source_hex
opencl_source_map += ","
opencl_source_map += " } "
opencl_source_map += "\n }, \n"
opencl_source_map += " }; \n"
opencl_source_map += "} \n"
with open(output_path, "w") as w_file:
w_file.write(opencl_source_map)
print("Generate OpenCL Source done !!! \n")
if __name__ == '__main__':
opencl_codegen()

140
ciscripts/build.sh Normal file
View File

@ -0,0 +1,140 @@
#!/usr/bin/env bash
. ./parse_options.sh || exit 1;
CMAKE=cmake
MAKE=make
ANDROID_NDK=/home/android-ndk-r18b
BUILD_ROOT=`pwd`
# Clean the existing directory instead of removing it, in order to avoid
# the problem "Current working directory cannot be established".
function make_or_clean_dir {
if [ -d $1 ]; then
rm -rf $1/*
else
mkdir $1
fi
}
function build_arm_android_32 {
make_or_clean_dir build_arm_android_32 && cd build_arm_android_32
$CMAKE ../.. \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_ABI="armeabi-v7a" \
-DANDROID_STL=c++_static \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_NATIVE_API_LEVEL=android-21 \
-DANDROID_TOOLCHAIN=clang \
-DMNN_USE_LOGCAT=true \
-DMNN_BUILD_FOR_ANDROID_COMMAND=true \
-DNATIVE_LIBRARY_OUTPUT=. \
-DNATIVE_INCLUDE_OUTPUT=. \
-DMNN_VULKAN=$USE_VULKAN \
-DMNN_OPENCL=$USE_OPENCL \
-DMNN_OPENGL=$USE_OPENGL \
-DMNN_USE_THREAD_POOL=$USE_THREAD_POOL || exit 1;
$MAKE -j $build_threads || exit 1;
cd $BUILD_ROOT; true;
}
function build_arm_android_64 {
make_or_clean_dir build_arm_android_64 && cd build_arm_android_64
$CMAKE ../.. \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_ABI="arm64-v8a" \
-DANDROID_STL=c++_static \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_NATIVE_API_LEVEL=android-21 \
-DANDROID_TOOLCHAIN=clang \
-DMNN_USE_LOGCAT=true \
-DMNN_BUILD_FOR_ANDROID_COMMAND=true \
-DNATIVE_LIBRARY_OUTPUT=. \
-DNATIVE_INCLUDE_OUTPUT=. \
-DMNN_ARM82=ON \
-DMNN_VULKAN=$USE_VULKAN \
-DMNN_OPENCL=$USE_OPENCL \
-DMNN_OPENGL=$USE_OPENGL \
-DMNN_USE_THREAD_POOL=$USE_THREAD_POOL || exit 1;
$MAKE -j $build_threads || exit 1;
cd $BUILD_ROOT; true;
}
function build_arm_linux_32 {
cd $BUILD_ROOT; true;
}
function build_arm_linux_64 {
cd $BUILD_ROOT; true;
}
function build_x86_linux {
make_or_clean_dir build_x86_linux && cd build_x86_linux
$CMAKE ../.. \
-DCMAKE_BUILD_TYPE=Release \
-DMNN_BUILD_TRAIN=ON \
-DMNN_SEP_BUILD=OFF \
-DMNN_BUILD_DEMO=ON \
-DMNN_BUILD_QUANTOOLS=ON \
-DMNN_EVALUATION=ON \
-DMNN_BUILD_CONVERTER=ON \
-DMNN_SUPPORT_TFLITE_QUAN=ON \
-DMNN_BUILD_TEST=ON \
-DMNN_OPENCL=$USE_OPENCL \
-DMNN_VULKAN=$USE_VULKAN \
-DMNN_OPENMP=$USE_OPENMP \
-DMNN_USE_THREAD_POOL=OFF \
-DMNN_BUILD_BENCHMARK=ON || exit 1;
$MAKE -j $build_threads || exit 1;
cd $BUILD_ROOT; true;
}
function build_all {
build_arm_android_32 || exit 1;
build_arm_android_64 || exit 1;
build_arm_linux_32 || exit 1;
build_arm_linux_64 || exit 1;
build_x86_linux || exit 1;
true;
}
function clean {
rm -rf build_arm_android_32
rm -rf build_arm_android_64
rm -rf build_arm_linux_32
rm -rf build_arm_linux_64
rm -rf build_x86_linux
}
function build {
case $platform in
"arm_linux_32")
build_arm_linux_32 || exit 1;
;;
"arm_linux_64")
build_arm_linux_64 || exit 1;
;;
"x86_linux")
build_x86_linux || exit 1;
;;
"arm_android_32")
build_arm_android_32 || exit 1;
;;
"arm_android_64")
build_arm_android_64 || exit 1;
;;
"all")
build_all || exit 1;
;;
*) echo "Invalid platform: $platform" && exit 1;
esac
}
if [ $clean == 1 ]; then
clean
else
build $@
fi
true;

113
ciscripts/parse_options.sh Normal file
View File

@ -0,0 +1,113 @@
#!/usr/bin/env bash
# Valid platform:
# - arm_android_32
# - arm_android_64
# - arm_linux_32
# - arm_linux_64
# - x86_linux
platform="all"
# Option to build with opencl.
use_opencl=0
# Option to build with opengl.
use_opengl=0
# Option to build with vulkan.
use_vulkan=0
# Option to build with openmp multithreads library.
use_openmp=0
build_threads=1
# Option to clear the build history.
clean=0
USE_OPENCL=OFF
USE_VULKAN=OFF
USE_OPENGL=OFF
USE_OPENMP=OFF
USE_THREAD_POOL=ON
function print_usage {
echo -e "Usgae: ./build.sh"
echo -e " --platform=x: Specify build platform x. "
echo -e " All valid platforms are \"arm_android_32\", \"arm_android_64\",
\"arm_linux_32\", \"arm_linux_64\", \"x86_linux\", \"all\"."
echo -e " The default is \"all\"."
echo -e " --use_openmp=true|false: Build with openmp or not."
echo -e " The default is false."
echo -e " --use_opencl=true|false: Build with opencl or not."
echo -e " The default is false."
echo -e " --use_opengl=true|false: Build with opengl or not."
echo -e " The default is false."
echo -e " --use_vulkan=true|false: Build with vulkan or not."
echo -e " The default is false."
echo -e " --job=n: Build with n threads. Default is 1."
}
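# Example invocations (illustrative only; the platform and flag values below are
# assumptions, not defaults baked into this script):
#   ./build.sh --platform=arm_android_64 --use_vulkan=true --use_opencl=true --job=8
#   ./build.sh --platform=x86_linux --use_openmp=true --job=4
#   ./build.sh clean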
function parse_platform {
platform=`echo "$1" | awk -F '=' '{print $2}'`
}
function parse_nthreads {
build_threads=`echo "$1" | awk -F '=' '{print $2}'`
}
function parse_bool {
val=`echo "$1" | awk -F '=' '{print $2}'`
if [ $val == "true" ] || [ $val == "1" ]; then
return 1;
else
return 0;
fi
}
[ -z "${1:-}" ] && print_usage && exit 1;
while true; do
[ -z "${1:-}" ] && break;
case "$1" in
--platform=*) parse_platform "$1"; shift 1;
;;
--use_openmp=*) parse_bool "$1"; use_openmp=$?; shift 1;
;;
--use_openmp) use_openmp=1; shift 1;
;;
--use_opencl=*) parse_bool "$1"; use_opencl=$?; shift 1;
;;
--use_opencl) use_opencl=1; shift 1;
;;
--use_opengl=*) parse_bool "$1"; use_opengl=$?; shift 1;
;;
--use_opengl) use_opengl=1; shift 1;
;;
--use_vulkan=*) parse_bool "$1"; use_vulkan=$?; shift 1;
;;
--use_vulkan) use_vulkan=1; shift 1;
;;
--job=*) parse_nthreads "$1"; shift 1;
;;
clean) clean=1; shift 1;
;;
*) break;
esac
done
if [ $use_opencl == 1 ]; then
USE_OPENCL=ON
fi
if [ $use_opengl == 1 ]; then
USE_OPENGL=ON
fi
if [ $use_vulkan == 1 ]; then
USE_VULKAN=ON
fi
if [ $use_openmp == 1 ]; then
USE_OPENMP=ON
USE_THREAD_POOL=OFF
fi
true;

View File

@ -0,0 +1,3 @@
call "C:/Program Files (x86)/Microsoft Visual Studio/2017/BuildTools/VC/Auxiliary/Build/vcvars64.bat"
cmake -G "Ninja" -DCMAKE_BUILD_TYPE=Release ..
ninja

View File

@ -0,0 +1,3 @@
call "C:/Program Files (x86)/Microsoft Visual Studio/2017/BuildTools/VC/Auxiliary/Build/vcvars32.bat"
cmake -G "Ninja" -DCMAKE_BUILD_TYPE=Release ..
ninja

View File

@ -12,3 +12,9 @@ target_link_libraries(segment.out ${MNN_DEPS})
add_executable(expressDemo.out ${CMAKE_CURRENT_LIST_DIR}/expressDemo.cpp)
target_link_libraries(expressDemo.out ${MNN_DEPS})
add_executable(transformerDemo.out ${CMAKE_CURRENT_LIST_DIR}/transformerDemo.cpp)
target_link_libraries(transformerDemo.out ${MNN_DEPS})
add_executable(rasterDemo.out ${CMAKE_CURRENT_LIST_DIR}/rasterDemo.cpp)
target_link_libraries(rasterDemo.out ${MNN_DEPS})

View File

@ -53,7 +53,6 @@ int main(int argc, const char* argv[]) {
MNN_ERROR("Output Not valid\n");
return 0;
}
auto size = outputInfo->size;
//Test Speed
if (testTime > 0){
//Let the frequence up
@ -82,6 +81,7 @@ int main(int argc, const char* argv[]) {
}
{
auto size = outputInfo->size;
auto outputPtr = output->readMap<float>();
if (nullptr == outputPtr) {
MNN_ERROR("Output Not valid read error\n");

251
demo/exec/rasterDemo.cpp Normal file
View File

@ -0,0 +1,251 @@
//
// rasterDemo.cpp
// MNN
//
// Created by MNN on 2020/10/14.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <fstream>
#include <sstream>
#include <iostream>
#include <chrono>
#include <MNN/MNNDefine.h>
#include <MNN/Tensor.hpp>
#include <MNN/Interpreter.hpp>
#include "MNN_generated.h"
#include "core/TensorUtils.hpp"
#include "core/Execution.hpp"
#include "core/Backend.hpp"
#include "rapidjson/document.h"
#include "rapidjson/stringbuffer.h"
#include "rapidjson/writer.h"
using namespace MNN;
/*
1. Raster does the index mapping shown below:
for (region : regions)
src = region.src, dst = region.dst;
for (i = 0 -> size[0])
for (j = 0 -> size[1])
for (k = 0 -> size[2])
output[dst.offset + i * dst.stride[0] + j * dst.stride[1] + k * dst.stride[2]] =
region.origin[src.offset + i * src.stride[0] + j * src.stride[1] + k * src.stride[2]];
2. The Raster Op has one input and one output, but the input is not a real input tensor; it is a
middle tensor with VIRTUAL memory type whose regions point to the real input tensors, like below.
input_0 --> region_0 --\
\
input_1 --> region_1 ---- middle ----> output
/
input_2 --> region_2 --/
3. This example reads a json file, constructs some Rasters and computes them.
The input json file format is as follows:
{
"inputs" : [
{
"id" : int,
"type" : "type_name", // float or int
"dims" : [int],
"data" : [int/float] // if null, fill with random number
}
],
"outputs" : [
// same as inputs
],
"regions" : [
{
"id" : int, // points to outputs
"size" : [int],
"src" : {
"offset" : int,
"stride" : [int]
},
"dst" : { // same with src },
"origin" : int // point to inputs
}
]
}
*/
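/*
A minimal example input json (a sketch with hypothetical values, not shipped with this demo):
it transposes a 1x1x2x3 tensor into 1x1x3x2 through a single region, so output[j][k] = input[k][j].
{
    "inputs"  : [ { "id" : 0, "type" : "float", "dims" : [1, 1, 2, 3],
                    "data" : [0.0, 1.0, 2.0, 3.0, 4.0, 5.0] } ],
    "outputs" : [ { "id" : 0, "type" : "float", "dims" : [1, 1, 3, 2], "data" : [] } ],
    "regions" : [ { "id" : 0, "size" : [1, 3, 2],
                    "src" : { "offset" : 0, "stride" : [6, 1, 3] },
                    "dst" : { "offset" : 0, "stride" : [6, 2, 1] },
                    "origin" : 0 } ]
}
With this region, dst index = j * 2 + k and src index = j + k * 3, so the output
reads back as [0, 3, 1, 4, 2, 5], i.e. the transpose of [[0, 1, 2], [3, 4, 5]].
*/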
static std::string runRaster(std::string jsonString, int runNum) {
srand(0);
rapidjson::Document document;
document.Parse(jsonString.c_str());
if (document.HasParseError()) {
MNN_ERROR("Invalid Json Format!\n");
return "";
}
// prepare CPU backend
ScheduleConfig config;
config.type = MNN_FORWARD_CPU;
BackendConfig backendConfig;
backendConfig.precision = BackendConfig::Precision_High;
config.backendConfig = &backendConfig;
Backend::Info compute;
compute.type = config.type;
compute.numThread = config.numThread;
compute.user = config.backendConfig;
const RuntimeCreator* runtimeCreator(MNNGetExtraRuntimeCreator(compute.type));
std::unique_ptr<Runtime> runtime(runtimeCreator->onCreate(compute));
std::unique_ptr<Backend> backend(runtime->onCreate());
// build Op
std::unique_ptr<OpT> opt(new OpT);
opt->type = OpType_Raster;
flatbuffers::FlatBufferBuilder builder(1024);
builder.ForceDefaults(true);
auto len = Op::Pack(builder, opt.get());
builder.Finish(len);
auto buffer = builder.GetBufferPointer();
const Op* op = flatbuffers::GetMutableRoot<Op>(buffer);
// build tensors (NCHW) from json
std::vector<std::unique_ptr<Tensor>> inputs;
std::vector<std::unique_ptr<Tensor>> outputs;
auto readTensors = [&document, &backend](std::vector<std::unique_ptr<Tensor>>& tensors, const char* type) {
if (document.HasMember(type)) {
auto info = document[type].GetArray();
tensors.resize(info.Size());
for (auto iter = info.begin(); iter != info.end(); iter++) {
auto obj = iter->GetObject();
int id = obj["id"].GetInt();
tensors[id].reset(new Tensor(4));
auto tensor = tensors[id].get();
auto dataType = obj["type"].GetString();
bool isFloat = !strcmp(dataType, "float");
tensor->setType(isFloat ? DataType_DT_FLOAT : DataType_DT_INT32);
auto dims = obj["dims"].GetArray();
for (auto d = dims.begin(); d != dims.end(); d++) {
tensor->setLength(d - dims.begin(), d->GetInt());
}
TensorUtils::setLinearLayout(tensor);
backend->onAcquireBuffer(tensor, Backend::STATIC);
TensorUtils::getDescribe(tensor)->backend = backend.get();
auto data = obj["data"].GetArray();
if (!strcmp(type, "inputs")) {
bool hasData = data.Size() == tensor->elementSize();
auto dataIter = data.begin();
for (int i = 0; i < tensor->elementSize(); i++, dataIter++) {
if (isFloat) {
tensor->host<float>()[i] = hasData ? dataIter->GetFloat() : rand() % 10 / 10.0;
} else {
tensor->host<int>()[i] = hasData ? dataIter->GetInt() : rand() % 10;
}
}
}
}
}
};
readTensors(inputs, "inputs");
readTensors(outputs, "outputs");
// build middle tensors' region info from json
std::vector<std::unique_ptr<Tensor>> middles;
middles.resize(outputs.size());
if (document.HasMember("regions")) {
auto info = document["regions"].GetArray();
for (auto iter = info.begin(); iter != info.end(); iter++) {
auto obj = iter->GetObject();
int id = obj["id"].GetInt();
if (middles[id] == nullptr) {
middles[id].reset(new Tensor(4));
}
auto des = TensorUtils::getDescribe(middles[id].get());
des->memoryType = MNN::Tensor::InsideDescribe::MEMORY_VIRTUAL;
Tensor::InsideDescribe::Region region;
int origin = obj["origin"].GetInt();
region.origin = inputs[origin].get();
auto size = obj["size"].GetArray();
auto src = obj["src"].GetObject();
auto dst = obj["dst"].GetObject();
auto srcStride = src["stride"].GetArray();
auto dstStride = dst["stride"].GetArray();
for (int i = 0; i < 3; i++) {
region.size[i] = size[i].GetInt();
region.src.stride[i] = srcStride[i].GetInt();
region.dst.stride[i] = dstStride[i].GetInt();
}
region.src.offset = src["offset"].GetInt();
region.dst.offset = dst["offset"].GetInt();
des->regions.push_back(region);
}
}
// build execution of Raster and run them
for (int i = 0; i < outputs.size(); i++) {
std::vector<Tensor*> ins = {middles[i].get()}, outs = {outputs[i].get()};
std::unique_ptr<Execution> exe(backend->onCreate(ins, outs, op));
exe->onResize(ins, outs);
auto t1 = std::chrono::high_resolution_clock::now();
for (int j = 0; j < runNum; j++) {
exe->onExecute(ins, outs);
}
auto t2 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> time_span = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
double time = time_span.count() * 1000.0 / runNum;
printf("For output_id = %d, run %d times, the average time is %f ms.\n", i, runNum, time);
}
auto writeTensors = [&document](std::vector<std::unique_ptr<Tensor>>& tensors, const char* type) {
auto info = document[type].GetArray();
for (auto iter = info.begin(); iter != info.end(); iter++) {
auto obj = iter->GetObject();
int id = obj["id"].GetInt();
auto data = obj["data"].GetArray();
if (data.Size() == tensors[id]->elementSize()) {
// has data, don't overwrite it; go on to the next tensor
continue;
}
bool isFloat = !strcmp(obj["type"].GetString(), "float");
data.Reserve(tensors[id]->elementSize(), document.GetAllocator());
for (int i = 0; i < tensors[id]->elementSize(); i++) {
if (isFloat) {
data.PushBack(tensors[id]->host<float>()[i], document.GetAllocator());
} else {
data.PushBack(tensors[id]->host<int>()[i], document.GetAllocator());
}
}
}
};
writeTensors(inputs, "inputs");
writeTensors(outputs, "outputs");
rapidjson::StringBuffer stringBuffer;
rapidjson::Writer<rapidjson::StringBuffer> writer(stringBuffer);
document.Accept(writer);
return stringBuffer.GetString();
}
int main(int argc, const char* argv[]) {
if (argc < 2) {
printf("Usage: ./rasterDemo.out input.json [output.json] [runNum]\ndefault output is input, and default runNum is 100.\n");
return 0;
}
const char* inputFile = argv[1];
const char* outputFile = argv[1];
int runNum = 100;
if (argc >= 3) {
outputFile = argv[2];
}
if (argc >= 4) {
runNum = ::atoi(argv[3]);
}
std::ifstream in(inputFile);
if (in.fail()) {
printf("Invalid input Json File!\n");
return 0;
}
std::ofstream out(outputFile);
if (out.fail()) {
printf("Invalid output Json File!\n");
return 0;
}
std::stringstream ss;
ss << in.rdbuf();
out << runRaster(ss.str(), runNum);
out.close();
printf("Run Raster Done!\n");
return 0;
}

View File

@ -0,0 +1,60 @@
#include <MNN/expr/Module.hpp>
#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include <MNN/expr/Executor.hpp>
#include <fstream>
#include <sstream>
#include <stdio.h>
#include <string.h>
using namespace MNN::Express;
using namespace MNN;
using namespace std;
int main(int argc, const char* argv[]) {
if (argc < 2) {
MNN_ERROR("Don't has model name\n");
return 0;
}
BackendConfig config;
//Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 4);
auto modelName = argv[1];
std::shared_ptr<Module> model;
model.reset(Module::load({"NmtModel/Placeholder", "NmtModel/Placeholder_1"}, {"NmtModel/transpose_2"}, modelName));
std::vector<int> input0 = {32,16,234,3215,61,135,29,10,24317,4661,4,0};
std::vector<int> input1 = {1,1,1,1,1,1,1,1,1,1,1,1};
auto first = _Input({1, (int)input0.size()}, NHWC, halide_type_of<int>());
::memcpy(first->writeMap<int>(), input0.data(), input0.size() * sizeof(int));
auto second = _Input({1, (int)input1.size()}, NHWC, halide_type_of<int>());
::memcpy(second->writeMap<int>(), input1.data(), input1.size() * sizeof(int));
std::vector<VARP> outputs;
for (int i = 0; i < 2; ++i) {
{
AUTOTIME;
Executor::getGlobalExecutor()->resetProfile();
outputs = model->onForward({first, second});
Executor::getGlobalExecutor()->dumpProfile();
}
std::ostringstream fileNameOs;
std::ostringstream dimInfo;
fileNameOs << i << "_output.txt";
auto info = outputs[0]->getInfo();
for (int d=0; d<info->dim.size(); ++d) {
dimInfo << info->dim[d] << "_";
}
auto fileName = fileNameOs.str();
MNN_PRINT("Output Name: %s, Dim: %s\n", fileName.c_str(), dimInfo.str().c_str());
auto ptr = outputs[0]->readMap<int>();
std::ofstream outputOs(fileName.c_str());
for (int i=0; i<info->size; ++i) {
outputOs << ptr[i] << "\n";
}
}
for (int i = 0; i < 10; ++i) {
AUTOTIME;
outputs = model->onForward({first, second});
}
return 0;
}

View File

@ -53,27 +53,23 @@ static int CompareElements(const LabeledElement *a, const LabeledElement *b) {
if (!_net || !_session) {
return nil;
}
MNN::Tensor *output = _net->getSessionOutput(_session, nullptr);
MNN::Tensor copy(output);
auto input = _net->getSessionInput(_session, nullptr);
MNN::Tensor tensorCache(input);
input->copyToHostTensor(&tensorCache);
// run
NSTimeInterval begin = NSDate.timeIntervalSinceReferenceDate;
// you should set input data for each inference
if (cycles == 1) {
_net->runSession(_session);
} else {
auto input = _net->getSessionInput(_session, nullptr);
MNN::Tensor tensorCache(input);
input->copyToHostTensor(&tensorCache);
for (int i = 0; i < cycles; i++) {
input->copyFromHostTensor(&tensorCache);
_net->runSession(_session);
}
output->copyToHostTensor(&copy);
}
NSTimeInterval cost = NSDate.timeIntervalSinceReferenceDate - begin;
// result
MNN::Tensor *output = _net->getSessionOutput(_session, nullptr);
MNN::Tensor copy(output);
output->copyToHostTensor(&copy);
float *data = copy.host<float>();
LabeledElement objects[1000];
for (int i = 0; i < 1000; i++) {

View File

@ -1,14 +1,21 @@
file(GLOB_RECURSE MNN_EXPR_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.cpp")
file(GLOB_RECURSE MNN_EXPR_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.*")
option(MNN_EXPR_ENABLE_PROFILER "Support profile Expr's op cost" OFF)
option(MNN_EXPR_SHAPE_EAGER "Force compute Expr's shape directly cost" OFF)
IF (MNN_EXPR_ENABLE_PROFILER)
add_definitions(-DMNN_EXPR_ENABLE_PROFILER)
ENDIF()
IF (MNN_EXPR_SHAPE_EAGER)
add_definitions(-DMNN_EXPR_SHAPE_EAGER)
ENDIF()
IF(MNN_SEP_BUILD)
if (MNN_BUILD_FOR_ANDROID_COMMAND)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../")
endif()
add_library(MNN_Express SHARED ${MNN_EXPR_SRCS})
target_link_libraries(MNN_Express MNN)
if (MNN_BUILD_MINI)
target_link_libraries(MNN_Express $<TARGET_OBJECTS:MNNTransform>)
endif()
ELSE()
add_library(MNNExpress OBJECT ${MNN_EXPR_SRCS})
ENDIF()

30
express/Distributions.cpp Normal file
View File

@ -0,0 +1,30 @@
//
// Distributions.cpp
// MNN
//
// Created by MNN on 2019/11/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "Distributions.hpp"
#include <cmath>
namespace MNN {
namespace Express {
void Distributions::uniform(const int count, const float min, const float max, float *r, std::mt19937 gen) {
std::uniform_real_distribution<float> dis(min, std::nextafter(max, std::numeric_limits<float>::max()));
for (int i = 0; i < count; i++) {
r[i] = dis(gen);
}
}
void Distributions::gaussian(const int count, const float mu, const float sigma, float *r, std::mt19937 gen) {
std::normal_distribution<float> dis(mu, sigma);
for (int i = 0; i < count; i++) {
r[i] = dis(gen);
}
}
} // namespace Express
} // namespace MNN

27
express/Distributions.hpp Normal file
View File

@ -0,0 +1,27 @@
//
// Distributions.hpp
// MNN
//
// Created by MNN on 2019/11/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef Distributions_hpp
#define Distributions_hpp
#include <MNN/MNNDefine.h>
#include <random>
namespace MNN {
namespace Express {
class Distributions {
public:
static void uniform(const int count, const float min, const float max, float* r, std::mt19937 gen);
static void gaussian(const int count, const float mu, const float sigma, float* r, std::mt19937 gen);
};
} // namespace Express
} // namespace MNN
#endif // Distributions_hpp

File diff suppressed because it is too large Load Diff

45
express/ExecutorScope.cpp Normal file
View File

@ -0,0 +1,45 @@
//
// ExecutorScope.cpp
// MNN
//
// Created by MNN on 2020/10/26.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <thread>
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/Scope.hpp>
#include <MNN/expr/ExecutorScope.hpp>
namespace MNN {
namespace Express {
typedef std::shared_ptr<Express::Executor> ExecutorRef;
#if !defined(__APPLE__)
thread_local static Scope<ExecutorRef> g_executor_scope;
#else
static Scope<ExecutorRef> g_executor_scope;
#endif
ExecutorScope::ExecutorScope(const std::shared_ptr<Executor>& current) {
g_executor_scope.EnterScope(current);
}
ExecutorScope::ExecutorScope(const std::string& scope_name,
const std::shared_ptr<Executor>& current) {
g_executor_scope.EnterScope(scope_name, current);
}
ExecutorScope::~ExecutorScope() {
g_executor_scope.ExitScope();
}
const std::shared_ptr<Executor> ExecutorScope::Current() {
if (g_executor_scope.ScopedLevel() > 0) {
return g_executor_scope.Current().content;
}
return Executor::getGlobalExecutor();
}
} // namespace Express
} // namespace MNN

View File

@ -8,23 +8,33 @@
#define FLATBUFFERS_PREFER_PRINTF
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include <map>
#include "core/MNNMemoryUtils.h"
#include "Utils.hpp"
#include <map>
#include "core/FileLoader.hpp"
#include <MNN/expr/Executor.hpp>
#include "core/TensorUtils.hpp"
#include "MNN_generated.h"
//#define MNN_OPEN_TIME_TRACE
#include "MNN/AutoTime.hpp"
#include "MNN/expr/ExecutorScope.hpp"
//#define MNN_EXPRESS_ERROR_REPORT
static inline std::string numberToString(int index) {
char s[10];
snprintf(s, 10, "%d", index);
return std::string(s);
}
static bool HasUnknownDim(const std::vector<int>& dims) {
for (const int& dim : dims) {
if (dim < 0) {
return true;
}
}
return false;
}
namespace MNN {
namespace Express {
void Variable::Info::syncSize() {
@ -87,8 +97,7 @@ bool VARP::fix(VARP::InputType type) const {
}
Expr::Expr(int outputSize) {
mInside.reset(new Inside);
mInside->mOutputInfos.resize(outputSize);
mInside.reset(new Inside(outputSize));
mOutputNames.resize(outputSize);
}
@ -117,27 +126,46 @@ void Expr::_addLinkForInputs(EXPRP expr) {
}
}
}
EXPRP Expr::create(Variable::Info&& info) {
EXPRP Expr::create(Variable::Info&& info, const void* ptr, VARP::InputType type, bool copy) {
EXPRP expr(new Expr(1));
expr->mOp = nullptr;
auto originPtr = info.ptr;
auto originPtr = ptr;
expr->mInside->mOutputInfos[0] = std::move(info);
auto& dstInfo = expr->mInside->mOutputInfos[0];
dstInfo.syncSize();
if (dstInfo.size > 0) {
expr->mExtraBuffer.reset(new char[dstInfo.size * dstInfo.type.bytes()], std::default_delete<char[]>());
expr->mInside->mOutputInfos[0].ptr = expr->mExtraBuffer.get();
expr->mInside->mInfoDirty = false;
dstInfo.syncSize();
Utils::copyInfoToTensor(expr->mInside->mOutputTensors[0], expr->mInside->mOutputInfos.data());
expr->mType = type;
if (type == VARP::CONSTANT) {
TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->usage = Tensor::InsideDescribe::CONSTANT;
} else if (type == VARP::INPUT) {
TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->usage = Tensor::InsideDescribe::INPUT;
} else {
expr->mInside->mOutputInfos[0].ptr = nullptr;
expr->mInside->mInfoDirty = true;
// VARP::TRAINABLE
TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->usage = Tensor::InsideDescribe::TRAINABLE;
}
if (dstInfo.size > 0 && copy) {
auto res = Utils::allocMemoryForHostTensor(expr->mInside->mOutputTensors[0]);
if (!res) {
MNN_ASSERT(false);
return nullptr;
}
} else {
expr->mInside->mOutputTensors[0]->buffer().host = nullptr;
}
if (nullptr == originPtr) {
expr->mType = VARP::INPUT;
if (type == VARP::INPUT && dstInfo.size > 0) {
expr->mInside->mContentDirty = true;
}
return expr;
}
expr->mType = VARP::CONSTANT;
::memcpy(expr->mInside->mOutputInfos[0].ptr, originPtr, dstInfo.size * dstInfo.type.bytes());
expr->mInside->mContentDirty = false;
if (copy) {
::memcpy(expr->mInside->mOutputTensors[0]->buffer().host, originPtr, dstInfo.size * dstInfo.type.bytes());
} else {
TensorUtils::getDescribe(expr->mInside->mOutputTensors[0])->memoryType = Tensor::InsideDescribe::MEMORY_OUTSIDE;
expr->mInside->mOutputTensors[0]->buffer().host = (uint8_t*)originPtr;
}
return expr;
}
EXPRP Expr::create(std::pair<std::shared_ptr<char>, int> extra, std::vector<VARP>&& inputs, int outputSize) {
@ -147,8 +175,7 @@ EXPRP Expr::create(std::pair<std::shared_ptr<char>, int> extra, std::vector<VARP
expr->mOp = flatbuffers::GetMutableRoot<Op>(extra.first.get());
expr->mOpBufferSize = extra.second;
expr->mInputs = std::move(inputs);
expr->mInside->mInputInfos.resize(expr->mInputs.size());
expr->mInside->mReq = Executor::getGlobalExecutor()->getRequirement(expr.get());
expr->mInside->mReq = ExecutorScope::Current()->getRequirement(expr.get());
_addLinkForInputs(expr);
return expr;
}
@ -161,34 +188,34 @@ EXPRP Expr::create(const OpT* op, std::vector<VARP> inputs, int outputSize) {
info.dim[0] = 1;
}
info.order = Utils::revertFormat(op->main.AsInput()->dformat);
info.ptr = nullptr;
info.type = Utils::revertDataType(op->main.AsInput()->dtype);
return create(std::move(info));
return create(std::move(info), nullptr, VARP::INPUT);
}
if (OpType_Const == op->type || OpType_TrainableParam == op->type) {
Variable::Info info;
info.dim = op->main.AsBlob()->dims;
info.order = Utils::revertFormat(op->main.AsBlob()->dataFormat);
info.ptr = nullptr;
void* ptr = nullptr;
info.type = Utils::revertDataType(op->main.AsBlob()->dataType);
switch (op->main.AsBlob()->dataType) {
case DataType_DT_INT8:
info.ptr = (void*)op->main.AsBlob()->int8s.data();
ptr = (void*)op->main.AsBlob()->int8s.data();
break;
case DataType_DT_INT32:
info.ptr = (void*)op->main.AsBlob()->int32s.data();
ptr = (void*)op->main.AsBlob()->int32s.data();
break;
case DataType_DT_UINT8:
info.ptr = (void*)op->main.AsBlob()->uint8s.data();
ptr = (void*)op->main.AsBlob()->uint8s.data();
break;
case DataType_DT_FLOAT:
info.ptr = (void*)op->main.AsBlob()->float32s.data();
ptr = (void*)op->main.AsBlob()->float32s.data();
break;
default:
break;
}
auto expr = create(std::move(info));
if (OpType_TrainableParam == op->type) {
//MNN_ASSERT(nullptr != ptr);
auto expr = create(std::move(info), ptr, VARP::CONSTANT);
if (OpType_TrainableParam == op->type && nullptr != ptr) {
expr->mType = VARP::TRAINABLE;
}
return expr;
@ -213,7 +240,7 @@ bool Expr::requireInfo() {
return false;
}
if (nullptr == mOp) {
return mInside->mOutputInfos[0].size > 0;
return !HasUnknownDim(mInside->mOutputInfos[0].dim);
}
bool ready = true;
for (int i = 0; i < mInputs.size(); ++i) {
@ -221,8 +248,8 @@ bool Expr::requireInfo() {
// The Variable is set nullptr by api
return false;
}
mInside->mInputInfos[i] = mInputs[i]->getInfo();
if (nullptr == mInside->mInputInfos[i] && (!mInside->mReq.supportError[i])) {
auto inputInfo = mInputs[i]->getInfo();
if (nullptr == inputInfo) {
#ifdef MNN_EXPRESS_ERROR_REPORT
MNN_ERROR("%s, %d input not ready\n", mName.c_str(), i);
#endif
@ -233,15 +260,19 @@ bool Expr::requireInfo() {
for (int i = 0; i < mInputs.size(); ++i) {
auto& v = mInputs[i];
if (mInside->mReq.shapeNeedContent[i]) {
// `readInternal` maybe return nullptr if element count is 0.
v->readInternal(true);
// For shape need content, the content must not be nullptr
auto ptr = v->readInternal(true);
if (nullptr == ptr) {
ready = false;
break;
}
}
}
if (!ready) {
return false;
}
//MNN_PRINT("Info %s, %p Start\n", mName.c_str(), this);
auto res = Executor::getGlobalExecutor()->computeInfo(this);
auto res = ExecutorScope::Current()->computeInfo(this);
//MNN_PRINT("Info Compute %s\n", mName.c_str());
if (NO_ERROR == res) {
@ -261,6 +292,14 @@ const std::vector<WeakEXPRP>& Variable::toExprs() const {
VARP Variable::create(EXPRP expr, int index) {
VARP res(new Variable(expr, index));
#ifdef MNN_EXPR_SHAPE_EAGER
auto info = expr->requireInfo();
if (!info) {
#ifdef MNN_EXPRESS_ERROR_REPORT
MNN_ERROR("Can't compute shape\n");
#endif
}
#endif
return res;
}
void Expr::replace(EXPRP old, EXPRP from) {
@ -307,16 +346,22 @@ void Expr::replace(EXPRP old, EXPRP from) {
old->mValid = from->mValid;
old->mInside = from->mInside;
old->mInputs = from->mInputs;
std::vector<Expr*> visited;
old->visitOutputs([&](EXPRP expr, int index) {
if (expr->mInside->mInfoDirty && expr->mValid && !expr->mInside->mLinkCache) {
if (expr->visited()) {
return false;
}
visited.emplace_back(expr.get());
expr->setVisited(true);
expr->mInside->mCache.reset();
expr->mInside->mCacheOffset = 0;
expr->mValid = true;
expr->mInside->mInfoDirty = true;
return true;
});
for (auto e : visited) {
e->setVisited(false);
}
}
void Variable::setName(const std::string& name) {
@ -351,7 +396,7 @@ bool Variable::input(VARP src) {
info = tempInfo.get();
}
auto dstInfo = getInfo();
bool needChange = nullptr == dstInfo || info->order != dstInfo->order || info->dim.size() != dstInfo->dim.size();
bool needChange = nullptr == dstInfo || info->order != dstInfo->order || info->dim.size() != dstInfo->dim.size() || info->type != dstInfo->type;
if (!needChange) {
for (int i=0; i<info->dim.size(); ++i) {
if (dstInfo->dim[i] != info->dim[i]) {
@ -362,22 +407,19 @@ bool Variable::input(VARP src) {
}
if (!mFrom->mInside->mCache) {
Executor::getGlobalExecutor()->makeCache({mFrom}, false);
ExecutorScope::Current()->makeCache({mFrom}, false);
}
if (needChange) {
bool needAlloc = info->size * info->type.bytes() > mFrom->mInside->mOutputInfos[0].size * mFrom->mInside->mOutputInfos[0].type.bytes();
mFrom->mInside->mOutputInfos[0] = *info;
if (needAlloc) {
mFrom->mExtraBuffer.reset(new char[info->size * info->type.bytes()], std::default_delete<char[]>());
}
mFrom->mInside->mOutputInfos[0].ptr = mFrom->mExtraBuffer.get();
mFrom->mInside->mCache->setShapeDirty(0, mFrom->outputInfo(0));
Utils::releaseMemoryForHostTensor(mFrom->inside()->mOutputTensors[0]);
Utils::copyInfoToTensor(mFrom->inside()->mOutputTensors[0], mFrom->inside()->mOutputInfos.data());
Utils::allocMemoryForHostTensor(mFrom->inside()->mOutputTensors[0]);
}
if (info->size) {
auto dstPtr = writeInternal(false);
auto srcPtr = src->readMap<void>();
if (nullptr == dstPtr || nullptr == srcPtr) {
MNN_ERROR("Alloc memory error or compute src error in Variable::Input\n");
//MNN_ERROR("Alloc memory error or compute src error in Variable::Input\n");
return false;
}
::memcpy(dstPtr, srcPtr, info->size * info->type.bytes());
@ -387,7 +429,7 @@ bool Variable::input(VARP src) {
} else {
informDirty();
}
mFrom->mInside->mCache->setContentReady();
mFrom->mInside->mContentDirty = false;
return true;
}
@ -396,23 +438,44 @@ void Variable::replace(VARP dst, VARP src) {
dst->setExpr(nullptr, 0);
return;
}
if (nullptr == dst) {
dst.mContent = src.mContent;
return;
}
if (src->mFrom.get() == dst->mFrom.get()) {
dst->mFromIndex = src->mFromIndex;
return;
}
if (src->mFrom->outputSize() != dst->mFrom->outputSize()) {
// Can't replace Expr, Just replace VARP
dst->mFrom->visitOutputs([src, dst](EXPRP expr, int index) {
src->mFrom->mTo.emplace_back(expr);
std::vector<Expr*> visited;
dst->mFrom->visitOutputs([src, dst, &visited](EXPRP expr, int index) {
if (expr->visited()) {
return false;
});
dst->mFrom->visitOutputs([src, dst](EXPRP expr, int index) {
}
expr->setVisited(true);
visited.emplace_back(expr.get());
expr->mInside->mCache.reset();
expr->mInside->mCacheOffset = 0;
expr->mValid = true;
expr->mInside->mInfoDirty = true;
expr->mInside->mContentDirty = true;
return true;
});
for (auto v : visited) {
v->setVisited(false);
}
dst->mFrom->visitOutputs([src, dst](EXPRP expr, int index) {
for (int i =0; i< expr->inputs().size(); ++i) {
auto input = expr->inputs()[i];
if (input == dst) {
expr->mInputs[i] = src;
}
}
src->mFrom->mTo.emplace_back(expr);
return false;
});
dst->mFrom = src->mFrom;
dst->mFromIndex = src->mFromIndex;
return;
@ -452,15 +515,19 @@ bool Variable::resize(INTS dims) {
}
info.dim = dims;
info.syncSize();
mFrom->mExtraBuffer.reset(new char[info.size * info.type.bytes()], std::default_delete<char[]>());
info.ptr = mFrom->mExtraBuffer.get();
Utils::copyInfoToTensor(mFrom->inside()->mOutputTensors[0], mFrom->inside()->mOutputInfos.data());
Utils::releaseMemoryForHostTensor(mFrom->inside()->mOutputTensors[0]);
if (0 >= info.size) {
return false;
}
bool res = Utils::allocMemoryForHostTensor(mFrom->inside()->mOutputTensors[0]);
if (!res) {
return false;
}
mFrom->mValid = true;
mFrom->mInside->mInputInfos.clear();
auto cache = mFrom->mInside->mCache;
if (nullptr != cache) {
cache->setShapeDirty(0, mFrom->outputInfo(0));
}
mFrom->inside()->mInfoDirty = false;
mFrom->inside()->mContentDirty = true;
mFrom->visitOutputs([](EXPRP expr, int index) { return expr->setInfoDirty(); });
return true;
}
@ -478,11 +545,12 @@ void Expr::visit(EXPRP expr, const std::function<bool(EXPRP)>& before, const std
void* Variable::readInternal(bool forShape) {
if (nullptr == mFrom->get()) {
if (VARP::INPUT == mFrom->mType) {
if (nullptr == mFrom->mInside->mCache) {
if (mFrom->mInside->mContentDirty) {
return nullptr;
}
}
return mFrom->outputInfo(mFromIndex)->ptr;
//MNN_ASSERT(nullptr != mFrom->inside()->mOutputTensors[0]->buffer().host);
return mFrom->inside()->mOutputTensors[0]->buffer().host;
}
auto res = mFrom->requireInfo();
if (false == res) {
@ -490,21 +558,26 @@ void* Variable::readInternal(bool forShape) {
}
auto cache = mFrom->inside()->mCache;
if (nullptr == cache) {
Executor::getGlobalExecutor()->makeCache({mFrom}, forShape);
ExecutorScope::Current()->makeCache({mFrom}, forShape);
cache = mFrom->inside()->mCache;
}
if (nullptr == cache) {
return nullptr;
}
if (NO_ERROR != Executor::getGlobalExecutor()->runCache(cache)) {
if (NO_ERROR != ExecutorScope::Current()->runCache(cache)) {
return nullptr;
}
cache->syncOutput(mFrom->mInside->mCacheOffset + mFromIndex, mFrom->outputInfo(mFromIndex));
return mFrom->outputInfo(mFromIndex)->ptr;
return Executor::mapOutput(cache.get(), mFrom->mInside->mCacheOffset + mFromIndex, mFrom->mInside->mOutputTensors[mFromIndex]);
}
void Variable::informDirty() {
mFrom->visitOutputs([](EXPRP expr, int index) {
std::vector<Expr*> visited;
mFrom->visitOutputs([&visited](EXPRP expr, int index) {
if (expr->visited()) {
return false;
}
visited.emplace_back(expr.get());
expr->setVisited(true);
if (expr->inside()->mReq.shapeNeedContent.empty()) {
// Not init
return false;
@ -514,28 +587,32 @@ void Variable::informDirty() {
expr->visitOutputs([](EXPRP e, int index) { return e->setInfoDirty(); });
return false;
}
if (expr->inside()->mContentDirty) {
return false;
}
expr->inside()->mContentDirty = true;
if (expr->inside()->mReq.contentNeedContent[index]) {
if (expr->inside()->mCache != nullptr) {
expr->inside()->mCache->setContentDirty();
Executor::setContentDirty(expr->inside()->mCache.get());
}
return true;
}
return false;
});
for (auto e : visited) {
e->setVisited(false);
}
}
void Variable::prepareCompute(const std::vector<VARP>& vars, bool forceCpu) {
std::vector<EXPRP> exprs;
for (auto v : vars) {
if (v->expr().first->inside()->mCache == nullptr) {
if (!v->expr().first->visited()) {
v->expr().first->inside()->mCache = nullptr;
v->expr().first->requireInfo();
v->expr().first->setVisited(true);
exprs.emplace_back(v->expr().first);
}
}
Executor::getGlobalExecutor()->makeCache(std::move(exprs), forceCpu);
for (auto v : vars) {
v->expr().first->setVisited(false);
}
ExecutorScope::Current()->makeCache(std::move(exprs), forceCpu);
}
void* Variable::writeInternal(bool inform) {
@ -545,16 +622,8 @@ void* Variable::writeInternal(bool inform) {
if (inform) {
informDirty();
}
auto cache = mFrom->mInside->mCache;
if (nullptr == cache) {
Executor::getGlobalExecutor()->makeCache({mFrom});
cache = mFrom->mInside->mCache;
}
if (nullptr == cache) {
return nullptr;
}
mFrom->mInside->mCache->setContentReady();
return mFrom->mInside->mOutputInfos[0].ptr;
mFrom->mInside->mContentDirty = false;
return mFrom->inside()->mOutputTensors[0]->host<void>();
}
void Variable::unMap() {
@ -591,12 +660,17 @@ bool Expr::setInfoDirty() {
mInside->mContentDirty = true;
mValid = true;
if (mInside->mCache != nullptr) {
mInside->mCache->setShapeDirty(0, nullptr);
Executor::setShapeDirty(mInside->mCache.get());
}
for (auto o : mInside->mOutputTensors) {
Utils::releaseMemoryForHostTensor(o);
}
return true;
}
std::vector<VARP> Variable::load(const char* fileName) {
AutoStorage<uint8_t> buffer;
{
FileLoader loader(fileName);
if (!loader.valid()) {
MNN_ERROR("Error for open %s\n", fileName);
@ -606,11 +680,11 @@ std::vector<VARP> Variable::load(const char* fileName) {
if (!loader.valid()) {
return {};
}
AutoStorage<uint8_t> buffer;
loader.merge(buffer);
if (buffer.get() == nullptr) {
return {};
}
}
return load(buffer.get(), buffer.size());
}
std::vector<VARP> Variable::load(const uint8_t* buffer, size_t length) {
@ -722,6 +796,7 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
} else {
MNN_ASSERT(1 == expr->outputSize());
auto& info = expr->mInside->mOutputInfos[0];
auto ptr = expr->mInside->mOutputTensors[0]->host<void>();
op.reset(new OpT);
if (expr->mType != VARP::INPUT) {
auto blob = new BlobT;
@ -730,16 +805,20 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
if (info.type.code == halide_type_float) {
blob->dataType = DataType_DT_FLOAT;
blob->float32s.resize(info.size);
::memcpy(blob->float32s.data(), info.ptr, info.size * sizeof(float));
} else if (info.type.code == halide_type_int) {
::memcpy(blob->float32s.data(), ptr, info.size * sizeof(float));
} else if (info.type.code == halide_type_int && info.type.bits == 32) {
blob->dataType = DataType_DT_INT32;
blob->int32s.resize(info.size);
::memcpy(blob->int32s.data(), info.ptr, info.size * sizeof(int));
}
else if (info.type.code == halide_type_uint && info.type.bits == 8) {
::memcpy(blob->int32s.data(), ptr, info.size * sizeof(int));
} else if (info.type.code == halide_type_int && info.type.bits == 8) {
blob->dataType = DataType_DT_INT8;
blob->int8s.resize(info.size);
auto pptr = (int8_t *)ptr;
::memcpy(blob->int8s.data(), ptr, info.size * sizeof(int8_t));
} else if (info.type.code == halide_type_uint && info.type.bits == 8) {
blob->dataType = DataType_DT_UINT8;
blob->uint8s.resize(info.size);
::memcpy(blob->uint8s.data(), info.ptr, info.size * sizeof(uint8_t));
::memcpy(blob->uint8s.data(), ptr, info.size * sizeof(uint8_t));
}
op->type = OpType_Const;
if (expr->mType == VARP::TRAINABLE) {
@ -781,12 +860,12 @@ void Variable::save(const std::vector<VARP>& vars, NetT* dest) {
auto op = dest->oplists[index].get();
auto tensorIndexOffset = varIndexInfo[expr];
for (int v=0; v<expr->outputSize(); ++v) {
auto const tensorIndex = tensorIndexOffset + v;
if (dest->tensorName[tensorIndex].empty()) {
auto subindex = tensorIndexOffset + v;
if (dest->tensorName[subindex].empty()) {
if (v == 0) {
dest->tensorName[tensorIndex] = op->name;
dest->tensorName[subindex] = op->name;
} else {
dest->tensorName[tensorIndex] = op->name + numberToString(v);
dest->tensorName[subindex] = op->name + numberToString(v);
}
}
}

210
express/Initializer.cpp Normal file
View File

@ -0,0 +1,210 @@
//
// Initializer.cpp
// MNN
//
// Created by MNN on 2019/11/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "Initializer.hpp"
#include <MNN/expr/ExprCreator.hpp>
#include <cmath>
#include <vector>
#include "Distributions.hpp"
#include "RandomGenerator.hpp"
namespace MNN {
namespace Express {
Express::VARP Initializer::createConstVar(Express::INTS dim, Express::Dimensionformat format) {
auto res = Express::_Input(dim, format, halide_type_of<float>());
this->onExecute(res);
res.fix(Express::VARP::CONSTANT);
return res;
}
class ConstantInitializer : public Initializer {
public:
ConstantInitializer(float value) : mConstant(value) {
}
virtual void onExecute(Express::VARP p) override {
const int count = p->getInfo()->size;
MNN_ASSERT(count > 0);
auto ptr = p->writeMap<float>();
for (int i = 0; i < count; i++) {
ptr[i] = mConstant;
}
}
private:
float mConstant;
};
Initializer* Initializer::constValue(float value) {
return new ConstantInitializer(value);
}
class UniformInitializer : public Initializer {
public:
UniformInitializer(float min = 0, float max = 1) {
mMin = min;
mMax = max;
}
virtual void onExecute(Express::VARP p) override {
const int count = p->getInfo()->size;
MNN_ASSERT(count > 0);
Distributions::uniform(count, mMin, mMax, p->writeMap<float>(), RandomGenerator::generator());
}
private:
float mMin;
float mMax;
};
Initializer* Initializer::uniform(float minValue, float maxValue) {
return new UniformInitializer(minValue, maxValue);
}
class XavierInitializer : public Initializer {
public:
XavierInitializer(VarianceNorm norm = FANIN) {
mNorm = norm;
}
virtual void onExecute(Express::VARP p) override {
const int count = p->getInfo()->size;
MNN_ASSERT(count > 0);
const std::vector<int> dims = p->getInfo()->dim;
// referenced from Caffe
// https://github.com/BVLC/caffe/blob/master/include/caffe/filler.hpp
int fanIn = count / dims[0];
int fanOut = dims.size() > 1 ? count / dims[1] : count;
float n = fanIn; // default: FANIN
if (mNorm == VarianceNorm::AVERAGE) {
n = (fanIn + fanOut) / 2.0f;
} else if (mNorm == VarianceNorm::FANOUT) {
n = fanOut;
}
float scale = sqrtf(3.0f / n);
Distributions::uniform(count, -scale, scale, p->writeMap<float>(), RandomGenerator::generator());
}
private:
VarianceNorm mNorm;
};
Initializer* Initializer::xavier(VarianceNorm norm) {
return new XavierInitializer(norm);
}
class GaussianInitializer : public Initializer {
public:
GaussianInitializer(float mean = 0, float std = 1) {
mMean = mean;
mStd = std;
}
virtual void onExecute(Express::VARP p) override {
const int count = p->getInfo()->size;
MNN_ASSERT(count > 0);
Distributions::gaussian(count, mMean, mStd, p->writeMap<float>(), RandomGenerator::generator());
}
private:
float mMean;
float mStd;
};
Initializer* Initializer::gauss(float mean, float std) {
return new GaussianInitializer(mean, std);
}
class MSRAInitializer : public Initializer {
public:
MSRAInitializer(VarianceNorm norm = FANIN) {
mNorm = norm;
}
virtual void onExecute(Express::VARP p) override {
const int count = p->getInfo()->size;
MNN_ASSERT(count > 0);
const std::vector<int> dims = p->getInfo()->dim;
// referenced from Caffe
// https://github.com/BVLC/caffe/blob/master/include/caffe/filler.hpp
int fanIn = count / dims[0];
int fanOut = dims.size() > 1 ? count / dims[1] : count;
float n = fanIn; // default: FANIN
if (mNorm == VarianceNorm::AVERAGE) {
n = (fanIn + fanOut) / 2.0f;
} else if (mNorm == VarianceNorm::FANOUT) {
n = fanOut;
}
float std = sqrtf(2.0f / n);
Distributions::gaussian(count, 0.0f, std, p->writeMap<float>(), RandomGenerator::generator());
}
private:
VarianceNorm mNorm;
};
Initializer* Initializer::MSRA(VarianceNorm norm) {
return new MSRAInitializer(norm);
}
class BilinearInitializer : public Initializer {
public:
BilinearInitializer() = default;
virtual void onExecute(Express::VARP p) override {
const int count = p->getInfo()->size;
MNN_ASSERT(count > 0);
const std::vector<int> dims = p->getInfo()->dim;
MNN_ASSERT(dims.size() == 4);
MNN_ASSERT(dims[2] == dims[3]); // NCHW, H == W
// referenced from Caffe
// https://github.com/BVLC/caffe/blob/master/include/caffe/filler.hpp
int f = ceilf(dims[3] / 2.0f);
float c = (dims[3] - 1) / (2.0f * f);
auto ptr = p->writeMap<float>();
for (int i = 0; i < count; i++) {
float x = i % dims[3];
float y = (i / dims[3]) % dims[2];
ptr[i] = (1 - std::fabs(x / f - c)) * (1 - std::fabs(y / f - c));
}
}
};
Initializer* Initializer::bilinear() {
return new BilinearInitializer();
}
class PositiveUnitball : public Initializer {
public:
PositiveUnitball() = default;
virtual void onExecute(Express::VARP p) override {
const int count = p->getInfo()->size;
MNN_ASSERT(count > 0);
const std::vector<int> dims = p->getInfo()->dim;
auto ptr = p->writeMap<float>();
Distributions::uniform(count, 0, 1, ptr, RandomGenerator::generator());
int dim = count / dims[0];
for (int i = 0; i < dims[0]; i++) {
float sum = 0;
for (int j = 0; j < dim; j++) {
sum += ptr[i * dim + j];
}
for (int j = 0; j < dim; j++) {
ptr[i * dim + j] = ptr[i * dim + j] / sum;
}
}
}
};
Initializer* Initializer::positiveUnitball() {
return new PositiveUnitball();
}
} // namespace Express
} // namespace MNN

43
express/Initializer.hpp Normal file
View File

@ -0,0 +1,43 @@
//
// Initializer.hpp
// MNN
//
// Created by MNN on 2019/11/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef Initializer_hpp
#define Initializer_hpp
#include <MNN/expr/Expr.hpp>
namespace MNN {
namespace Express {
class RandomGenerator;
class MNN_PUBLIC Initializer {
public:
Initializer() = default;
virtual ~Initializer() = default;
Express::VARP createConstVar(Express::INTS dim, Express::Dimensionformat format = Express::NCHW);
virtual void onExecute(Express::VARP p) = 0;
static Initializer* constValue(float value);
static Initializer* uniform(float minValue = 0.0f, float maxValue = 1.0f);
enum VarianceNorm {
FANIN,
FANOUT,
AVERAGE,
};
static Initializer* xavier(VarianceNorm norm = FANIN);
static Initializer* gauss(float mean = 0.0f, float std = 1.0f);
static Initializer* MSRA(VarianceNorm norm = FANIN);
static Initializer* bilinear();
static Initializer* positiveUnitball();
};
} // namespace Express
} // namespace MNN
#endif // Initializer_hpp

View File

@ -30,7 +30,18 @@ static DataType _convertDataType(halide_type_t type) {
}
return DataType_DT_INVALID;
}
static VARP _checkNC4HW4(VARP x) {
#ifdef MNN_EXPR_SHAPE_EAGER
auto info = x->getInfo();
if (nullptr != info && info->order == NC4HW4) {
return _Convert(x, NCHW);
}
#endif
return x;
}
static VARP _Binary(VARP x, VARP y, BinaryOpOperation operation) {
x = _checkNC4HW4(x);
y = _checkNC4HW4(y);
std::unique_ptr<OpT> op(new OpT);
op->main.type = OpParameter_BinaryOp;
op->type = OpType_BinaryOp;
@ -49,6 +60,7 @@ static VARP _Unary(VARP x, UnaryOpOperation operation) {
return (Variable::create(Expr::create(op.get(), {x})));
}
static VARP _Reduce(VARP x, INTS dim, ReductionType type, bool keepDim) {
x = _checkNC4HW4(x);
std::unique_ptr<OpT> op(new OpT);
op->main.type = OpParameter_ReductionParam;
op->type = OpType_Reduction;
@ -60,6 +72,7 @@ static VARP _Reduce(VARP x, INTS dim, ReductionType type, bool keepDim) {
return (Variable::create(Expr::create(op.get(), {x})));
}
static VARP _ReduceMutable(VARP x, VARP dim, ReductionType type, bool keepDim) {
x = _checkNC4HW4(x);
std::unique_ptr<OpT> op(new OpT);
op->main.type = OpParameter_ReductionParam;
op->type = OpType_Reduction;
@ -955,6 +968,7 @@ Returns:
A variable of type int.
*/
VARP _ArgMax(VARP input, int axis) {
input = _checkNC4HW4(input);
std::unique_ptr<OpT> op(new OpT);
op->main.type = OpParameter_ArgMax;
op->type = OpType_ArgMax;
@ -976,6 +990,7 @@ Returns:
A variable of type int.
*/
VARP _ArgMin(VARP input, int axis) {
input = _checkNC4HW4(input);
std::unique_ptr<OpT> op(new OpT);
op->main.type = OpParameter_ArgMax;
op->type = OpType_ArgMin;

View File

@ -5,6 +5,7 @@
// Created by MNN on 2019/08/20.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MergeOptimizer_hpp
#define MergeOptimizer_hpp

View File

@ -54,16 +54,14 @@ VARP _Input(INTS shape, Dimensionformat data_format, halide_type_t dtype) {
info.dim = std::move(shape);
info.order = data_format;
info.type = dtype;
info.ptr = nullptr;
return (Variable::create(Expr::create(std::move(info))));
return (Variable::create(Expr::create(std::move(info), nullptr, VARP::INPUT)));
}
VARP _Scalar(const void* ptr, halide_type_t type) {
Variable::Info info;
info.dim = {};
info.order = NHWC;
info.type = type;
info.ptr = (void*)ptr;
return (Variable::create(Expr::create(std::move(info))));
return (Variable::create(Expr::create(std::move(info), ptr, VARP::CONSTANT)));
}
/*create a constant variable.
Args:
@ -79,8 +77,7 @@ VARP _Const(const void* ptr, INTS shape, Dimensionformat format, halide_type_t t
info.dim = std::move(shape);
info.order = format;
info.type = type;
info.ptr = (void*)ptr;
return (Variable::create(Expr::create(std::move(info))));
return (Variable::create(Expr::create(std::move(info), ptr, VARP::CONSTANT)));
}
VARP _Const(float value, INTS shape, Dimensionformat format) {
@ -93,8 +90,8 @@ VARP _Const(float value, INTS shape, Dimensionformat format) {
for (int i = 0; i < info.size; ++i) {
values[i] = value;
}
info.ptr = (void*)values.data();
return (Variable::create(Expr::create(std::move(info))));
auto ptr = (void*)values.data();
return (Variable::create(Expr::create(std::move(info), ptr, VARP::CONSTANT)));
}
VARP _TrainableParam(const void* ptr, INTS dims, Dimensionformat format, halide_type_t type) {
@ -107,6 +104,23 @@ VARP _TrainableParam(float value, INTS dims, Dimensionformat format) {
v.fix(VARP::TRAINABLE);
return v;
}
VARP _InnerProduct(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS outputShape) {
std::unique_ptr<OpT> ipOp(new OpT);
ipOp->type = OpType_InnerProduct;
ipOp->main.type = OpParameter_InnerProduct;
ipOp->main.value = new InnerProductT;
auto ipParam = ipOp->main.AsInnerProduct();
ipParam->outputCount = outputShape[1];
if(!bias.empty()) {
ipParam->biasTerm = 1;
}
ipParam->weightSize = weight.size();
ipParam->weight = std::move(weight);
ipParam->bias = std::move(bias);
return (Variable::create(Expr::create(ipOp.get(), {x})));
}
VARP _Conv(VARP weight, VARP bias, VARP x, PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads) {
std::unique_ptr<OpT> convOp(new OpT);
@ -183,7 +197,7 @@ VARP _Conv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS
return (Variable::create(Expr::create(convOp.get(), {x})));
}
VARP _Conv(std::vector<int8_t>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, bool relu6) {
PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, bool relu6, int nbits) {
std::unique_ptr<OpT> convOp(new OpT);
convOp->type = OpType_Convolution;
if (channel[0] == channel[1] && channel[0] == group) {
@ -285,6 +299,42 @@ VARP _Deconv(VARP weight, VARP bias, VARP x, PaddingMode pad, INTS stride, INTS
return (Variable::create(Expr::create(std::move(convOp), {x, weight})));
}
VARP _Deconv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, bool relu6) {
std::unique_ptr<OpT> convOp(new OpT);
convOp->type = OpType_Deconvolution;
if (channel[0] == channel[1] && channel[0] == group) {
convOp->type = OpType_DeconvolutionDepthwise;
}
convOp->main.type = OpParameter_Convolution2D;
convOp->main.value = new Convolution2DT;
auto conv2D = convOp->main.AsConvolution2D();
conv2D->common.reset(new Convolution2DCommonT);
conv2D->common->padMode = _convertPadMode(pad);
if (pads.size() == 2) {
conv2D->common->padX = pads[0];
conv2D->common->padY = pads[1];
} else {
conv2D->common->pads = std::move(pads);
}
conv2D->common->strideX = stride[0];
conv2D->common->strideY = stride[1];
conv2D->common->group = group;
conv2D->common->outputCount = channel[1];
conv2D->common->inputCount = channel[0];
conv2D->common->dilateX = dilate[0];
conv2D->common->dilateY = dilate[1];
conv2D->common->kernelX = kernelSize[0];
conv2D->common->kernelY = kernelSize[1];
conv2D->common->relu6 = relu6;
conv2D->common->relu = relu;
MNN_ASSERT(weight.size() == channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]);
conv2D->weight = std::move(weight);
MNN_ASSERT(bias.size() == channel[1]);
conv2D->bias = std::move(bias);
return (Variable::create(Expr::create(convOp.get(), {x})));
}
static VARP _Pool(VARP x, INTS kernel, INTS stride, PoolType type, PaddingMode pad, INTS pads) {
std::unique_ptr<OpT> pool(new OpT);
pool->type = OpType_Pooling;
@ -381,9 +431,13 @@ x: A variable.
Returns:
output: A variable with the same type as `x`.
*/
VARP _Relu6(VARP x) {
VARP _Relu6(VARP x, float minValue, float maxValue) {
std::unique_ptr<OpT> relu(new OpT);
relu->type = OpType_ReLU6;
relu->main.value = new Relu6T;
relu->main.type = OpParameter_Relu6;
relu->main.AsRelu6()->maxValue = maxValue;
relu->main.AsRelu6()->minValue = minValue;
return (Variable::create(Expr::create(relu.get(), {x})));
}
/*Given an input value x, it computes the output as x if x > 0 and slopes * x if x <= 0.
@ -746,9 +800,12 @@ input: A variable.
Returns:
A variable of Halide_Type_Int.
*/
VARP _Shape(VARP input) {
VARP _Shape(VARP input, bool nchw) {
std::unique_ptr<OpT> shape(new OpT);
shape->type = OpType_Shape;
if (nchw) {
shape->defaultDimentionFormat = MNN_DATA_FORMAT_NCHW;
}
return (Variable::create(Expr::create(std::move(shape), {input})));
}
/*Stacks a list of rank-R variables into one rank-(R+1) variable.
@ -906,6 +963,21 @@ VARP _Elu(VARP features, float alpha) {
op->main.value = eluParam;
return (Variable::create(Expr::create(std::move(op), {features})));
}
/*Given an input value x, it computes the output as 1.0 if x > threshold and 0.0 if x <= threshold.
features: A variable of type Halide_Type_Float
threshold: threshold value
Returns:
A variable. Has the same type as features.
*/
VARP _Threshold(VARP features, float threshold) {
std::unique_ptr<OpT> op(new OpT);
op->type = OpType_Threshold;
auto eluParam = new ELUT;
op->main.type = OpParameter_ELU;
eluParam->alpha = threshold;
op->main.value = eluParam;
return (Variable::create(Expr::create(std::move(op), {features})));
}
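/* Usage sketch (illustrative only; the values and the _Const call below are assumptions,
   not taken from the original sources):
   std::vector<float> values = {-1.0f, 0.2f, 0.8f};
   auto x = _Const(values.data(), {3}, NCHW, halide_type_of<float>());
   auto y = _Threshold(x, 0.5f);
   // y->readMap<float>() is expected to give {0.0f, 0.0f, 1.0f}
*/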
/*Computes the size of the variable
Args:
input: A variable of type Halide_Type_Float or Halide_Type_Int
@ -1049,7 +1121,6 @@ std::vector<VARP> _Moments(VARP x, INTS axis, VARP shift, bool keepDims) {
op->main.type = OpParameter_MomentsParam;
momentsParam->dim = axis;
momentsParam->keepDims = keepDims;
momentsParam->dType = (MNN::DataType)Utils::convertDataType(x->getInfo()->type);
op->main.value = momentsParam;
EXPRP expr = Expr::create(std::move(op), {x}, 2);
std::vector<VARP> res;
@ -1405,7 +1476,7 @@ VARP _ZeroGrad(VARP x) {
}
VARP _Conv(std::vector<int8_t>&& weight, std::vector<int>&& bias, std::vector<float>&& scale, VARP x, INTS channel, INTS kernelSize,
PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu) {
PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, int nbits) {
std::unique_ptr<OpT> convOp(new OpT);
convOp->type = OpType_ConvInt8;
if (channel[0] == channel[1] && channel[0] == group) {
@ -1433,9 +1504,16 @@ VARP _Conv(std::vector<int8_t>&& weight, std::vector<int>&& bias, std::vector<fl
conv2D->symmetricQuan->bias = std::move(bias);
conv2D->symmetricQuan->scale = std::move(scale);
conv2D->symmetricQuan->weight = std::move(weight);
conv2D->symmetricQuan->nbits = nbits;
return (Variable::create(Expr::create(convOp.get(), {x})));
}
VARP _CosineSimilarity(VARP input0, VARP input1, VARP inputDim) {
std::unique_ptr<MNN::OpT> cosineSimilarityOp(new MNN::OpT);
cosineSimilarityOp->type = MNN::OpType_CosineSimilarity;
return (Variable::create(Expr::create(std::move(cosineSimilarityOp), {input0, input1, inputDim})));
}
VARP _FloatToInt8(VARP x, VARP scale, char minValue/*For future*/, char maxValue/*For future*/) {
auto xInfo = x->getInfo();
auto scaleInfo = scale->getInfo();

View File

@ -22,28 +22,7 @@ Optimizer::Parameters::~Parameters() {
}
}
std::shared_ptr<Optimizer> Optimizer::create(Config config) {
const int numThread = config.numThread;
auto forwardType = config.forwardType;
if (forwardType != MNN_FORWARD_ALL) {
if (MNNGetExtraBackendCreator(forwardType) == nullptr) {
return nullptr;
}
return std::shared_ptr<Optimizer>(new MergeOptimizer(config.forwardType, numThread, nullptr));
}
auto device = config.device;
if (CPU == device) {
return std::shared_ptr<Optimizer>(new MergeOptimizer(MNN_FORWARD_CPU, numThread, nullptr));
}
if (GPU == device) {
std::vector<MNNForwardType> types {MNN_FORWARD_METAL, MNN_FORWARD_OPENCL, MNN_FORWARD_VULKAN, MNN_FORWARD_OPENGL};
for (auto type : types) {
auto creator = MNNGetExtraBackendCreator(type);
if (nullptr != creator) {
return std::shared_ptr<Optimizer>(new MergeOptimizer(type, numThread, nullptr));
}
}
}
// Do nothing
return nullptr;
}

View File

@ -0,0 +1,45 @@
//
// RandomGenerator.hpp
// MNN
//
// Created by MNN on 2019/11/28.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef RandomGenerator_hpp
#define RandomGenerator_hpp
#include <MNN/MNNDefine.h>
#include <random>
namespace MNN {
namespace Express {
class MNN_PUBLIC RandomGenerator {
private:
RandomGenerator(int seed = std::random_device()()) {
mSeed = seed;
mGenerator.seed(mSeed);
}
~RandomGenerator() = default;
RandomGenerator(RandomGenerator &);
RandomGenerator &operator=(const RandomGenerator &);
private:
int mSeed;
std::mt19937 mGenerator;
public:
static std::mt19937 &generator(int seed = std::random_device()()) {
static RandomGenerator rng(seed);
return rng.mGenerator;
}
};
} // namespace Express
} // namespace MNN
#endif // RandomGenerator_hpp

View File

@ -10,8 +10,24 @@
#include <map>
#include "MNN_generated.h"
#include "core/TensorUtils.hpp"
#include "core/MNNMemoryUtils.h"
namespace MNN {
namespace Express {
Expr::Inside::Inside(int outputSize) {
mOutputInfos.resize(outputSize);
mOutputTensors.resize(outputSize);
for (int i=0; i<outputSize; ++i) {
mOutputTensors[i] = new Tensor;
TensorUtils::getDescribe(mOutputTensors[i])->memoryType = Tensor::InsideDescribe::MEMORY_HOST;
}
}
Expr::Inside::~Inside() {
for (auto t : mOutputTensors) {
delete t;
}
}
#define CONVERT(src, dst, f)\
if (f == src) return dst;
@ -61,7 +77,6 @@ void Utils::copyInfoToTensor(Tensor* dest, const Variable::Info* source) {
}
dest->buffer().dimensions = (int)source->dim.size();
dest->buffer().type = source->type;
dest->buffer().host = (uint8_t*)source->ptr;
TensorUtils::getDescribe(dest)->dimensionFormat = (MNN_DATA_FORMAT)Utils::convertFormat(source->order);
TensorUtils::setLinearLayout(dest);
}
@ -70,7 +85,31 @@ void Utils::copyTensorToInfo(Variable::Info* shape, const Tensor* tensor) {
shape->dim = tensor->shape();
shape->size = tensor->elementSize();
shape->order = Utils::revertFormat(TensorUtils::getDescribe(tensor)->dimensionFormat);
shape->ptr = tensor->host<float>();
}
bool Utils::allocMemoryForHostTensor(Tensor* dest) {
if (nullptr != dest->buffer().host) {
return true;
}
if (TensorUtils::getDescribe(dest)->memoryType != Tensor::InsideDescribe::MEMORY_HOST) {
return false;
}
auto size = dest->size();
if (0 >= size) {
return false;
}
dest->buffer().host = (uint8_t*)MNNMemoryAllocAlign(size, MNN_MEMORY_ALIGN_DEFAULT);
return dest->buffer().host != nullptr;
}
bool Utils::releaseMemoryForHostTensor(Tensor* dest) {
if (nullptr == dest->buffer().host) {
return true;
}
if (TensorUtils::getDescribe(dest)->memoryType != Tensor::InsideDescribe::MEMORY_HOST) {
return false;
}
MNNMemoryFreeAlign(dest->buffer().host);
dest->buffer().host = nullptr;
return true;
}
} // namespace Express

View File

@ -15,15 +15,16 @@
namespace MNN {
namespace Express {
struct Expr::Inside {
std::vector<const Variable::Info*> mInputInfos;
Inside(int outputSize);
~ Inside();
std::vector<Variable::Info> mOutputInfos;
std::vector<Tensor*> mOutputTensors;
Executor::Requirement mReq;
std::shared_ptr<Executor::ComputeCache::Unit> mUnit;
std::shared_ptr<Executor::Unit> mUnit;
std::shared_ptr<Executor::ComputeCache> mCache;
int mCacheOffset = 0;
bool mInfoDirty = true;
bool mContentDirty = true;
bool mLinkCache = false;
};
class Utils {
public:
@ -33,6 +34,8 @@ public:
static int convertFormat(Dimensionformat format);
static Express::Dimensionformat revertFormat(int format);
static halide_type_t revertDataType(DataType dataType);
static bool allocMemoryForHostTensor(Tensor* dest);
static bool releaseMemoryForHostTensor(Tensor* dest);
};
} // namespace Express
} // namespace MNN

View File

@ -10,7 +10,7 @@
#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;
namespace MNN {
namespace Train {
namespace Express {
FixModule::FixModule(std::vector<Express::VARP> output, std::vector<Express::VARP> parameters,
std::vector<std::pair<Express::VARP, Express::Dimensionformat>> inputs) {
for (auto p : parameters) {
@ -34,5 +34,19 @@ std::vector<Express::VARP> FixModule::onForward(const std::vector<Express::VARP>
}
return mOutput;
}
} // namespace Train
Module* FixModule::clone(CloneContext* ctx) const {
FixModule* module(new FixModule);
for (auto& it : mInputs) {
VARP v = ctx->getOrClone(it.first);
module->mInputs.push_back(std::make_pair(v, it.second));
}
for (auto& it : mOutput) {
VARP v = ctx->getOrClone(it);
module->mOutput.push_back(v);
}
return this->cloneBaseTo(ctx, module);
}
} // namespace Express
} // namespace MNN

View File

@ -8,9 +8,9 @@
#ifndef FixModule_hpp
#define FixModule_hpp
#include "Module.hpp"
#include <MNN/expr/Module.hpp>
namespace MNN {
namespace Train {
namespace Express {
class FixModule : public Module {
public:
@ -20,10 +20,14 @@ public:
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
virtual void onClearCache() override;
private:
FixModule() = default;
Module* clone(CloneContext* ctx) const override;
std::vector<std::pair<Express::VARP, Express::Dimensionformat>> mInputs;
std::vector<Express::VARP> mOutput;
};
} // namespace Train
} // namespace Express
} // namespace MNN
#endif

112
express/module/IfModule.cpp Normal file
View File

@ -0,0 +1,112 @@
//
// IfModule.cpp
// MNN
//
// Created by MNN on 2020/09/01.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "IfModule.hpp"
#include "MNN_generated.h"
namespace MNN {
namespace Express {
static int _findPos(const std::vector<std::string>& names, const std::string& key) {
for (int i=0; i<names.size(); ++i) {
if (names[i] == key) {
return i;
}
}
return -1;
}
std::vector<Express::VARP> IfModule::onForward(const std::vector<Express::VARP>& inputs) {
std::vector<Express::VARP> outputs(mOutputFromElse.size());
MNN_ASSERT(mOutputFromThen.size() == mOutputFromElse.size());
if (inputs[0]->readMap<int>()[0] > 0) {
std::vector<Express::VARP> subInputs(mInputForThen.size());
for (auto& p : mInputForThen) {
subInputs[p.first] = inputs[p.second];
}
auto subOutputs = mThen->onForward(subInputs);
for (int i=0; i<mOutputFromThen.size(); ++i) {
outputs[i] = subOutputs[mOutputFromThen[i]];
}
} else {
std::vector<Express::VARP> subInputs(mInputForElse.size());
for (auto& p : mInputForElse) {
subInputs[p.first] = inputs[p.second];
}
auto subOutputs = mElse->onForward(subInputs);
for (int i=0; i<mOutputFromElse.size(); ++i) {
outputs[i] = subOutputs[mOutputFromElse[i]];
}
}
return outputs;
}
IfModule* IfModule::create(const Op* op, const std::map<std::string, SubGraph>& subGraph) {
auto module = new IfModule;
auto ifParam = op->main_as_IfParam();
auto& thenG = subGraph.find(ifParam->then_graph()->str())->second;
auto& elseG = subGraph.find(ifParam->else_graph()->str())->second;
module->mElse = elseG.m;
module->mThen = thenG.m;
if (nullptr != op->name()) {
module->setName(op->name()->str());
}
/** Compute map index
std::vector<std::pair<int, int>> mInputForThen;
// First: mElse's index, Second: inputs' index
std::vector<std::pair<int, int>> mInputForElse;
std::vector<int> mOutputFromThen;
std::vector<int> mOutputFromElse;
*/
// Map Inputs
for (int i=0; i<ifParam->aliases_inputs()->size(); ++i) {
auto index = i;
auto data = ifParam->aliases_inputs()->GetAs<StringVec>(i);
if (nullptr == data->data()) {
continue;
}
for (int s=0; s<data->data()->size(); ++s) {
auto name = data->data()->GetAsString(s)->str();
auto thenPos = _findPos(thenG.inputs, name);
if (thenPos >= 0) {
module->mInputForThen.emplace_back(std::make_pair(thenPos, i));
}
auto elsePos = _findPos(elseG.inputs, name);
if (elsePos >= 0) {
module->mInputForElse.emplace_back(std::make_pair(elsePos, i));
}
}
}
// Map outputs
auto output = ifParam->aliases_outputs();
module->mOutputFromThen.resize(output->size());
module->mOutputFromElse.resize(output->size());
for (int i=0; i<output->size(); ++i) {
auto data = output->GetAs<StringVec>(i);
MNN_ASSERT(data->data()->size() == 2);
auto thenPos = _findPos(thenG.outputs, data->data()->GetAsString(0)->str());
MNN_ASSERT(thenPos >= 0);
auto elsePos = _findPos(elseG.outputs, data->data()->GetAsString(1)->str());
module->mOutputFromThen[i] = thenPos;
module->mOutputFromElse[i] = elsePos;
}
return module;
}
Module* IfModule::clone(CloneContext* ctx) const {
IfModule* module(new IfModule);
module->mInputForThen = mInputForThen;
module->mInputForElse = mInputForElse;
module->mOutputFromThen = mOutputFromThen;
module->mOutputFromElse = mOutputFromElse;
module->mThen.reset(mThen->clone(ctx));
module->mElse.reset(mElse->clone(ctx));
return this->cloneBaseTo(ctx, module);
}
} // namespace Express
} // namespace MNN
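A worked example of the index maps that IfModule::create builds (hypothetical graph, names assumed, not from the commit):
// Suppose the If op's outer inputs are  0: "cond", 1: "x", 2: "y",
// the then-graph's inputs are {"x"} and the else-graph's inputs are {"x", "y"}.
// Scanning aliases_inputs as above yields
//   mInputForThen == {{0, 1}}            // then-input 0 <- outer input 1 ("x")
//   mInputForElse == {{0, 1}, {1, 2}}    // else-inputs 0, 1 <- outer inputs 1, 2
// and onForward picks the branch from inputs[0]->readMap<int>()[0] > 0, then remaps the
// chosen sub-graph's outputs through mOutputFromThen / mOutputFromElse.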

View File

@ -0,0 +1,43 @@
//
// IfModule.hpp
// MNN
//
// Created by MNN on 2020/09/01.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef IfModule_hpp
#define IfModule_hpp
#include <MNN/expr/Module.hpp>
namespace MNN {
namespace Express {
class IfModule : public Module {
public:
virtual ~ IfModule() {
// Do nothing
}
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
static IfModule* create(const Op* op, const std::map<std::string, SubGraph>& subGraph);
private:
IfModule(){}
Module* clone(CloneContext* ctx) const override;
// First: mThen's index, Second: inputs' index
std::vector<std::pair<int, int>> mInputForThen;
// First: mElse's index, Second: inputs' index
std::vector<std::pair<int, int>> mInputForElse;
std::vector<int> mOutputFromThen;
std::vector<int> mOutputFromElse;
std::shared_ptr<Module> mThen;
std::shared_ptr<Module> mElse;
};
}
}
#endif /* IfModule_hpp */

182
express/module/Module.cpp Normal file
View File

@ -0,0 +1,182 @@
//
// Module.cpp
// MNN
//
// Created by MNN on 2019/11/25.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include <MNN/expr/Module.hpp>
#include <MNN/expr/ExprCreator.hpp>
#include "FixModule.hpp"
#include "PipelineModule.hpp"
#include "core/FileLoader.hpp"
namespace MNN {
namespace Express {
class EmptyModule : public Module {
public:
EmptyModule(const std::vector<Express::VARP>& parameters) {
for (auto p : parameters) {
addParameter(p);
}
}
virtual ~EmptyModule() {
// Do nothing
}
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override {
return {};
}
protected:
EmptyModule() = default;
Module* clone(Module::CloneContext* ctx) const override {
EmptyModule* module(new EmptyModule);
return this->cloneBaseTo(ctx, module);
}
};
Module* Module::createEmpty(const std::vector<Express::VARP>& parameters) {
return new EmptyModule(parameters);
}
Express::VARP Module::forward(Express::VARP input) {
return this->onForward({input})[0];
}
std::vector<Express::VARP> Module::parameters() const {
std::vector<Express::VARP> result;
_collectParameters(result);
return result;
}
bool Module::loadParameters(const std::vector<Express::VARP>& parameters) {
std::vector<Express::VARP> result;
_collectParameters(result);
if (parameters.empty() || parameters.size() != result.size()) {
MNN_ERROR("Error parameters, empty or parameter size not match \n");
return false;
}
for (int i=0; i<parameters.size(); ++i) {
if (nullptr != result[i].get()) {
// Check Origin parameter's size
auto dstInfo = result[i]->getInfo();
auto srcInfo = parameters[i]->getInfo();
if (dstInfo->dim.size() != srcInfo->dim.size() || dstInfo->order != srcInfo->order) {
MNN_ERROR("Error parameters %d, dim size or order not match \n", i);
return false;
}
if (dstInfo->size != srcInfo->size || dstInfo->type != srcInfo->type) {
MNN_ERROR("Error parameters %d, size or type not match \n", i);
return false;
}
}
Variable::replace(result[i], parameters[i]);
}
return true;
}
void Module::setIsTraining(const bool isTraining) {
mIsTraining = isTraining;
for (auto c : mChildren) {
c->setIsTraining(isTraining);
}
}
bool Module::getIsTraining() {
return mIsTraining;
}
void Module::registerModel(const std::vector<std::shared_ptr<Module>>& children) {
mChildren.insert(mChildren.begin(), children.begin(), children.end());
}
int Module::addParameter(VARP parameter) {
auto res = mParameters.size();
mParameters.emplace_back(parameter);
return (int)res;
}
void Module::setParameter(Express::VARP parameter, int index) {
if (index < 0 || index >= mParameters.size()) {
MNN_ERROR("Module error: index out of range: %d - %d:\n", index, (int)mParameters.size());
return;
}
mParameters[index] = parameter;
}
void Module::_collectParameters(std::vector<Express::VARP>& result) const {
for (auto p : mParameters) {
result.push_back(p);
}
for (auto c : mChildren) {
c->_collectParameters(result);
}
}
void Module::clearCache() {
for (auto c : mChildren) {
c->clearCache();
}
this->onClearCache();
}
Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, bool dynamic) {
AutoStorage<uint8_t> buffer;
{
FileLoader loader(fileName);
if (!loader.valid()) {
MNN_ERROR("Error for open %s\n", fileName);
return {};
}
loader.read();
if (!loader.valid()) {
return {};
}
loader.merge(buffer);
if (buffer.get() == nullptr) {
return {};
}
}
return load(inputs, outputs, buffer.get(), buffer.size(), dynamic);
}
Module* Module::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic) {
return PipelineModule::load(inputs, outputs, buffer, length, dynamic);
}
EXPRP Module::CloneContext::getOrClone(EXPRP expr) {
auto it = mExprMap.find(expr.get());
if (it == mExprMap.end()) {
// EXPRP replica = expr->clone(shareParams);
// TODO(hjchen2): Clone expr.
EXPRP replica = expr;
it = mExprMap.emplace(expr.get(), replica).first;
}
return it->second;
}
VARP Module::CloneContext::getOrClone(VARP var) {
auto it = mVarMap.find(var.get());
if (it == mVarMap.end()) {
// TODO(hjchen2): Clone variable.
VARP replica = var;
it = mVarMap.emplace(var.get(), replica).first;
}
return it->second;
}
Module* Module::clone(const Module* module, const bool shareParams) {
CloneContext context(shareParams);
return module->clone(&context);
}
Module* Module::cloneBaseTo(CloneContext* ctx, Module* module) const {
for (const Express::VARP& var : mParameters) {
module->mParameters.push_back(ctx->getOrClone(var));
}
module->mIsTraining = mIsTraining;
module->mName = mName;
module->mType = mType;
return module;
}
} // namespace Express
} // namespace MNN
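A usage sketch for the public Module API defined above (file name and tensor names are assumptions, not from the commit):
#include <MNN/expr/Module.hpp>
std::vector<MNN::Express::VARP> runOnce(MNN::Express::VARP inputVar) {
    using namespace MNN::Express;
    std::shared_ptr<Module> net(Module::load({"input"}, {"output"}, "model.mnn", false));
    if (net == nullptr) {
        return {};
    }
    // A replica sharing parameter VARPs with the original, see Module::clone above.
    std::shared_ptr<Module> replica(Module::clone(net.get(), true));
    return net->onForward({inputVar});
}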

View File

@ -6,9 +6,11 @@
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "NN.hpp"
#include <MNN/expr/NN.hpp>
#include "Distributions.hpp"
#include "FixModule.hpp"
#include "WhileModule.hpp"
#include "IfModule.hpp"
#include "Initializer.hpp"
#include "MNN_generated.h"
#include "RandomGenerator.hpp"
@ -17,7 +19,7 @@
using namespace MNN::Express;
namespace MNN {
namespace Train {
namespace Express {
static VARP _activate(VARP x, NN::ActivationFunctionType type) {
switch (type) {
case NN::None:
@ -58,6 +60,14 @@ public:
}
private:
DropoutModule() = default;
Module* clone(CloneContext* ctx) const override {
DropoutModule* module(new DropoutModule);
module->mDropRatio = mDropRatio;
return this->cloneBaseTo(ctx, module);
}
float mDropRatio;
};
@ -80,8 +90,8 @@ public:
mRunningVariance = _Const(bnPa->varData()->data(), {1, mChannels, 1, 1}, NCHW);
addParameter(mScale);
addParameter(mBias);
addParameter(mRunningVariance);
addParameter(mRunningMean);
mRunningVariancePos = addParameter(mRunningVariance);
mRunningMeanPos = addParameter(mRunningMean);
mReductionDims = {0, 2, 3};
setType("BatchNorm");
}
@ -110,8 +120,8 @@ public:
addParameter(mScale);
addParameter(mBias);
addParameter(mRunningVariance);
addParameter(mRunningMean);
mRunningVariancePos = addParameter(mRunningVariance);
mRunningMeanPos = addParameter(mRunningMean);
setType("BatchNorm");
}
@ -156,9 +166,8 @@ public:
mRunningVariance = _Const(mMomentum) * mRunningVariance + _Const(1 - mMomentum) * sampleVar;
outputData->setName(name());
outputData = _Convert(outputData, dimFormat);
Variable::prepareCompute({inputs[0], outputData, mRunningMean, mRunningVariance});
mRunningMean.fix(Express::VARP::CONSTANT);
mRunningVariance.fix(Express::VARP::CONSTANT);
setParameter(mRunningMean, mRunningMeanPos);
setParameter(mRunningVariance, mRunningVariancePos);
return {outputData};
}
auto rStd = _Const(1.0f) / _Sqrt(mRunningVariance + _Const(mEps));
@ -180,12 +189,31 @@ public:
}
private:
BatchNormModule() = default;
Module* clone(CloneContext* ctx) const override {
BatchNormModule* module(new BatchNormModule);
module->mMomentum = mMomentum;
module->mEps = mEps;
module->mScale = ctx->getOrClone(mScale);
module->mBias = ctx->getOrClone(mBias);
module->mRunningMean = ctx->getOrClone(mRunningMean);
module->mRunningVariance = ctx->getOrClone(mRunningVariance);
module->mRunningMeanPos = mRunningMeanPos;
module->mRunningVariancePos = mRunningVariancePos;
module->mChannels = mChannels;
module->mReductionDims = mReductionDims;
return this->cloneBaseTo(ctx, module);
}
float mMomentum = 0.99;
float mEps = 1e-5;
VARP mScale = nullptr;
VARP mBias = nullptr;
VARP mRunningMean = nullptr;
VARP mRunningVariance = nullptr;
int mRunningMeanPos = -1;
int mRunningVariancePos = -1;
int mChannels;
std::vector<int> mReductionDims;
};
@ -246,7 +274,18 @@ public:
tempOutput->setName(name());
return {tempOutput};
}
private:
ConvModule() = default;
Module* clone(CloneContext* ctx) const override {
ConvModule* module(new ConvModule);
module->mParameter = mParameter;
module->mParameter.weight = ctx->getOrClone(mParameter.weight);
module->mParameter.bias = ctx->getOrClone(mParameter.bias);
return this->cloneBaseTo(ctx, module);
}
NN::ConvParameters mParameter;
};
static std::tuple<VARP, VARP, int> _initParameters(const NN::ConvOption& option, bool hasBias,
@ -533,7 +572,23 @@ public:
}
private:
const NN::ConvOption mOption;
ConvOctaveModule() = default;
Module* clone(CloneContext* ctx) const override {
ConvOctaveModule* module(new ConvOctaveModule);
module->mOption = mOption;
module->mLLW = ctx->getOrClone(mLLW);
module->mLHW = ctx->getOrClone(mLHW);
module->mHLW = ctx->getOrClone(mHLW);
module->mHHW = ctx->getOrClone(mHHW);
module->mLBias = ctx->getOrClone(mLBias);
module->mHBias = ctx->getOrClone(mHBias);
module->mSplitInput = mSplitInput;
module->mGroup = mGroup;
return this->cloneBaseTo(ctx, module);
}
NN::ConvOption mOption;
VARP mLLW;
VARP mLHW;
VARP mHLW;
@ -555,7 +610,7 @@ Module* NN::ConvOctave(const ConvParameters& parameters,
module->setName(parameters.name);
return module;
}
Module* NN::Utils::ExtractNotRunableOp(Express::EXPRP expr) {
Module* NN::Utils::ExtractNotRunableOp(Express::EXPRP expr, const std::map<std::string, SubGraph>& subgraphs) {
if (nullptr == expr->get()) {
return nullptr;
}
@ -565,6 +620,12 @@ Module* NN::Utils::ExtractNotRunableOp(Express::EXPRP expr) {
if (expr->get()->type() == OpType_Dropout) {
return new DropoutModule(0.3f);
}
if (expr->get()->type() == OpType_While) {
return WhileModule::create(expr->get(), subgraphs);
}
if (expr->get()->type() == OpType_If) {
return IfModule::create(expr->get(), subgraphs);
}
return nullptr;
}
@ -622,6 +683,9 @@ public:
mLimitScale = _Scalar<float>(1.0f / limit);
mClampValue = _Scalar<float>(limit);
mInputScalePos = addParameter(mInputScale);
mOutputScalePos = addParameter(mOutputScale);
setType("ConvBNReluFused");
}
@ -632,31 +696,16 @@ public:
tempX = _Convert(tempX, NCHW);
}
auto originX = tempX;
VARP scale;
if (mFeatureScaleStatMethod == NN::PerTensor) {
scale = _Maximum(_ReduceMax(_Abs(tempX)), _Scalar<float>(0.0001f)) * mLimitScale;
} else {
auto originSize = originX->getInfo()->size;
auto batch = originX->getInfo()->dim[0];
auto channel = originX->getInfo()->dim[1];
if (originSize / batch / channel < 10) {
// Too small data
//MNN_PRINT("%d - %d - %d\n", originSize, batch, channel);
std::vector<int> dims = {1, channel, 1, 1};
auto dimVar = _Const(dims.data(), {4}, NCHW, halide_type_of<int32_t>());
auto singleScale = _Maximum(_ReduceMax(_Abs(tempX)), _Scalar<float>(0.0001f)) * mLimitScale;
scale = _Fill(dimVar, singleScale);
} else {
//MNN_PRINT("%d - %d - %d\n", originSize, batch, channel);
scale = _Maximum(_ReduceMax(_Abs(tempX), {0, 2, 3}, true), _Scalar<float>(0.0001f)) * mLimitScale;
}
}
scale.fix(VARP::CONSTANT);
VARP scale = _Maximum(_ReduceMax(_Abs(tempX)), _Scalar<float>(0.0001f)) * mLimitScale;
if (useScale == nullptr) {
tempX = _Round(tempX * _Reciprocal(scale)) * scale;
} else {
tempX = _Round(tempX * _Reciprocal(useScale)) * useScale;
}
// Break the gradient flow by using a cast
tempX = _Cast<float>(tempX);
// Move grad from tempX to originX
tempX = _Convert(tempX + _ZeroGrad(originX), originFormat);
return std::make_pair(tempX, scale);
}
@ -684,18 +733,16 @@ public:
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override {
VARP res;
if (getIsTraining()) {
Variable::prepareCompute({inputs[0]});
auto x = _Convert(inputs[0], NCHW);
// simulate weight quant
auto weightScale = _Maximum(_ReduceMax(_Abs(mWeight), {1, 2, 3}, true), _Scalar<float>(1E-6)) * mLimitScale;
weightScale.fix(VARP::CONSTANT);
auto weightTemp = _Round(mWeight * _Reciprocal(weightScale)) * weightScale;
weightTemp = weightTemp + _ZeroGrad(mWeight);
// simulate input quant to get original input scale
auto inputPair = fakeQuantFeature(x);
mInputScale = updateScale(mInputScale, inputPair.second);
mInputScale.fix(VARP::CONSTANT);
setParameter(mInputScale, mInputScalePos);
// simulate output quant to get original output scale
res = _Conv(weightTemp, mBias, _Convert(inputPair.first, NC4HW4), mOption.padMode, mOption.stride,
@ -709,10 +756,9 @@ public:
res = _activate(res, mActivation);
Variable::prepareCompute({conv, res});
auto outputPair = fakeQuantFeature(res);
mOutputScale = updateScale(mOutputScale, outputPair.second);
mOutputScale.fix(VARP::CONSTANT);
setParameter(mOutputScale, mOutputScalePos);
res = outputPair.first;
} else {
if (nullptr == mInputScale) {
@ -725,6 +771,7 @@ public:
auto x = _Convert(inputs[0], NCHW);
auto inputPair = fakeQuantFeature(x);
mInputScale = inputPair.second;
setParameter(mInputScale, mInputScalePos);
inputPair.first.fix(VARP::CONSTANT);
auto simuRes = _Conv(weightTemp, mBias, _Convert(inputPair.first, NC4HW4), mOption.padMode, mOption.stride,
@ -737,6 +784,7 @@ public:
Variable::prepareCompute({simuRes});
auto outputPair = fakeQuantFeature(simuRes);
mOutputScale = outputPair.second;
setParameter(mOutputScale, mOutputScalePos);
outputPair.first.fix(VARP::CONSTANT);
}
@ -772,12 +820,7 @@ public:
{
std::vector<int> dims = {x->getInfo()->dim[1]};
auto dimVar = _Const(dims.data(), {1}, NCHW, halide_type_of<int32_t>());
VARP channelScale;
if (mFeatureScaleStatMethod == NN::PerTensor) {
channelScale = _Reciprocal(_Fill(dimVar, mInputScale));
} else {
channelScale = _Reciprocal(mInputScale);
}
VARP channelScale = _Reciprocal(_Fill(dimVar, mInputScale));
x = _FloatToInt8(x, channelScale, -127, 127);// TODO add clamp
}
@ -824,12 +867,7 @@ public:
{
std::vector<int> dims = {res->getInfo()->dim[1]};
auto dimVar = _Const(dims.data(), {1}, NCHW, halide_type_of<int32_t>());
VARP channelScale;
if (mFeatureScaleStatMethod == NN::PerTensor) {
channelScale = _Fill(dimVar, mOutputScale);
} else {
channelScale = mOutputScale;
}
VARP channelScale = _Fill(dimVar, mOutputScale);
res = _Int8ToFloat(res, channelScale);
}
}
@ -838,6 +876,34 @@ public:
}
private:
ConvBNReluFusedModule() = default;
Module* clone(CloneContext* ctx) const override {
ConvBNReluFusedModule* module(new ConvBNReluFusedModule);
module->mConvParameter = mConvParameter;
module->mConvParameter.weight = ctx->getOrClone(mConvParameter.weight);
module->mConvParameter.bias = ctx->getOrClone(mConvParameter.bias);
module->mOption = mOption;
module->mGroup = mGroup;
module->mWeight = ctx->getOrClone(mWeight);
module->mBias = ctx->getOrClone(mBias);
module->mActivation = mActivation;
module->mLimitScale = ctx->getOrClone(mLimitScale);
module->mInputScalePos = mInputScalePos;
module->mOutputScalePos = mOutputScalePos;
module->mInputScale = ctx->getOrClone(mInputScale);
module->mOutputScale = ctx->getOrClone(mOutputScale);
module->mClampValue = ctx->getOrClone(mClampValue);
module->mMomentum = mMomentum;
module->mFeatureScaleStatMethod = mFeatureScaleStatMethod;
module->mScaleUpdateMethod = mScaleUpdateMethod;
if (mBatchNorm) {
module->mBatchNorm.reset(mBatchNorm->clone(ctx));
module->registerModel({module->mBatchNorm});
}
return this->cloneBaseTo(ctx, module);
}
NN::ConvParameters mConvParameter;
NN::ConvOption mOption;
int mGroup;
@ -846,6 +912,8 @@ private:
NN::ActivationFunctionType mActivation = NN::ActivationFunctionType::None;
std::shared_ptr<Module> mBatchNorm = nullptr;
VARP mLimitScale;
int mInputScalePos = -1;
int mOutputScalePos = -1;
VARP mInputScale = nullptr;
VARP mOutputScale = nullptr;
VARP mClampValue;
@ -870,5 +938,5 @@ Module* NN::ConvInt8(const ConvParameters& para, int bits, NN::FeatureScaleStatM
return new ConvBNReluFusedModule({conv}, featureMethod, method, bits);
}
} // namespace Train
} // namespace Express
} // namespace MNN
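The fakeQuantFeature hunk above collapses the old per-channel branch into a single per-tensor path. A scalar sketch of that computation (the bit width and helper name are assumptions; the real code operates on whole VARPs):
#include <algorithm>
#include <cmath>
// scale = max(|x| over the tensor, 1e-4) / limit, then x' = round(x / scale) * scale,
// mirroring _Maximum(_ReduceMax(_Abs(x)), 0.0001) * mLimitScale and _Round(x * _Reciprocal(scale)) * scale.
float fakeQuantScalar(float x, float maxAbsOverTensor, float limit /* e.g. 127 for 8 bit */) {
    float scale = std::max(maxAbsOverTensor, 1e-4f) * (1.0f / limit);  // mLimitScale = 1 / limit
    return std::roundf(x / scale) * scale;
}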

View File

@ -0,0 +1,761 @@
//
// PipelineModule.cpp
// MNN
//
// Created by MNN on 2020/01/09.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "PipelineModule.hpp"
#include "MNN_generated.h"
#include <set>
#include <vector>
#include "StaticModule.hpp"
#include "IfModule.hpp"
#include "WhileModule.hpp"
using namespace MNN::Express;
namespace MNN {
namespace Express {
//#define DYNAMIC
#define PIPELINE_MODULE "_pipeline_module__"
class ExprModule : public Module {
public:
ExprModule(EXPRP expr) {
mExpr = expr;
setName(expr->name());
mInputs = expr->inputs();
auto op = mExpr->get();
if (op) {
auto typeName = EnumNameOpType(op->type());
setType(typeName);
}
for (int i = 0; i < mInputs.size(); ++i) {
auto inputExpr = mInputs[i]->expr().first;
if (inputExpr->get() != nullptr) {
mInputs[i] = nullptr;
mInputIndexes.emplace_back(i);
continue;
}
switch (inputExpr->inputType()) {
case VARP::INPUT:
mInputs[i] = nullptr;
mInputIndexes.emplace_back(i);
break;
case VARP::CONSTANT:
break;
case VARP::TRAINABLE:
addParameter(mInputs[i]);
break;
default:
break;
}
}
}
virtual std::vector<VARP> onForward(const std::vector<VARP>& inputs) override {
MNN_ASSERT(mInputIndexes.size() == inputs.size());
if (nullptr == mExpr->get()) {
return {Variable::create(mExpr)};
}
std::vector<VARP> tempInputs = mInputs;
for (int i = 0; i < inputs.size(); ++i) {
tempInputs[mInputIndexes[i]] = inputs[i];
}
std::vector<VARP> outputVars;
auto newExpr = Expr::create(mExpr->extra(), std::move(tempInputs), mExpr->outputSize());
newExpr->setName(mExpr->name());
for (int i = 0; i < mExpr->outputSize(); ++i) {
outputVars.emplace_back(Variable::create(newExpr, i));
}
return outputVars;
}
const std::vector<int>& inputIndexes() const {
return mInputIndexes;
}
private:
Module* clone(CloneContext* ctx) const override {
ExprModule* module(new ExprModule(ctx->getOrClone(mExpr)));
for (const VARP& var : mInputs) {
module->mInputs.push_back(ctx->getOrClone(var));
}
module->mInputIndexes = mInputIndexes;
return this->cloneBaseTo(ctx, module);
}
EXPRP mExpr;
std::vector<VARP> mInputs;
std::vector<int> mInputIndexes;
};
Module* PipelineModule::extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph) {
std::function<std::pair<std::vector<int>, std::shared_ptr<Module>>(EXPRP)> transformFunction;
if (fortrain) {
transformFunction =
[&subGraph](EXPRP source) {
if (source->get() == nullptr) {
return std::make_pair(std::vector<int>{}, std::shared_ptr<Module>(nullptr));
}
std::shared_ptr<Module> m(NN::Utils::ExtractNotRunableOp(source, subGraph));
if (nullptr != m) {
m->setName(source->name());
return std::make_pair(std::vector<int>{}, m);
}
auto convExtracted = NN::Utils::ExtractConvolution(source);
if (convExtracted.weight == nullptr) {
return std::make_pair(std::vector<int>{}, std::shared_ptr<Module>(nullptr));
}
std::shared_ptr<Module> module(NN::Conv(convExtracted));
module->setName(source->name());
return std::make_pair(std::vector<int>{0}, module);
};
} else {
transformFunction = [&subGraph](EXPRP source) {
if (source->get() == nullptr) {
return std::make_pair(std::vector<int>{}, std::shared_ptr<Module>(nullptr));
}
std::shared_ptr<Module> m(NN::Utils::ExtractNotRunableOp(source, subGraph));
if (nullptr != m) {
m->setName(source->name());
return std::make_pair(std::vector<int>{}, m);
}
return std::make_pair(std::vector<int>{}, std::shared_ptr<Module>(nullptr));
};
}
return new PipelineModule(inputs, outputs, transformFunction);
}
PipelineModule::PipelineModule(std::vector<VARP> inputs, std::vector<VARP> outputs, const Transformer& transformFunction) {
setType(PIPELINE_MODULE);
std::vector<EXPRP> executeOrder;
std::set<EXPRP> inputExpr;
for (auto v : inputs) {
inputExpr.insert(v->expr().first);
}
for (auto output : outputs) {
Expr::visit(output->expr().first,
[&executeOrder, &inputExpr](EXPRP expr) {
if (expr->visited()) {
return false;
}
if (inputExpr.find(expr)!= inputExpr.end()) {
expr->setVisited(true);
executeOrder.emplace_back(expr);
return false;
}
return true;
},
[&executeOrder](EXPRP expr) {
//FUNC_PRINT_ALL(var->name().c_str(), s);
if (!expr->visited()) {
executeOrder.emplace_back(expr);
expr->setVisited(true);
}
return true;
});
}
for (auto expr : executeOrder) {
expr->setVisited(false);
}
// Set Indexes
std::map<EXPRP, int> indexes;
int currentIndexes = 0;
for (auto expr : executeOrder) {
indexes[expr] = currentIndexes;
currentIndexes += expr->outputSize();
}
std::set<EXPRP> inputSets;
mInputIndexes.clear();
mStackSize = currentIndexes;
for (auto v : inputs) {
auto inputExpr = v->expr();
mInputIndexes.emplace_back(indexes[inputExpr.first] + inputExpr.second);
inputSets.insert(inputExpr.first);
}
// Create all submodules
for (auto expr : executeOrder) {
if (inputSets.find(expr) != inputSets.end()) {
continue;
}
std::pair<std::vector<int>, std::shared_ptr<Module> > moduleResult;
bool extracted = false;
if (!transformFunction) {
moduleResult = std::make_pair(std::vector<int>{}, std::shared_ptr<Module>(nullptr));
} else {
moduleResult = transformFunction(expr);
}
if (moduleResult.second == nullptr) {
std::shared_ptr<Module> module(new ExprModule(expr));
moduleResult.first = ((ExprModule*)module.get())->inputIndexes();
moduleResult.second = module;
} else {
extracted = true;
}
auto subInputs = expr->inputs();
auto& exprInputIndexes = moduleResult.first;
std::vector<int> inputIndexes;
if (exprInputIndexes.empty() && extracted) {
inputIndexes.resize(subInputs.size());
for (int i = 0; i < inputIndexes.size(); ++i) {
auto inputExpr = subInputs[i]->expr();
inputIndexes[i] = indexes[inputExpr.first] + inputExpr.second;
}
} else {
inputIndexes.resize(exprInputIndexes.size());
for (int i = 0; i < inputIndexes.size(); ++i) {
auto inputExpr = subInputs[exprInputIndexes[i]]->expr();
inputIndexes[i] = indexes[inputExpr.first] + inputExpr.second;
}
}
std::vector<int> outputIndexes(expr->outputSize());
for (int i = 0; i < outputIndexes.size(); ++i) {
outputIndexes[i] = indexes[expr] + i;
}
mSubModules.emplace_back(std::make_tuple(moduleResult.second, inputIndexes, outputIndexes));
registerModel({moduleResult.second});
}
mOutputIndexes.clear();
for (auto output : outputs) {
auto outputExpr = output->expr();
mOutputIndexes.emplace_back(indexes[outputExpr.first] + outputExpr.second);
}
}
bool PipelineModule::turnQuantize(Module* module, const int bit, NN::FeatureScaleStatMethod featureScaleStatMethod, NN::ScaleUpdateMethod scaleUpdateMethod) {
if (nullptr == module || module->type() != PIPELINE_MODULE) {
MNN_ERROR("Invalide module for quantized\n");
return false;
}
((PipelineModule*)module)->toTrainQuant(bit, featureScaleStatMethod, scaleUpdateMethod);
return true;
}
std::vector<int> PipelineModule::countOutputReference(std::vector<int> outputIndices) {
MNN_ASSERT(outputIndices.size() > 0);
std::vector<int> countResult(outputIndices.size(), 0);
for (int i = 0; i < mSubModules.size(); i++) {
auto &m = mSubModules[i];
auto& theModule = std::get<0>(m);
auto name = theModule->name();
auto &inputIndices = std::get<1>(m);
for (int j = 0; j < inputIndices.size(); j++) {
int index = inputIndices[j];
for (int k = 0; k < countResult.size(); k++) {
if (index == outputIndices[k]) {
countResult[k]++;
}
}
}
}
return countResult;
}
void PipelineModule::toTrainQuant(const int bits, NN::FeatureScaleStatMethod featureScaleStatMethod,
NN::ScaleUpdateMethod scaleUpdateMethod) {
std::vector<int> needEraseIndices;
for (int i = 0; i < mSubModules.size(); i++) {
auto& m = mSubModules[i];
auto& theModule = std::get<0>(m);
auto moduleType = theModule->type();
//auto& inputIndices = std::get<1>(m);
auto& outputIndices = std::get<2>(m);
if (moduleType == "Conv" && i < mSubModules.size() - 1) {
auto& p1 = mSubModules[i+1];
auto p1Module = std::get<0>(p1);
auto& p1ModuleType = p1Module->type();
auto& p1InputIndices = std::get<1>(p1);
auto& p1OutputIndices = std::get<2>(p1);
auto convOutputCount = countOutputReference(outputIndices);
bool convSingleOutputReference = ((outputIndices.size() == 1) && (convOutputCount[0] == 1));
// only conv
if ((!convSingleOutputReference) || (p1ModuleType == "Conv") ||
(p1ModuleType != "BatchNorm" && p1ModuleType != "ReLU" && p1ModuleType != "ReLU6")) {
theModule.reset(NN::ConvBNReluFused({theModule}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
continue;
}
// conv + bn + ?
if (p1ModuleType == "BatchNorm") {
bool convBnConnected = ((convSingleOutputReference) && (p1InputIndices.size() == 1) && (p1InputIndices[0] == outputIndices[0]));
if (!convBnConnected) {
theModule.reset(NN::ConvBNReluFused({theModule}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
continue;
}
// last conv + bn
if (i == mSubModules.size() - 2) {
theModule.reset(NN::ConvBNReluFused({theModule, p1Module}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
outputIndices = p1OutputIndices;
needEraseIndices.emplace_back(i + 1);
continue;
}
// maybe there is a relu or relu6 after conv + bn
auto& p2 = mSubModules[i+2];
auto& p2Module = std::get<0>(p2);
auto p2ModuleType = p2Module->type();
auto& p2InputIndices = std::get<1>(p2);
auto& p2OutputIndices = std::get<2>(p2);
auto bnOutputCount = countOutputReference(p1OutputIndices);
bool bnSingleOutputReference = ((p1OutputIndices.size() == 1) && (bnOutputCount[0] == 1));
// only conv + bn
if ((!bnSingleOutputReference) || (p2ModuleType != "ReLU" && p2ModuleType != "ReLU6")) {
theModule.reset(NN::ConvBNReluFused({theModule, p1Module}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
outputIndices = p1OutputIndices;
needEraseIndices.emplace_back(i + 1);
continue;
} else { // conv + bn + relu or conv + bn + relu6
bool convBnReluConnected = ((bnSingleOutputReference) && (p2InputIndices.size() == 1) && (p2InputIndices[0] == p1OutputIndices[0]));
if (!convBnReluConnected) {
theModule.reset(NN::ConvBNReluFused({theModule, p1Module}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
outputIndices = p1OutputIndices;
needEraseIndices.emplace_back(i + 1);
continue;
}
theModule.reset(NN::ConvBNReluFused({theModule, p1Module, p2Module}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
outputIndices = p2OutputIndices;
needEraseIndices.emplace_back(i + 1);
needEraseIndices.emplace_back(i + 2);
continue;
}
}
// conv + relu or conv + relu6
if (p1ModuleType == "ReLU" || p1ModuleType == "ReLU6") {
bool convReluConnected = ((convSingleOutputReference) && (p1InputIndices.size() == 1) && (p1InputIndices[0] == outputIndices[0]));
if (!convReluConnected) {
theModule.reset(NN::ConvBNReluFused({theModule}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
continue;
}
theModule.reset(NN::ConvBNReluFused({theModule, p1Module}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
outputIndices = p1OutputIndices;
needEraseIndices.emplace_back(i + 1);
continue;
}
}
if (i == mSubModules.size() - 1 && moduleType == "Conv") {
theModule.reset(NN::ConvBNReluFused({theModule}, featureScaleStatMethod, scaleUpdateMethod, bits));
registerModel({theModule});
}
}
// erase useless submodules
const int eraseSize = needEraseIndices.size();
int alreadyErasedCount = 0;
for (int i = 0; i < eraseSize; i++) {
auto position = needEraseIndices[i] - alreadyErasedCount;
auto type = std::get<0>(mSubModules[position])->type();
MNN_ASSERT(type == "BatchNorm" || type == "ReLU" || type == "ReLU6");
mSubModules.erase(mSubModules.begin() + position);
alreadyErasedCount++;
}
}
std::vector<VARP> PipelineModule::onForward(const std::vector<VARP>& inputs) {
std::vector<VARP> mStack(mStackSize);
for (int i = 0; i < mInputIndexes.size(); ++i) {
mStack[mInputIndexes[i]] = inputs[i];
}
for (int index = 0; index < mSubModules.size(); ++index) {
auto& m = mSubModules[index];
std::vector<VARP> tempInputs(std::get<1>(m).size());
for (int i = 0; i < tempInputs.size(); ++i) {
tempInputs[i] = mStack[std::get<1>(m)[i]];
MNN_ASSERT(nullptr != tempInputs[i]);
}
std::vector<VARP> tempOutputs = std::get<0>(m)->onForward(tempInputs);
MNN_ASSERT(tempOutputs.size() == std::get<2>(m).size());
for (int i = 0; i < tempOutputs.size(); ++i) {
mStack[std::get<2>(m)[i]] = tempOutputs[i];
MNN_ASSERT(nullptr != tempOutputs[i]);
}
}
std::vector<VARP> outputs(mOutputIndexes.size());
for (int i = 0; i < mOutputIndexes.size(); ++i) {
outputs[i] = mStack[mOutputIndexes[i]];
}
return outputs;
}
void PipelineModule::onClearCache() {
// Do nothing
}
static std::map<std::string, SubGraph> _createSubGraph(const MNN::Net* net, bool dynamic) {
std::map<std::string, SubGraph> subGraphMap;
auto subGraphs = net->subgraphs();
if (nullptr == subGraphs) {
return subGraphMap;
}
for (int i=0; i<subGraphs->size(); ++i) {
auto graph = subGraphs->GetAs<SubGraphProto>(i);
std::vector<std::string> subInputs;
std::vector<std::string> subOutputs;
if (nullptr != graph->inputs()) {
for (int v=0; v<graph->inputs()->size(); ++v) {
auto index = graph->inputs()->data()[v];
subInputs.emplace_back(graph->tensors()->GetAsString(index)->str());
}
}
for (int v=0; v<graph->outputs()->size(); ++v) {
auto index = graph->outputs()->data()[v];
subOutputs.emplace_back(graph->tensors()->GetAsString(index)->str());
}
// Pack to Net for loading
std::shared_ptr<Module> submodule;
{
std::unique_ptr<SubGraphProtoT> _tempInfo(graph->UnPack());
std::unique_ptr<NetT> _tempNet(new NetT);
_tempNet->oplists = std::move(_tempInfo->nodes);
_tempNet->tensorName = std::move(_tempInfo->tensors);
flatbuffers::FlatBufferBuilder builder(1024);
auto offset = Net::Pack(builder, _tempNet.get());
builder.Finish(offset);
if (dynamic) {
submodule.reset(PipelineModule::load(subInputs, subOutputs, (const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), dynamic));
} else {
submodule.reset(new StaticModule((const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), subInputs, subOutputs));
}
if (graph->name() != nullptr) {
submodule->setName(graph->name()->str());
}
}
auto key = graph->name()->str();
SubGraph subgraph;
subgraph.inputs = std::move(subInputs);
subgraph.outputs = std::move(subOutputs);
subgraph.m = submodule;
subGraphMap.insert(std::make_pair(key, subgraph));
}
return subGraphMap;
}
struct SubModuleInfo {
std::vector<int> opList;
std::vector<int> inputs;
std::vector<int> outputs;
std::vector<uint8_t> tensorMask;
};
static std::vector<SubModuleInfo> _createSubModuleInfo(const MNN::Net* net, const std::set<int>& inputIndexes, const std::set<int>& outputIndexes) {
std::vector<SubModuleInfo> submodule;
SubModuleInfo current;
std::vector<int> inputOps;
// Separate the graph into several submodules
for (int i=0; i<net->oplists()->size(); ++i) {
auto op = net->oplists()->GetAs<Op>(i);
// Collect Input
if (op->type() == OpType_Input) {
inputOps.emplace_back(i);
continue;
}
if (op->type() == OpType_If || op->type() == OpType_While) {
if (current.opList.size() > 0) {
// Not empty
submodule.emplace_back(std::move(current));
}
SubModuleInfo controlOp;
controlOp.opList = {i};
submodule.emplace_back(std::move(controlOp));
continue;
}
current.opList.emplace_back(i);
}
if (!current.opList.empty()) {
submodule.emplace_back(std::move(current));
}
/** Compute all submodules' inputs and outputs */
// 0: not used, 1: input, 2: output, 3: mid, 4: valid output
for (int moduleIndex=0; moduleIndex < submodule.size(); ++moduleIndex) {
auto& m = submodule[moduleIndex];
if (1 == m.opList.size()) {
// Fast way to determine
auto op = net->oplists()->GetAs<Op>(m.opList[0]);
if (nullptr != op->inputIndexes()) {
m.inputs.resize(op->inputIndexes()->size());
::memcpy(m.inputs.data(), op->inputIndexes()->data(), m.inputs.size() * sizeof(int));
}
if (nullptr != op->outputIndexes()) {
m.outputs.resize(op->outputIndexes()->size());
::memcpy(m.outputs.data(), op->outputIndexes()->data(), m.outputs.size() * sizeof(int));
}
} else {
m.tensorMask = std::vector<uint8_t>(net->tensorName()->size(), 0);
auto& tensorMask = m.tensorMask;
for (auto opIndex : m.opList) {
auto op = net->oplists()->GetAs<Op>(opIndex);
if (nullptr != op->inputIndexes()) {
for (int v=0; v<op->inputIndexes()->size(); ++v) {
auto index = op->inputIndexes()->data()[v];
tensorMask[index] = tensorMask[index] | 1;
}
}
if (nullptr != op->outputIndexes()) {
for (int v=0; v<op->outputIndexes()->size(); ++v) {
auto index = op->outputIndexes()->data()[v];
tensorMask[index] = tensorMask[index] | 2;
}
}
}
for (int i=0; i<tensorMask.size(); ++i) {
if (0 == tensorMask[i]) {
continue;
}
if (1 == tensorMask[i]) {
m.inputs.emplace_back(i);
continue;
}
if (2 == tensorMask[i]) {
m.outputs.emplace_back(i);
continue;
}
if (3 == tensorMask[i]) {
if (outputIndexes.find(i) != outputIndexes.end()) {
m.outputs.emplace_back(i);
}
}
}
}
// Check if the module's input is valid
for (int i=0; i<m.inputs.size(); ++i) {
auto index = m.inputs[i];
if (inputIndexes.find(index) != inputIndexes.end()) {
continue;
}
bool find = false;
for (int sub=0; sub < moduleIndex; ++sub) {
for (auto out : submodule[sub].outputs) {
if (out == index) {
find = true;
break;
}
}
if (find) {
break;
}
}
if (find) {
continue;
}
// Find from module
for (int sub=0; sub < moduleIndex; ++sub) {
if (submodule[sub].tensorMask.empty()) {
continue;
}
if (submodule[sub].tensorMask[index] == 2) {
find = true;
break;
}
if (submodule[sub].tensorMask[index] == 3) {
submodule[sub].outputs.emplace_back(index);
submodule[sub].tensorMask[index] = 2;
find = true;
break;
}
}
MNN_ASSERT(find);
}
}
for (auto& m : submodule) {
m.tensorMask.clear();
}
return submodule;
}
static Module* _createSubModule(const MNN::Net* net, const SubModuleInfo& info, const std::map<std::string, SubGraph>& subs) {
if (1 == info.opList.size()) {
auto op = net->oplists()->GetAs<Op>(info.opList[0]);
if (OpType_If == op->type()) {
return IfModule::create(op, subs);
}
if (OpType_While == op->type()) {
return WhileModule::create(op, subs);
}
MNN_ASSERT(false);
}
std::unique_ptr<NetT> _tempNet(new NetT);
// Copy Tensor Name
_tempNet->tensorName.resize(net->tensorName()->size());
for (int i=0; i<net->tensorName()->size(); ++i) {
_tempNet->tensorName[i] = net->tensorName()->GetAsString(i)->str();
}
// Create Input node
std::vector<std::string> inputNames;
for (auto index : info.inputs) {
std::unique_ptr<OpT> inputOp(new OpT);
inputOp->outputIndexes = {index};
inputOp->type = OpType_Input;
inputOp->main.type = OpParameter_Input;
inputOp->main.value = new InputT;
inputOp->main.AsInput()->dims = {0, 0, -1, -1};
_tempNet->oplists.emplace_back(std::move(inputOp));
inputNames.emplace_back(_tempNet->tensorName[index]);
}
// Create compute node
for (auto opIndex : info.opList) {
std::unique_ptr<OpT> op(net->oplists()->GetAs<Op>(opIndex)->UnPack());
_tempNet->oplists.emplace_back(std::move(op));
}
// Get output names
std::vector<std::string> outputNames;
for (auto index : info.outputs) {
outputNames.emplace_back(_tempNet->tensorName[index]);
}
// Create Net Buffer
flatbuffers::FlatBufferBuilder builder(1024);
auto offset = Net::Pack(builder, _tempNet.get());
builder.Finish(offset);
_tempNet.reset();
return new StaticModule((const uint8_t*)builder.GetBufferPointer(), builder.GetSize(), inputNames, outputNames);
}
Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic) {
// Create Subgraph
auto net = GetNet(buffer);
auto subGraphs = net->subgraphs();
if (nullptr == net->oplists() || nullptr == net->tensorName()) {
MNN_ERROR("Invalid net, for null oplist or tensorName\n");
return nullptr;
}
if (!dynamic) {
if (nullptr == subGraphs) {
// Has no control flow, can just use static module
return new StaticModule(buffer, length, inputs, outputs);
}
}
auto subGraphMap = _createSubGraph(net, dynamic);
if (dynamic) {
// For dynamic mode
auto varMaps = Variable::loadMap(buffer, length);
std::vector<VARP> inputVars(inputs.size());
for (int i=0; i<inputs.size(); ++i) {
inputVars[i] = varMaps[inputs[i]];
}
std::vector<VARP> outputVars(outputs.size());
for (int i=0; i<outputs.size(); ++i) {
outputVars[i] = varMaps[outputs[i]];
}
return extract(inputVars, outputVars, false, subGraphMap);
}
std::set<int> inputIndexes;
std::set<int> outputIndexes;
std::map<std::string, int> inputsMap;
std::map<std::string, int> outputsMap;
for (int i=0; i<net->tensorName()->size(); ++i) {
auto tname = net->tensorName()->GetAsString(i)->str();
for (auto& s : inputs) {
if (tname == s) {
inputIndexes.emplace(i);
inputsMap.insert(std::make_pair(s, i));
break;
}
}
for (auto& s : outputs) {
if (tname == s) {
outputIndexes.emplace(i);
outputsMap.insert(std::make_pair(s, i));
break;
}
}
}
std::vector<int> inputIndexesVec(inputs.size());
for (int i=0; i<inputs.size(); ++i) {
inputIndexesVec[i] = inputsMap[inputs[i]];
}
std::vector<int> outputIndexesVec(outputs.size());
for (int i=0; i<outputs.size(); ++i) {
outputIndexesVec[i] = outputsMap[outputs[i]];
}
auto subModulesInfo = _createSubModuleInfo(net, inputIndexes, outputIndexes);
std::vector<std::shared_ptr<Module>> subModules(subModulesInfo.size());
for (int i=0; i<subModulesInfo.size(); ++i) {
subModules[i].reset(_createSubModule(net, subModulesInfo[i], subGraphMap));
}
auto result = new PipelineModule;
/**
Compute:
std::vector<std::tuple<std::shared_ptr<Module>, std::vector<int>, std::vector<int>>> mSubModules;
std::vector<int> mInputIndexes;
std::vector<int> mOutputIndexes;
int mStackSize = 0;
*/
// Build the stack index map, first: original tensor index, second: new stack index
std::map<int, int> stackMap;
int stackIndex = 0;
for (auto& m : subModulesInfo) {
for (auto index : m.inputs) {
if (stackMap.find(index) == stackMap.end()) {
stackMap.insert(std::make_pair(index, stackIndex));
stackIndex++;
}
}
for (auto index : m.outputs) {
if (stackMap.find(index) == stackMap.end()) {
stackMap.insert(std::make_pair(index, stackIndex));
stackIndex++;
}
}
}
result->mStackSize = stackMap.size();
for (int i=0; i<subModulesInfo.size(); ++i) {
auto& info = subModulesInfo[i];
// Reindex stack index
std::vector<int> subInputs(info.inputs.size());
for (int i=0; i<info.inputs.size(); ++i) {
subInputs[i] = stackMap[info.inputs[i]];
}
std::vector<int> subOutputs(info.outputs.size());
for (int i=0; i<info.outputs.size(); ++i) {
subOutputs[i] = stackMap[info.outputs[i]];
}
result->mSubModules.emplace_back(std::make_tuple(subModules[i], subInputs, subOutputs));
}
for (int i=0; i<inputIndexesVec.size(); ++i) {
inputIndexesVec[i] = stackMap[inputIndexesVec[i]];
}
for (int i=0; i<outputIndexesVec.size(); ++i) {
outputIndexesVec[i] = stackMap[outputIndexesVec[i]];
}
result->mInputIndexes = std::move(inputIndexesVec);
result->mOutputIndexes = std::move(outputIndexesVec);
return result;
}
Module* PipelineModule::clone(CloneContext* ctx) const {
PipelineModule* module(new PipelineModule);
for (const auto& it : mSubModules) {
const std::shared_ptr<Module>& submodule = std::get<0>(it);
const std::vector<int>& input_indices = std::get<1>(it);
const std::vector<int>& output_indices = std::get<2>(it);
std::shared_ptr<Module> replica_submodule(submodule->clone(ctx));
module->mSubModules.push_back(
std::make_tuple(replica_submodule, input_indices, output_indices));
module->registerModel({replica_submodule});
}
module->mInputIndexes = mInputIndexes;
module->mOutputIndexes = mOutputIndexes;
module->mStackSize = mStackSize;
return this->cloneBaseTo(ctx, module);
}
} // namespace Express
} // namespace MNN
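A sketch of the intended entry points (the input/output VARPs are assumed to come from the caller; not part of the commit): build a trainable pipeline from expression endpoints, then switch it to quantization-aware training with the defaults declared in PipelineModule.hpp below.
std::shared_ptr<MNN::Express::Module> makeQATPipeline(MNN::Express::VARP in, MNN::Express::VARP out) {
    using namespace MNN::Express;
    std::shared_ptr<Module> pipeline(PipelineModule::extract({in}, {out}, /*fortrain*/ true));
    // 8 bit, per-tensor feature scales, moving-average scale updates (see turnQuantize above).
    PipelineModule::turnQuantize(pipeline.get(), 8, NN::PerTensor, NN::MovingAverage);
    return pipeline;
}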

View File

@ -8,16 +8,20 @@
#ifndef PipelineModule_hpp
#define PipelineModule_hpp
#include "Module.hpp"
#include "NN.hpp"
#include <MNN/expr/Module.hpp>
#include <MNN/expr/NN.hpp>
#include <MNN/expr/ExprCreator.hpp>
namespace MNN {
namespace Train {
namespace Express {
class MNN_PUBLIC PipelineModule : public Module {
public:
typedef std::function<std::pair<std::vector<int>, std::shared_ptr<Module>>(Express::EXPRP)> Transformer;
static Module* extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain);
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic = false);
static Module* extract(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain, const std::map<std::string, SubGraph>& subGraph = {});
static Module* extractOrigin(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs, bool fortrain) {
return extract(inputs, outputs, fortrain);
}
static bool turnQuantize(Module* module, const int bits = 8, NN::FeatureScaleStatMethod featureScaleStatMethod = NN::PerTensor, NN::ScaleUpdateMethod scaleUpdateMethod = NN::MovingAverage);
void toTrainQuant(const int bits = 8, NN::FeatureScaleStatMethod featureScaleStatMethod = NN::PerTensor,
NN::ScaleUpdateMethod scaleUpdateMethod = NN::MovingAverage);
@ -26,14 +30,18 @@ public:
std::vector<int> countOutputReference(std::vector<int> outputIndices);
private:
PipelineModule(){}
PipelineModule(std::vector<Express::VARP> inputs, std::vector<Express::VARP> outputs,
const Transformer& transformFunction = {});
Module* clone(CloneContext* ctx) const override;
std::vector<std::tuple<std::shared_ptr<Module>, std::vector<int>, std::vector<int>>> mSubModules;
std::vector<Express::VARP> mStack;
std::vector<int> mInputIndexes;
std::vector<int> mOutputIndexes;
int mStackSize = 0;
};
} // namespace Train
} // namespace Express
} // namespace MNN
#endif

View File

@ -0,0 +1,186 @@
//
// StaticModule.cpp
// MNN
//
// Created by MNN on 2020/09/10.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "StaticModule.hpp"
#include <MNN/expr/ExprCreator.hpp>
#include <MNN/AutoTime.hpp>
#include "core/TensorUtils.hpp"
#include "core/Session.hpp"
#include <MNN/expr/Executor.hpp>
#include <MNN/AutoTime.hpp>
#include <MNN/expr/ExecutorScope.hpp>
namespace MNN {
namespace Express {
StaticModule::StaticModule(const void* buffer, size_t length, const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, bool shapeFix) : mInputs(inputs), mOutputs(outputs) {
mShapeFix = shapeFix;
mOutputNumbers = (int)outputs.size();
/** Compute:
std::vector<int> mOutputFromTensor;
std::vector<std::pair<int, int>> mOutputFromInput;
*/
for (int i=0; i<outputs.size(); ++i) {
auto& t = outputs[i];
bool fromInput = false;
for (int j=0; j<inputs.size(); ++j) {
if (inputs[j] == t) {
fromInput = true;
mOutputFromInput.emplace_back(std::make_pair(i, j));
break;
}
}
if (fromInput) {
continue;
}
mOutputFromTensor.emplace_back(i);
}
if (mOutputFromTensor.empty()) {
return;
}
mNet.reset(Interpreter::createFromBuffer(buffer, length));
#ifdef MNN_EXPR_ENABLE_PROFILER
mNet->setSessionMode(Interpreter::Session_Debug);
#else
mNet->setSessionMode(Interpreter::Session_Release);
#endif
if (mShapeFix) {
mNet->setSessionMode(Interpreter::Session_Input_Inside);
} else {
mNet->setSessionMode(Interpreter::Session_Input_User);
}
auto rt = Express::ExecutorScope::Current()->getRuntime();
// TODO: Add Config
ScheduleConfig config;
config.numThread = 1;
config.type = rt.first.begin()->first;
config.saveTensors = outputs;
mSession = mNet->createSession(config, rt);
mInputTensors.resize(inputs.size());
for (int i=0; i<inputs.size(); ++i) {
mInputTensors[i] = mNet->getSessionInput(mSession, inputs[i].c_str());
}
mOutputTensors.resize(mOutputFromTensor.size());
for (int i=0; i<mOutputFromTensor.size(); ++i) {
mOutputTensors[i] = mNet->getSessionOutput(mSession, outputs[mOutputFromTensor[i]].c_str());
}
}
StaticModule:: ~ StaticModule() {
// Do nothing
}
std::vector<Express::VARP> StaticModule::onForward(const std::vector<Express::VARP>& inputs) {
AUTOTIME;
std::vector<Express::VARP> outputs(mOutputNumbers);
for (auto& iter : mOutputFromInput) {
outputs[iter.first] = inputs[iter.second];
}
if (mOutputFromTensor.empty()) {
return outputs;
}
MNN_ASSERT(inputs.size() == mInputTensors.size());
for (int i=0; i<inputs.size(); ++i) {
auto info = inputs[i]->getInfo();
mInputTensors[i]->buffer().type = info->type;
auto des = TensorUtils::getDescribe(mInputTensors[i]);
if (info->order == Express::NCHW) {
des->dimensionFormat = MNN_DATA_FORMAT_NCHW;
}
if (info->order == Express::NHWC) {
des->dimensionFormat = MNN_DATA_FORMAT_NHWC;
}
if (info->order == Express::NC4HW4) {
des->dimensionFormat = MNN_DATA_FORMAT_NC4HW4;
}
mNet->resizeTensor(mInputTensors[i], info->dim);
}
if (!mShapeFix) {
for (int i=0; i<inputs.size(); ++i) {
mInputTensors[i]->buffer().host = (uint8_t*)inputs[i]->readMap<void>();
}
// FIXME: Use Interpreter's API
mSession->setNeedResize();
}
mNet->resizeSession(mSession);
if (mShapeFix) {
for (int i=0; i<inputs.size(); ++i) {
// For shape-only usage inputs, don't allocate memory
if (nullptr != mInputTensors[i]->host<void>()) {
::memcpy(mInputTensors[i]->host<void>(), inputs[i]->readMap<void>(), mInputTensors[i]->size());
}
}
}
#ifdef MNN_EXPR_ENABLE_PROFILER
auto globalExecutor = ExecutorScope::Current();
Timer cost;
TensorCallBackWithInfo beforeCallBack = [&cost] (const std::vector<Tensor*>&, const OperatorInfo* info) {
cost.reset();
return true;
};
TensorCallBackWithInfo afterCallBack = [&cost, globalExecutor] (const std::vector<Tensor*>&, const OperatorInfo* info) {
auto costTimes = (float)cost.durationInUs() / 1000.0f;
globalExecutor->addOpCostTime(info->type(), costTimes);
globalExecutor->addOpFlops(info->type(), info->flops());
return true;
};
mNet->runSessionWithCallBackInfo(mSession, beforeCallBack, afterCallBack);
#else
mNet->runSession(mSession);
#endif
for (int i=0; i<mOutputTensors.size(); ++i) {
Express::Variable::Info info;
info.dim = mOutputTensors[i]->shape();
info.type = mOutputTensors[i]->getType();
auto format = TensorUtils::getDescribe(mOutputTensors[i])->dimensionFormat;
info.order = Express::NHWC;
if (format == MNN_DATA_FORMAT_NCHW) {
info.order = Express::NCHW;
} else if (format == MNN_DATA_FORMAT_NC4HW4) {
info.order = Express::NC4HW4;
}
outputs[mOutputFromTensor[i]] = Express::Variable::create(Express::Expr::create(std::move(info), mOutputTensors[i]->host<void>(), Express::VARP::CONSTANT, true), 0);
//::memcpy(outputs[i]->writeMap<void>(), mOutputTensors[i]->host<void>(), mOutputTensors[i]->size());
}
return outputs;
}
Module* StaticModule::clone(CloneContext* ctx) const {
StaticModule* module(new StaticModule);
module->mInputs = mInputs;
module->mOutputs = mOutputs;
module->mShapeFix = mShapeFix;
module->mOutputNumbers = mOutputNumbers;
module->mOutputFromInput = mOutputFromInput;
module->mOutputFromTensor = mOutputFromTensor;
if (mOutputFromTensor.empty()) {
return this->cloneBaseTo(ctx, module);
}
module->mNet = mNet;
auto rt = Express::ExecutorScope::Current()->getRuntime();
ScheduleConfig config;
config.numThread = 1;
config.type = rt.first.begin()->first;
config.saveTensors = mOutputs;
module->mSession = module->mNet->createSession(config, rt);
module->mInputTensors.resize(mInputs.size());
module->mOutputTensors.resize(mOutputFromTensor.size());
for (int i=0; i<mInputs.size(); ++i) {
module->mInputTensors[i] =
module->mNet->getSessionInput(module->mSession, mInputs[i].c_str());
}
for (int i=0; i<mOutputFromTensor.size(); ++i) {
module->mOutputTensors[i] = module->mNet->getSessionOutput(
module->mSession, mOutputs[mOutputFromTensor[i]].c_str());
}
return this->cloneBaseTo(ctx, module);
}
}
}
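A sketch of how the loader uses the class above for a control-flow-free region (the buffer is assumed to hold a serialized MNN net whose tensor names include "x" and "y"; not part of the commit):
std::vector<MNN::Express::VARP> runStaticPart(const void* buffer, size_t length,
                                              MNN::Express::VARP xVar) {
    using namespace MNN::Express;
    std::shared_ptr<Module> part(new StaticModule(buffer, length, {"x"}, {"y"}, /*shapeFix*/ false));
    return part->onForward({xVar});
}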

View File

@ -0,0 +1,44 @@
//
// StaticModule.hpp
// MNN
//
// Created by MNN on 2020/09/10.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef StaticModule_hpp
#define StaticModule_hpp
#include <MNN/expr/Module.hpp>
#include <MNN/Interpreter.hpp>
namespace MNN {
namespace Express {
class StaticModule : public Module {
public:
StaticModule(const void* buffer, size_t length, const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, bool shapeFix = false);
virtual ~ StaticModule();
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
private:
StaticModule() = default;
Module* clone(CloneContext* ctx) const override;
std::vector<std::string> mInputs;
std::vector<std::string> mOutputs;
std::shared_ptr<Interpreter> mNet;
Session* mSession;
std::vector<Tensor*> mInputTensors;
std::vector<Tensor*> mOutputTensors;
bool mShapeFix;
int mOutputNumbers;
// Output indexes served by session output tensors: outputs[mOutputFromTensor[i]] comes from mOutputTensors[i]
std::vector<int> mOutputFromTensor;
// First: outputIndex, Second: input var index
std::vector<std::pair<int, int>> mOutputFromInput;
};
}
}
#endif

View File

@ -0,0 +1,186 @@
//
// WhileModule.cpp
// MNN
//
// Created by MNN on 2020/09/10.
// Copyright © 2018, Alibaba Group Holding Limited
//
#include "WhileModule.hpp"
#include <MNN/expr/ExprCreator.hpp>
#include "MNN_generated.h"
//#define MNN_OPEN_TIME_TRACE
#include <MNN/AutoTime.hpp>
namespace MNN {
namespace Express {
static int _findPos(const std::vector<std::string>& names, const std::string& key) {
for (int i=0; i<names.size(); ++i) {
if (names[i] == key) {
return i;
}
}
return -1;
}
WhileModule* WhileModule::create(const Op* op, const std::map<std::string, SubGraph>& subGraph) {
auto module = new WhileModule;
auto whileParam = op->main_as_WhileParam();
auto& body = subGraph.find(whileParam->body_graph()->str())->second;
auto& cond = subGraph.find(whileParam->cond_graph()->str())->second;
module->mBody = body.m;
module->mCond = cond.m;
/** Compute map index
int mCondInputNumber;
int mBodyInputNumber;
// First: mCondInputs' index, Second: inputs' index
std::vector<std::pair<int, int>> mInputForCond;
// First: mBodyInputs' index, Second: inputs' index
std::vector<std::pair<int, int>> mInputForBody;
std::vector<int> mOutputFromBody;
std::vector<std::pair<int, int>> mUpdateForCond;
std::vector<std::pair<int, int>> mUpdateForBody;
std::vector<std::pair<int, int>> mCondUpdateForCond;
std::vector<std::pair<int, int>> mCondUpdateForBody;
*/
// Map Inputs
module->mBodyInputNumber = body.inputs.size();
module->mCondInputNumber = cond.inputs.size();
for (int i=0; i<whileParam->aliases_inputs()->size(); ++i) {
auto index = i;
auto data = whileParam->aliases_inputs()->GetAs<StringVec>(i);
for (int s=0; s<data->data()->size(); ++s) {
auto name = data->data()->GetAsString(s)->str();
auto bodyInputPos = _findPos(body.inputs, name);
if (bodyInputPos >= 0) {
module->mInputForBody.emplace_back(std::make_pair(bodyInputPos, i));
}
auto condInputPos = _findPos(cond.inputs, name);
if (condInputPos >= 0) {
module->mInputForCond.emplace_back(std::make_pair(condInputPos, i));
}
}
}
// Map update
auto update = whileParam->aliases_updates();
std::map<int, int> replaceOutputs;
for (int i=0; i<update->size(); ++i) {
auto data = update->GetAs<StringVec>(i);
int bodyInputPos = -1;
int condInputPos = -1;
int bodyOutputPos = -1;
int condOutputPos = -1;
MNN_ASSERT(2 == data->data()->size());
auto outputName = data->data()->GetAsString(0)->str();
auto inputName = data->data()->GetAsString(1)->str();
bodyInputPos = _findPos(body.inputs, inputName);
condInputPos = _findPos(cond.inputs, inputName);
bodyOutputPos = _findPos(body.outputs, outputName);
condOutputPos = _findPos(cond.outputs, outputName);
auto updateBodyOutputPos = _findPos(body.outputs, inputName);
MNN_ASSERT(bodyOutputPos == -1 || condOutputPos == -1);
if (condOutputPos >= 0) {
if (bodyInputPos >= 0) {
module->mCondUpdateForBody.emplace_back(std::make_pair(bodyInputPos, condOutputPos));
}
if (condInputPos >= 0) {
module->mCondUpdateForCond.emplace_back(std::make_pair(condInputPos, condOutputPos));
}
}
if (bodyOutputPos >= 0) {
if (bodyInputPos >= 0) {
module->mUpdateForBody.emplace_back(std::make_pair(bodyInputPos, bodyOutputPos));
}
if (condInputPos >= 0) {
module->mUpdateForCond.emplace_back(std::make_pair(condInputPos, bodyOutputPos));
}
if (updateBodyOutputPos >= 0) {
replaceOutputs.insert(std::make_pair(updateBodyOutputPos, bodyOutputPos));
}
}
}
// Map outputs
auto output = whileParam->aliases_outputs();
for (int i=0; i<output->size(); ++i) {
auto data = output->GetAsString(i);
auto pos = _findPos(body.outputs, data->str());
MNN_ASSERT(pos >= 0);
if (replaceOutputs.find(pos) != replaceOutputs.end()) {
pos = replaceOutputs[pos];
}
module->mOutputFromBody.emplace_back(pos);
}
return module;
}
std::vector<Express::VARP> WhileModule::onForward(const std::vector<Express::VARP>& inputsI) {
std::vector<Express::VARP> condInputs(mCondInputNumber);
std::vector<Express::VARP> bodyInputs(mBodyInputNumber);
auto& inputs = inputsI;
for (auto& p : mInputForCond) {
condInputs[p.first] = inputs[p.second];
}
for (auto& p : mInputForBody) {
bodyInputs[p.first] = inputs[p.second];
}
std::vector<Express::VARP> outputs(mOutputFromBody.size());
while (true) {
auto res = mCond->onForward(condInputs)[0];
auto resPtr = res->readMap<int>();
if (resPtr[0] <= 0) {
break;
}
auto bodyOutputs = mBody->onForward(bodyInputs);
Express::Variable::prepareCompute(bodyOutputs);
for (int i=0; i<bodyOutputs.size(); ++i) {
auto p = bodyOutputs[i];
if (p->expr().first->get() != nullptr) {
auto ptr = p->readMap<void>();
auto info = p->getInfo();
auto newV = Express::_Input(info->dim, info->order, info->type);
if (nullptr != ptr) {
::memcpy(newV->writeMap<void>(), ptr, info->type.bytes() * info->size);
}
bodyOutputs[i] = newV;
}
}
for (int i=0; i<mOutputFromBody.size(); ++i) {
outputs[i] = bodyOutputs[mOutputFromBody[i]];
}
for (auto& p : mUpdateForCond) {
condInputs[p.first] = bodyOutputs[p.second];
}
for (auto& p : mUpdateForBody) {
bodyInputs[p.first] = bodyOutputs[p.second];
}
for (auto& p : mCondUpdateForCond) {
condInputs[p.first] = res;
}
for (auto& p : mCondUpdateForBody) {
bodyInputs[p.first] = res;
}
}
return outputs;
}
Module* WhileModule::clone(CloneContext* ctx) const {
WhileModule* module(new WhileModule);
module->mCondInputNumber = mCondInputNumber;
module->mBodyInputNumber = mBodyInputNumber;
module->mInputForCond = mInputForCond;
module->mInputForBody = mInputForBody;
module->mOutputFromBody = mOutputFromBody;
module->mUpdateForCond = mUpdateForCond;
module->mUpdateForBody = mUpdateForBody;
module->mCondUpdateForCond = mCondUpdateForCond;
module->mCondUpdateForBody = mCondUpdateForBody;
module->mCond.reset(mCond->clone(ctx));
module->mBody.reset(mBody->clone(ctx));
return this->cloneBaseTo(ctx, module);
}
};
};
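
For orientation, here is a hedged sketch of how this module might be obtained and driven; the Op pointer and subgraph map normally come from the model loader, so the helper below and its argument names are placeholders, not part of this change.

// Sketch only, assuming it sits next to WhileModule.cpp so the headers resolve.
#include "WhileModule.hpp"
#include "MNN_generated.h"
#include <map>
#include <memory>
#include <string>
#include <vector>

// The caller supplies the While op and the name -> SubGraph map from the loaded model.
std::vector<MNN::Express::VARP> runWhile(const MNN::Op* whileOp,
                                         const std::map<std::string, MNN::Express::SubGraph>& subgraphs,
                                         const std::vector<MNN::Express::VARP>& loopInputs) {
    std::unique_ptr<MNN::Express::WhileModule> loop(
        MNN::Express::WhileModule::create(whileOp, subgraphs));
    // onForward keeps evaluating cond/body until cond yields a value <= 0,
    // then returns the body outputs selected by aliases_outputs.
    return loop->onForward(loopInputs);
}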

View File

@ -0,0 +1,46 @@
//
// WhileModule.hpp
// MNN
//
// Created by MNN on 2020/09/10.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef WhileModule_hpp
#define WhileModule_hpp
#include <MNN/expr/Module.hpp>
namespace MNN {
namespace Express {
class WhileModule : public Module {
public:
virtual ~ WhileModule() {
// Do nothing
}
virtual std::vector<Express::VARP> onForward(const std::vector<Express::VARP>& inputs) override;
static WhileModule* create(const Op* op, const std::map<std::string, SubGraph>& subGraph);
private:
WhileModule(){}
Module* clone(CloneContext* ctx) const override;
int mCondInputNumber;
int mBodyInputNumber;
// First mCondInputs' index, Second: inputs's index
std::vector<std::pair<int, int>> mInputForCond;
// First mBodyInputs' index, Second: inputs's index
std::vector<std::pair<int, int>> mInputForBody;
std::vector<int> mOutputFromBody;
std::vector<std::pair<int, int>> mUpdateForCond;
std::vector<std::pair<int, int>> mUpdateForBody;
std::vector<std::pair<int, int>> mCondUpdateForCond;
std::vector<std::pair<int, int>> mCondUpdateForBody;
std::shared_ptr<Module> mCond;
std::shared_ptr<Module> mBody;
};
}
}
#endif

View File

@ -11,6 +11,7 @@
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <MNN/ErrorCode.hpp>
#include <MNN/MNNForwardType.h>
@ -67,6 +68,7 @@ class Session;
struct Content;
class Tensor;
class Backend;
class Runtime;
class MNN_PUBLIC OperatorInfo {
struct Info;
@ -89,6 +91,7 @@ protected:
typedef std::function<bool(const std::vector<Tensor*>&, const std::string& /*opName*/)> TensorCallBack;
typedef std::function<bool(const std::vector<Tensor*>&, const OperatorInfo*)> TensorCallBackWithInfo;
typedef std::pair<std::map<MNNForwardType, std::shared_ptr<Runtime>>, std::shared_ptr<Runtime>> RuntimeInfo;
/** net data holder. multiple sessions could share same net. */
class MNN_PUBLIC Interpreter {
@ -108,7 +111,43 @@ public:
static Interpreter* createFromBuffer(const void* buffer, size_t size);
~Interpreter();
enum SessionMode {
/** About CallBack, Default Session_Debug*/
/** runSessionWithCallBack is allowed and can get internal op info*/
Session_Debug = 0,
/** runSessionWithCallBack is not valid and can't get any info of op in session*/
Session_Release = 1,
/** About input tensor, Default Session_Input_Inside*/
/** The input tensor is allocated by the session; set input data after the session is resized*/
Session_Input_Inside = 2,
/** The input tensor is allocated by the user; set input data before the session is resized*/
Session_Input_User = 3,
};
/**
* @brief This API should be called before creating a session.
* @param mode session mode
* @return void
*/
void setSessionMode(SessionMode mode);
/**
* @brief This API should be called before creating a session.
* If the cache file exists, try to load the cache from it.
* After createSession, try to save the cache to the file.
* @param cacheFile cache file name
* @param keySize the first `keySize` bytes used as the key to check if the `cacheFile` exists.
* @return void
*/
void setCacheFile(const char* cacheFile, size_t keySize = 128);
public:
/**
* @brief create runtime info separately with schedule configs.
* @param configs session schedule configs.
*/
static RuntimeInfo createRuntime(const std::vector<ScheduleConfig>& configs);
/**
* @brief create session with schedule config. created session will be managed in net.
* @param config session schedule config.
@ -116,6 +155,13 @@ public:
*/
Session* createSession(const ScheduleConfig& config);
/**
* @brief create session with schedule config and user-specified runtime.
* @param config session schedule config.
* @param runtime runtime info used by the created session.
* @return created session if success, NULL otherwise.
*/
Session* createSession(const ScheduleConfig& config, const RuntimeInfo& runtime);
/**
* @brief create multi-path session with schedule configs. created session will be managed in net.
* @param configs session schedule configs.
@ -123,6 +169,14 @@ public:
*/
Session* createMultiPathSession(const std::vector<ScheduleConfig>& configs);
/**
* @brief create multi-path session with schedule configs and user-specified runtime.
created session will be managed in net.
* @param configs session schedule configs.
* @return created session if success, NULL otherwise.
*/
Session* createMultiPathSession(const std::vector<ScheduleConfig>& configs, const RuntimeInfo& runtime);
/**
* @brief release session.
* @param session given session.
@ -204,17 +258,39 @@ public:
*/
Tensor* getSessionOutput(const Session* session, const char* name);
enum SessionInfoCode {
/** memory used by the session, in MB (float*) */
MEMORY = 0,
/** floating-point operations needed by the session, in M FLOPs (float*) */
FLOPS = 1,
/** backends used in the session (int*, length >= number of configs used when creating the session) */
BACKENDS = 2,
ALL
};
/**
* @brief get all input tensors.
* @brief get session info
* @param session given session.
* @return all input tensors mapped with name.
* @param code given info code.
* @param ptr given info ptr, see SessionInfoCode for detail
* @return true if support the code, false otherwise.
*/
const std::map<std::string, Tensor*>& getSessionOutputAll(const Session* session) const;
bool getSessionInfo(const Session* session, SessionInfoCode code, void* ptr);
/**
* @brief get all output tensors.
* @param session given session.
* @return all output tensors mapped with name.
*/
const std::map<std::string, Tensor*>& getSessionOutputAll(const Session* session) const;
/**
* @brief get all input tensors.
* @param session given session.
* @return all input tensors mapped with name.
*/
const std::map<std::string, Tensor*>& getSessionInputAll(const Session* session) const;
public:
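
Taken together, the additions above suggest a usage flow like the sketch below; the model path, cache path, and config values are assumptions for illustration, not values from this diff.

// Sketch only: session mode, cache file, explicit runtime, and session info query.
#include <MNN/Interpreter.hpp>
#include <memory>
#include <vector>

int main() {
    std::shared_ptr<MNN::Interpreter> net(
        MNN::Interpreter::createFromFile("model.mnn"));       // placeholder model path
    net->setSessionMode(MNN::Interpreter::Session_Release);    // must precede createSession
    net->setCacheFile("model.cache");                           // placeholder cache path
    MNN::ScheduleConfig config;                                 // default: CPU
    auto runtime = MNN::Interpreter::createRuntime({config});
    MNN::Session* session = net->createSession(config, runtime);
    float memoryMB = 0.0f;
    net->getSessionInfo(session, MNN::Interpreter::MEMORY, &memoryMB);
    net->runSession(session);
    return 0;
}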

View File

@ -38,13 +38,7 @@
} \
}
#else
#define MNN_ASSERT(x) \
{ \
int res = (x); \
if (!res) { \
MNN_ERROR("Error for %d\n", __LINE__); \
} \
}
#define MNN_ASSERT(x)
#endif
#define FUNC_PRINT(x) MNN_PRINT(#x "=%d in %s, %d \n", x, __func__, __LINE__);

View File

@ -23,8 +23,8 @@ typedef enum {
/*Hand-written Metal*/
MNN_FORWARD_METAL = 1,
/*Use iOS's MPS instead of hand-written Metal, not supported yet*/
MNN_FORWARD_MPS = 2,
/*NVIDIA GPU API*/
MNN_FORWARD_CUDA = 2,
/*Android / Common Device GPU API*/
MNN_FORWARD_OPENCL = 3,

View File

@ -12,6 +12,7 @@
#include <vector>
#include <MNN/HalideRuntime.h>
#include <MNN/MNNDefine.h>
#define MNN_MAX_TENSOR_DIM 6
namespace MNN {

View File

@ -10,6 +10,7 @@
#include <MNN/ErrorCode.hpp>
#include <MNN/expr/Expr.hpp>
#include <MNN/Tensor.hpp>
#include <MNN/Interpreter.hpp>
#include <vector>
#include <mutex>
#include <set>
@ -17,41 +18,19 @@
namespace MNN {
class Backend;
class Execution;
class Runtime;
struct Op;
namespace Express {
class MNN_PUBLIC Executor {
public:
class ComputeCache {
public:
void setShapeDirty(int offset, Variable::Info* info);
void setContentDirty();
void setContentReady();
void syncInput(int offset, const Variable::Info* info);
void syncOutput(int offset, Variable::Info* info);
struct TensorContent {
std::shared_ptr<Tensor> tensor;
int refCount = 0;
void reset();
bool aliveOutside = false;
};
class ComputeCache;
struct Unit;
virtual ~ ComputeCache() {}
ComputeCache() {}
virtual ErrorCode compute() = 0;
virtual ErrorCode resize() = 0;
protected:
// Get the index tensor with the need of needBackend
// If the Tensor don't belong to the backend, need use needBackend to alloc it and return
virtual Tensor* getTensor(int index, bool host) = 0;
void _setShapeDirty();
friend class Executor;
bool mContentDirty = true;
bool mShapeDirty = true;
};
static void setShapeDirty(ComputeCache* cache);
static void setContentDirty(ComputeCache* cache);
static void* mapOutput(ComputeCache* cache, int offset, Tensor* dest);
struct Requirement {
std::vector<bool> contentNeedContent;
std::vector<bool> shapeNeedContent;
std::vector<bool> supportError;
};
~Executor();
Requirement getRequirement(Expr* expr) const;
@ -65,25 +44,27 @@ public:
};
void gc(GCFlag flag = FULL);
static std::shared_ptr<Executor> getGlobalExecutor();
static std::shared_ptr<Executor> newExecutor(MNNForwardType type,
const BackendConfig& config,
int numberThread);
void resetProfile();
void dumpProfile();
void addOpCostTime(int op, float costTime);
void addOpCostTime(const std::string& type, float costTime);
void addOpFlops(const std::string& type, float flops);
class Profiler;
static RuntimeInfo getRuntime();
private:
void _createSingle(EXPRP expr);
void _create(const std::vector<EXPRP>& outputs, std::set<std::shared_ptr<Executor::ComputeCache>>&& inputCaches, std::vector<ComputeCache::TensorContent>&& tensors, bool forceCPU);
void _makeCache(const std::vector<EXPRP>& outputs, bool forceCPU);
void _create(const std::vector<EXPRP>& outputs, std::set<std::shared_ptr<Executor::ComputeCache>>&& inputCaches, std::set<std::shared_ptr<Expr::Inside>>&& inputNode, bool forceCPU);
void _addToCache(const std::vector<std::shared_ptr<ComputeCache>>& caches);
void _resetCache();
void _visit(EXPRP expr, std::set<std::shared_ptr<Executor::ComputeCache>>& inputCaches, std::vector<ComputeCache::TensorContent>& tensors);
void _visit(EXPRP expr, std::set<std::shared_ptr<Executor::ComputeCache>>& inputCaches, std::set<std::shared_ptr<Expr::Inside>>& inputNode);
Executor(std::shared_ptr<Backend> backend);
std::shared_ptr<Backend> mBackend;
std::shared_ptr<Backend> mBackupBackend;
Executor(std::shared_ptr<Runtime> backend, MNNForwardType type);
std::pair<std::shared_ptr<Runtime>, MNNForwardType> mRuntime;
std::pair<std::shared_ptr<Runtime>, MNNForwardType> mBackupRuntime;
std::mutex mMutex;
std::vector<std::shared_ptr<Tensor>> mStack;
std::vector<Tensor*> mStackInputs;
std::vector<Tensor*> mStackOutputs;
std::shared_ptr<Profiler> mProfiler;
};
} // namespace Express

View File

@ -0,0 +1,33 @@
//
// ExecutorScope.hpp
// MNN
//
// Created by MNN on 2020/10/26.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_EXPR_EXECUTOR_SCOPE_HPP_
#define MNN_EXPR_EXECUTOR_SCOPE_HPP_
#include <MNN/expr/Executor.hpp>
namespace MNN {
namespace Express {
struct ExecutorScope final {
public:
ExecutorScope() = delete;
explicit ExecutorScope(const ExecutorScope&) = delete;
explicit ExecutorScope(const std::shared_ptr<Executor>& current);
explicit ExecutorScope(const std::string& scope_name,
const std::shared_ptr<Executor>& current);
virtual ~ExecutorScope();
static const std::shared_ptr<Executor> Current();
};
} // namespace Express
} // namespace MNN
#endif // MNN_EXPR_EXECUTOR_SCOPE_HPP_
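
A minimal sketch of how this scope guard combines with Executor::newExecutor from the header earlier in this diff; the thread count and backend config are assumptions.

// Sketch only: evaluate work under a dedicated executor instead of the global one.
#include <MNN/expr/Executor.hpp>
#include <MNN/expr/ExecutorScope.hpp>

void runScoped() {
    MNN::BackendConfig config;                                  // default precision/power/memory
    std::shared_ptr<MNN::Express::Executor> executor =
        MNN::Express::Executor::newExecutor(MNN_FORWARD_CPU, config, 4);
    MNN::Express::ExecutorScope scope("worker", executor);     // pushed for the current scope
    // Expressions evaluated here use `executor`; it is popped when `scope` is destroyed.
}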

View File

@ -87,6 +87,7 @@ public:
};
bool fix(InputType type) const;
private:
friend class Variable;
std::shared_ptr<Variable> mContent;
};
inline bool operator==(Variable* src, VARP dst) {
@ -107,7 +108,6 @@ public:
INTS dim;
halide_type_t type;
int size;
void* ptr = nullptr;
void syncSize();
};
const std::string& name() const;
@ -173,7 +173,7 @@ private:
class MNN_PUBLIC Expr {
public:
struct Inside;
static EXPRP create(Variable::Info&& info);
static EXPRP create(Variable::Info&& info, const void* ptr, VARP::InputType type, bool copy = true);
static EXPRP create(const OpT* op, std::vector<VARP> inputs, int outputSize = 1);
static EXPRP create(std::pair<std::shared_ptr<char>, int> extra, std::vector<VARP>&& inputs, int outputSize = 1);
static EXPRP create(std::unique_ptr<OpT>&& op, std::vector<VARP> inputs, int outputSize = 1) {
@ -188,7 +188,7 @@ public:
return mInputs;
}
int outputSize() const {
return mOutputNames.size();
return (int)mOutputNames.size();
}
static void replace(EXPRP oldExpr, EXPRP newExpr);
bool requireInfo();

View File

@ -8,9 +8,14 @@
#ifndef MNN_Train_Module_hpp
#define MNN_Train_Module_hpp
#include <vector>
#include <unordered_map>
#include <MNN/expr/Expr.hpp>
namespace MNN {
namespace Train {
namespace Express {
class MNN_PUBLIC Module {
public:
Module() = default;
@ -21,9 +26,6 @@ public:
bool loadParameters(const std::vector<Express::VARP>& parameters);
void setIsTraining(const bool isTraining);
bool getIsTraining();
static std::shared_ptr<Module> transform(const std::vector<Express::VARP>& inputs,
const std::vector<Express::VARP>& outputs);
void clearCache();
const std::string& name() const {
@ -38,12 +40,45 @@ public:
void setType(std::string type) {
mType = std::move(type);
}
// Return the parameter index
int addParameter(Express::VARP parameter);
void setParameter(Express::VARP parameter, int index);
static Module* createEmpty(const std::vector<Express::VARP>& parameters);
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const uint8_t* buffer, size_t length, bool dynamic = false);
static Module* load(const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const char* fileName, bool dynamic = false);
static Module* clone(const Module* module, const bool shareParams = false);
class CloneContext {
public:
CloneContext() = default;
explicit CloneContext(const bool shareParams)
: mShareParams(shareParams) {}
virtual ~CloneContext() = default;
const bool shareParams() const { return mShareParams; }
EXPRP getOrClone(const EXPRP expr);
VARP getOrClone(const VARP var);
private:
bool mShareParams = false;
std::unordered_map<const Expr*, EXPRP> mExprMap;
std::unordered_map<const Variable*, VARP> mVarMap;
};
virtual Module* clone(CloneContext* ctx) const {
return nullptr;
}
protected:
void registerModel(const std::vector<std::shared_ptr<Module>>& children);
void addParameter(Express::VARP parameter);
virtual void onClearCache() {
}
Module* cloneBaseTo(CloneContext* ctx, Module* module) const;
private:
void _collectParameters(std::vector<Express::VARP>& result) const;
std::vector<std::shared_ptr<Module>> mChildren;
@ -52,6 +87,13 @@ private:
std::string mName;
std::string mType;
};
struct SubGraph {
std::vector<std::string> inputs;
std::vector<std::string> outputs;
std::shared_ptr<Module> m;
};
} // namespace Express
} // namespace MNN
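
As a hedged sketch of the loading and cloning entry points added above; the tensor names and file path are placeholders.

// Sketch only: load a model file as a Module, then make a parameter-sharing clone.
#include <MNN/expr/Module.hpp>
#include <memory>

void loadAndClone() {
    std::shared_ptr<MNN::Express::Module> net(
        MNN::Express::Module::load({"input"}, {"prob"}, "model.mnn"));   // placeholder names/path
    std::shared_ptr<MNN::Express::Module> copy(
        MNN::Express::Module::clone(net.get(), /*shareParams=*/true));
    // copy->onForward(inputs) runs inference on VARPs built by the caller.
}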

View File

@ -9,11 +9,10 @@
#ifndef MNN_Train_NN_hpp
#define MNN_Train_NN_hpp
#include <MNN/expr/ExprCreator.hpp>
#include "Distributions.hpp"
#include "Module.hpp"
#include <MNN/expr/Module.hpp>
#include <vector>
namespace MNN {
namespace Train {
namespace Express {
class Initializer;
class MNN_PUBLIC NN {
@ -29,7 +28,7 @@ public:
};
enum FeatureScaleStatMethod {
PerTensor = 0,
PerChannel = 1
PerChannel = 1 // Deprecated
};
/* Unlike an enum inside a class, a class inside a class needs to be dllimport or dllexport explicitly.
Compilation on other systems is not affected.
@ -86,7 +85,7 @@ public:
static ConvParameters ExtractConvolution(Express::EXPRP expr);
// Extract BatchNormal and Dropout
static Module* ExtractNotRunableOp(Express::EXPRP expr);
static Module* ExtractNotRunableOp(Express::EXPRP expr, const std::map<std::string, SubGraph>& subgraphs);
};
};

View File

@ -31,25 +31,30 @@ MNN_PUBLIC VARP _Const(const void* ptr, INTS shape = {}, Dimensionformat format
MNN_PUBLIC VARP _TrainableParam(float value, INTS dims, Dimensionformat format);
MNN_PUBLIC VARP _TrainableParam(const void* ptr, INTS dims, Dimensionformat format,
halide_type_t type = halide_type_of<float>());
MNN_PUBLIC VARP _InnerProduct(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS outputShape);
MNN_PUBLIC VARP _Conv(VARP weight, VARP bias, VARP x, PaddingMode pad = VALID, INTS stride = {1, 1},
INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0});
MNN_PUBLIC VARP _Conv(float weight, float bias, VARP x, INTS channel, INTS kernelSize, PaddingMode pad = VALID,
INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1);
MNN_PUBLIC VARP _Conv(std::vector<int8_t>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
PaddingMode pad = VALID, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0}, bool relu = false, bool relu6 = false);
PaddingMode pad = VALID, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0}, bool relu = false, bool relu6 = false, int nbits = 8);
MNN_PUBLIC VARP _Conv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
PaddingMode pad = VALID, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0}, bool relu = false, bool relu6 = false);
MNN_PUBLIC VARP _Deconv(VARP weight, VARP bias, VARP x, PaddingMode pad = VALID, INTS stride = {1, 1},
INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0});
MNN_PUBLIC VARP _Deconv(std::vector<float>&& weight, std::vector<float>&& bias, VARP x, INTS channel, INTS kernelSize,
PaddingMode pad, INTS stride = {1, 1}, INTS dilate = {1, 1}, int group = 1, INTS pads = {0, 0}, bool relu = false, bool relu6 = false);
MNN_PUBLIC VARP _MaxPool(VARP x, INTS kernel, INTS stride = {1, 1}, PaddingMode pad = VALID, INTS pads= {0, 0});
MNN_PUBLIC VARP _AvePool(VARP x, INTS kernel, INTS stride = {1, 1}, PaddingMode pad = VALID, INTS pads= {0, 0});
MNN_PUBLIC VARP _Reshape(VARP x, INTS shape, Dimensionformat original_format = NHWC);
MNN_PUBLIC VARP _Reshape(VARP x, INTS shape, Dimensionformat original_format = NCHW);
MNN_PUBLIC VARP _Reshape(VARP x, VARP shape);
MNN_PUBLIC VARP _Scale(VARP x, int channels, std::vector<float>&& scales, std::vector<float>&& bias);
MNN_PUBLIC VARP _Relu(VARP x, float slope = 0.0f);
MNN_PUBLIC VARP _Relu6(VARP x);
MNN_PUBLIC VARP _Relu6(VARP x, float minValue = 0.0f, float maxValue = 6.0f);
MNN_PUBLIC VARP _PRelu(VARP x, std::vector<float> &&slopes);
MNN_PUBLIC VARP _Softmax(VARP logits, int axis = -1);
MNN_PUBLIC VARP _Softplus(VARP features);
@ -76,7 +81,7 @@ MNN_PUBLIC VARP _Pad(VARP x, VARP paddings, PadValueMode mode = CONSTANT);
MNN_PUBLIC VARP _ExpandDims(VARP input, int axis);
MNN_PUBLIC VARP _ExpandDims(VARP input, VARP axis);
MNN_PUBLIC VARP _Shape(VARP input);
MNN_PUBLIC VARP _Shape(VARP input, bool nchw = false);
MNN_PUBLIC VARP _Stack(VARPS values, int axis=0);
enum InterpolationMethod {BILINEAR, NEAREST};
MNN_PUBLIC VARP _CropAndResize(VARP image, VARP boxes, VARP box_ind, VARP crop_size,
@ -92,6 +97,7 @@ MNN_PUBLIC VARP _GatherND(VARP params, VARP indices);
MNN_PUBLIC VARP _Selu(VARP features, float scale, float alpha);
MNN_PUBLIC VARP _Size(VARP input);
MNN_PUBLIC VARP _Elu(VARP features, float alpha=1.0);
MNN_PUBLIC VARP _Threshold(VARP features, float alpha=1.0);
MNN_PUBLIC VARP _MatrixBandPart(VARP input, VARP num_lower, VARP num_upper);
MNN_PUBLIC std::vector<VARP> _Moments(VARP x, INTS axis, VARP shift, bool keepDims);
MNN_PUBLIC VARP _SetDiff1D(VARP x, VARP y);
@ -123,7 +129,8 @@ MNN_PUBLIC VARP _ZeroGrad(VARP x);
// Int8 Inference
MNN_PUBLIC VARP _Conv(std::vector<int8_t>&& weight, std::vector<int>&& bias, std::vector<float>&& scale, VARP x, INTS channel, INTS kernelSize,
PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu);
PaddingMode pad, INTS stride, INTS dilate, int group, INTS pads, bool relu, int nbits = 8);
MNN_PUBLIC VARP _CosineSimilarity(VARP input0, VARP input1, VARP inputDim);
MNN_PUBLIC VARP _FloatToInt8(VARP x, VARP scale, char minValue, char maxValue);
MNN_PUBLIC VARP _Int8ToFloat(VARP x, VARP scale);
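
Two of the signature changes above in a short sketch: _Reshape's original_format default moves from NHWC to NCHW, and _Shape gains an nchw flag; the input shape below is a placeholder.

// Sketch only: exercises the changed _Reshape default and the new _Shape(x, nchw) overload.
#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

void reshapeDemo() {
    VARP x     = _Input({1, 3, 4, 4}, NCHW);    // placeholder input
    VARP flat  = _Reshape(x, {1, 48});          // original_format now defaults to NCHW
    VARP shape = _Shape(flat, /*nchw=*/true);   // new flag added in this release
}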

102
include/MNN/expr/Scope.hpp Normal file
View File

@ -0,0 +1,102 @@
//
// Scope.hpp
// MNN
//
// Created by MNN on 2020/10/26.
// Copyright © 2018, Alibaba Group Holding Limited
//
#ifndef MNN_EXPR_SCOPE_HPP_
#define MNN_EXPR_SCOPE_HPP_
#include <cstdio>
#include <vector>
#include <string>
#include <mutex>
#include <MNN/Interpreter.hpp>
namespace MNN {
namespace Express {
template <typename T>
class Scope {
public:
Scope();
virtual ~Scope() = default;
struct ScopedContent {
std::string scope_name;
T content;
};
void EnterScope(const ScopedContent& current);
void EnterScope(const T& current);
void EnterScope(const std::string& scope_name, const T& current);
void ExitScope();
const ScopedContent& Current() const;
int ScopedLevel() const { return scoped_level_; }
private:
std::string MakeScopeName(const std::string& prefix, int level) const;
mutable std::mutex mutex_;
int scoped_level_ = 0;
std::vector<ScopedContent> scoped_contents_;
};
template <typename T>
Scope<T>::Scope() : scoped_level_(0) {
}
template <typename T>
void Scope<T>::EnterScope(const ScopedContent& current) {
std::lock_guard<std::mutex> lock(mutex_);
++scoped_level_;
scoped_contents_.push_back(current);
}
template <typename T>
void Scope<T>::EnterScope(const T& current) {
EnterScope("scope", current);
}
template <typename T>
void Scope<T>::EnterScope(const std::string& scope_name,
const T& current) {
std::lock_guard<std::mutex> lock(mutex_);
int scoped_level = ScopedLevel();
std::string name = MakeScopeName(scope_name, scoped_level++);
ScopedContent content{name, current};
++scoped_level_;
scoped_contents_.push_back(content);
}
template <typename T>
void Scope<T>::ExitScope() {
std::lock_guard<std::mutex> lock(mutex_);
--scoped_level_;
scoped_contents_.resize(scoped_level_);
}
template <typename T>
const typename Scope<T>::ScopedContent& Scope<T>::Current() const {
std::lock_guard<std::mutex> lock(mutex_);
MNN_CHECK(scoped_contents_.size() > 0, "Scope level should not be 0.");
return scoped_contents_.back();
}
template <typename T>
std::string Scope<T>::MakeScopeName(const std::string& prefix,
int level) const {
char s[16];
snprintf(s, 16, "%d", level);
return prefix + "/" + std::string(s);
}
} // namespace Express
} // namespace MNN
#endif // MNN_EXPR_SCOPE_HPP_
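
A small sketch of the Scope<T> helper defined above; the scope name and integer payload are arbitrary.

// Sketch only: push a value, read the current scoped content, then pop it.
#include <MNN/expr/Scope.hpp>
#include <cstdio>

void scopeDemo() {
    MNN::Express::Scope<int> scope;
    scope.EnterScope("outer", 42);              // stored under the generated name "outer/0"
    const auto& current = scope.Current();
    std::printf("%s = %d\n", current.scope_name.c_str(), current.content);
    scope.ExitScope();
}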

View File

@ -1,12 +1,14 @@
# MNN_Windows
# |------- MNN_Windows_lib
# |---------- Dynamic_Library
# |---------- Static_Library
# |------- MNN_Windows_tools
# MNN
# |-- Debug
# | |--- MD
# | |--- MT
# |-- Release
# |--- MD
# |--- MT
$erroractionpreference = "stop"
Set-Variable -Name WINDOWS_PACKAGE_NAME -Value "MNN_Windows"
Set-Variable -Name WINDOWS_PACKAGE_NAME -Value "MNN"
#clear and create package directory
powershell ./schema/generate.ps1
@ -14,32 +16,50 @@ Set-Variable -Name WINDOWS_PACKAGE_PATH -Value "$(pwd)\$WINDOWS_PACKAGE_NAME"
Remove-Item $WINDOWS_PACKAGE_PATH -Recurse -ErrorAction Ignore
mkdir $WINDOWS_PACKAGE_PATH\
cd $WINDOWS_PACKAGE_PATH
mkdir -p MNN_Windows_lib\Dynamic_Library
mkdir -p MNN_Windows_lib\Static_Library
mkdir MNN_Windows_tools
mkdir -p Debug\MD
mkdir -p Debug\MT
mkdir -p Release\MD
mkdir -p Release\MT
cd ..
Remove-Item build -Recurse -ErrorAction Ignore
mkdir build
cd build
pushd build
# tools without dependency, static library without sep_build
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_BUILD_CONVERTER=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_DEMO=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON ..
#cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_BUILD_CONVERTER=ON -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_DEMO=ON -DMNN_BUILD_QUANTOOLS=ON -DMNN_EVALUATION=ON ..
#ninja
#pushd $WINDOWS_PACKAGE_PATH
#cp ..\build\*.exe MNN_Windows_tools
#cp ..\build\*.pdb MNN_Windows_tools
#cp ..\build\MNN.lib MNN_Windows_lib\Static_Library
#popd
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=ON -DMNN_OPENCL=ON ..
ninja
pushd $WINDOWS_PACKAGE_PATH
cp ..\build\*.exe MNN_Windows_tools
cp ..\build\*.pdb MNN_Windows_tools
cp ..\build\MNN.lib MNN_Windows_lib\Static_Library
cp MNN.lib $WINDOWS_PACKAGE_PATH\Debug\MT
cp MNN.dll $WINDOWS_PACKAGE_PATH\Debug\MT
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Debug\MT
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Debug -DMNN_WIN_RUNTIME_MT=OFF -DMNN_OPENCL=ON ..
ninja
cp MNN.lib $WINDOWS_PACKAGE_PATH\Debug\MD
cp MNN.dll $WINDOWS_PACKAGE_PATH\Debug\MD
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Debug\MD
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=ON -DMNN_OPENCL=ON ..
ninja
cp MNN.lib $WINDOWS_PACKAGE_PATH\Release\MT
cp MNN.dll $WINDOWS_PACKAGE_PATH\Release\MT
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Release\MT
Remove-Item CMakeCache.txt -ErrorAction Ignore
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF -DCMAKE_BUILD_TYPE=Release -DMNN_WIN_RUNTIME_MT=OFF -DMNN_OPENCL=ON ..
ninja
cp MNN.lib $WINDOWS_PACKAGE_PATH\Release\MD
cp MNN.dll $WINDOWS_PACKAGE_PATH\Release\MD
cp MNN.pdb $WINDOWS_PACKAGE_PATH\Release\MD
popd
#dynamic library without sep_build
rm .\CMakeCache.txt
cmake -G "Ninja" -DMNN_SEP_BUILD=OFF ..
ninja
cd $WINDOWS_PACKAGE_PATH
cp ..\build\MNN.lib MNN_Windows_lib\Dynamic_Library
cp ..\build\MNN.dll MNN_Windows_lib\Dynamic_Library
cp ..\build\MNN.pdb MNN_Windows_lib\Dynamic_Library
# Compress MNN_Windows_lib and MNN_Windows_tools
Compress-Archive -Path MNN_Windows_lib -DestinationPath MNN_Windows_lib.zip -Update -CompressionLevel Optimal
Compress-Archive -Path MNN_Windows_tools -DestinationPath MNN_Windows_tools.zip -Update -CompressionLevel Optimal

View File

@ -8,15 +8,14 @@ set_target_properties(
${CMAKE_CURRENT_LIST_DIR}/libs/${ANDROID_ABI}/libMNN.so
)
add_library( MNN_Arm82 SHARED IMPORTED GLOBAL)
set_target_properties(
MNN_Arm82
PROPERTIES IMPORTED_LOCATION
${CMAKE_CURRENT_LIST_DIR}/libs/${ANDROID_ABI}/libMNN_Arm82.so
)
add_library( MNN_CL SHARED IMPORTED GLOBAL )
set_target_properties( MNN_CL
PROPERTIES IMPORTED_LOCATION
${CMAKE_CURRENT_LIST_DIR}/libs/${ANDROID_ABI}/libMNN_CL.so
)
add_library( MNN_Express SHARED IMPORTED GLOBAL )
set_target_properties( MNN_Express
PROPERTIES IMPORTED_LOCATION
${CMAKE_CURRENT_LIST_DIR}/libs/${ANDROID_ABI}/libMNN_Express.so
)

View File

@ -5,7 +5,6 @@ adb push ./libMNN_CL.so /data/local/tmp/MNN/libMNN_CL.so
adb push ./libMNN_Vulkan.so /data/local/tmp/MNN/libMNN_Vulkan.so
adb push ./libMNN_GL.so /data/local/tmp/MNN/libMNN_GL.so
adb push ./libMNN_Express.so /data/local/tmp/MNN/libMNN_Express.so
adb push ./libMNN_Arm82.so /data/local/tmp/MNN/libMNN_Arm82.so
adb push ./MNNV2Basic.out /data/local/tmp/MNN/MNNV2Basic.out
adb shell "cd /data/local/tmp/MNN && rm -r output"
adb shell "cd /data/local/tmp/MNN && mkdir output"
@ -18,3 +17,4 @@ adb push ./timeProfile.out /data/local/tmp/MNN/timeProfile.out
adb push ./train.out /data/local/tmp/MNN/train.out
adb push ./benchmark.out /data/local/tmp/MNN/benchmark.out
adb push ./benchmarkExprModels.out /data/local/tmp/MNN/benchmarkExprModels.out
adb push ./run_test.out /data/local/tmp/MNN/run_test.out

File diff suppressed because it is too large

View File

@ -4,6 +4,8 @@
<dict>
<key>CFBundleDevelopmentRegion</key>
<string>$(DEVELOPMENT_LANGUAGE)</string>
<key>CFBundleIdentifier</key>
<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
<key>CFBundleInfoDictionaryVersion</key>
<string>6.0</string>
<key>CFBundleName</key>

View File

@ -1,57 +0,0 @@
#!bin/sh
echo "Register Op Begin"
function read_dir(){
str1=`grep -e $2 $1/*.$4|sed s/[[:space:]]//g`
array=(${str1//\;/ })
for var in ${array[@]}; do
`echo $var|awk -F $3 '{
a="___";
b="__();";
c="extern void ";
print(c""a""$3"__"$4""b) >> "extern";
print (a""$3"__"$4""b) >> "call"
}'`
done
}
start=$(date +%s)
SEP='[:(,)]'
FILE_EXTERN_CPP='cpp'
FILE_EXTERN_MM='mm'
SHELL_FOLDER=$(dirname $0)'/../../..'
# handle CPU
CPUFILE=$SHELL_FOLDER/source/backend/cpu/CPUOPRegister.cpp
echo "// This file is generated by Shell for ops register\nnamespace MNN {\n#ifdef MNN_CODEGEN_REGISTER" > $CPUFILE
echo "Start Register CPU"
CPU=$SHELL_FOLDER/source/backend/cpu
CPU_KEY='REGISTER_CPU_OP_CREATOR'
read_dir $CPU $CPU_KEY $SEP $FILE_EXTERN_CPP
cat extern >> $CPUFILE
rm extern
echo '\nvoid registerCPUOps() {' >> $CPUFILE
cat call >> $CPUFILE
echo '}\n#endif\n}' >> $CPUFILE
rm call
# handle Shape
echo "Start Register Shape"
SHAPEFILE=$SHELL_FOLDER/source/shape/ShapeRegister.cpp
SHAPE=$SHELL_FOLDER/source/shape
SHAPE_KEY="REGISTER_SHAPE"
echo "// This file is generated by Shell for ops register\nnamespace MNN {\n#ifdef MNN_CODEGEN_REGISTER" > $SHAPEFILE
read_dir $SHAPE $SHAPE_KEY $SEP $FILE_EXTERN_CPP
cat extern >> $SHAPEFILE
rm extern
echo '\nvoid registerShapeOps() {' >> $SHAPEFILE
cat call >> $SHAPEFILE
echo '}\n#endif\n}' >> $SHAPEFILE
rm call
echo "Register Op End"
dur=$(echo "$(date +%s) - $start" | bc)
printf "Execution time: %.6f seconds" $dur

View File

@ -8,10 +8,14 @@
#import "AppDelegate.h"
#import "MNNTestSuite.h"
#import <MNN/expr/Executor.hpp>
@implementation AppDelegate
- (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
MNN::BackendConfig config;
// To test Metal, change MNN_FORWARD_CPU to MNN_FORWARD_METAL
MNN::Express::Executor::getGlobalExecutor()->setGlobalExecutorConfig(MNN_FORWARD_CPU, config, 1);
MNNTestSuite::runAll();
return YES;
}

View File

@ -8,6 +8,9 @@ import cv2
def inference():
""" inference mobilenet_v1 using a specific picture """
interpreter = MNN.Interpreter("mobilenet_v1.mnn")
interpreter.setCacheFile('.tempcache')
config = {}
config['precision'] = 'low'
session = interpreter.createSession()
input_tensor = interpreter.getSessionInput(session)
image = cv2.imread('ILSVRC2012_val_00049999.JPEG')

View File

@ -96,8 +96,7 @@ def demo():
train_dataloader = MNN.data.DataLoader(train_dataset, batch_size = 64, shuffle = True)
test_dataloader = MNN.data.DataLoader(test_dataset, batch_size = 100, shuffle = False)
opt = MNN.optim.SGD(0.01, 0.9, 0.0005)
opt.append(model.parameters)
opt = MNN.optim.SGD(model, 0.01, 0.9, 0.0005)
F.set_thread_number(4)

View File

@ -125,8 +125,7 @@ def demo():
net = Net(feature_extractor, num_classes)
opt = MNN.optim.SGD(1e-3, 0.9, 0.00004)
opt.append(net.parameters)
opt = MNN.optim.SGD(net, 1e-3, 0.9, 0.00004)
for epoch in range(10):
train_func(net, train_dataloader, opt, num_classes)

View File

@ -0,0 +1,15 @@
import numpy as np
import MNN
nn = MNN.nn
F = MNN.expr
v0 = F.const([0.3,0.1, -0.3,0.4], [4])
v2 = F.const([0.3,0.1, -0.3,0.4], [4])
v1 = v0 * v0
outputDiff = F.const([0.05, 0.03, 0.02, 0.01], [4])
v0Grad = nn.grad(v1, [v0, v2], [outputDiff], "")
print(v0Grad)
print(v0Grad[0].read())
F.save(v0Grad, "temp.grad")

View File

@ -0,0 +1,36 @@
import numpy as np
import MNN
nn = MNN.nn
F = MNN.expr
class Net(nn.Module):
"""construct a lenet 5 model"""
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.conv(1, 20, [5, 5])
self.conv2 = nn.conv(20, 50, [5, 5])
self.fc1 = nn.linear(800, 500)
self.fc2 = nn.linear(500, 10)
self.step = F.const([10], [], F.NCHW, F.int)
self.lr = F.const([0.0004],[], F.NCHW, F.float)
def forward(self, x):
x = F.relu(self.conv1(x))
x = F.max_pool(x, [2, 2], [2, 2])
x = F.relu(self.conv2(x))
x = F.max_pool(x, [2, 2], [2, 2])
x = F.reshape(x, [0, -1])
x = F.relu(self.fc1(x))
x = self.fc2(x)
x = F.softmax(x, 1)
return x
model = Net()
F.save(model.parameters, 'mnist.snapshot')
model2 = Net()
model2.load_parameters(F.load_as_list('mnist.snapshot'))
print(model2.lr.read())
print(model2.step.read())

Some files were not shown because too many files have changed in this diff